#### **Import libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
cmap = cm.get_cmap
import seaborn as sns

%matplotlib inline
import io
pd.set_option('display.max_columns', None)

#### **Check encoding file**

In [None]:
import chardet
with open("D:\Descargas\Stats19-Data1979-2004\Accidents7904.csv", 'rb') as rawdata:
    result = chardet.detect(rawdata.read(10000))

print(result)

#### **Import Data**

In [None]:
accidents=pd.read_csv('D:\Descargas\Stats19-Data1979-2004\Accidents7904.csv',delimiter=',',encoding='UTF-8-SIG')

In [None]:
accidents.columns

In [None]:
accidents.sample(5)

In [None]:
accidents.info()

#### **Cleaning and pre process the data**

##### **Checking percentage of missing or NaN values**

In [None]:
print("NaN data: \n \n"," \n ",accidents.isna().sum()/len(accidents),"%")
print("\n Missing or out of range data:\n",np.abs(accidents[accidents==-1].sum())/len(accidents),"%")
#accidents.isna().sum().sum()/len(accidents)

##### **Drop the useless columns**

In [None]:
columns_to_drop = ['Police_Force', 'Local_Authority_(District)', 'Local_Authority_(Highway)', 
             '1st_Road_Number', '2nd_Road_Number', 'Pedestrian_Crossing-Human_Control', 
             'Pedestrian_Crossing-Physical_Facilities','Did_Police_Officer_Attend_Scene_of_Accident',
            'LSOA_of_Accident_Location','Longitude','Latitude','Urban_or_Rural_Area','Junction_Control','2nd_Road_Class','Special_Conditions_at_Site','Carriageway_Hazards','Junction_Detail']

In [None]:
accidents.drop(labels=columns_to_drop,inplace=True,axis=1)

In [None]:
accidents.isna().sum()/len(accidents)

In [None]:
#Values = -1 are missing data or values out of range
# el resto el haremos drop ya que tiene unos porcentajes muy bajos y no nos afectarán al análisis posterior.
print("NaN data: \n \n"," \n ",accidents.isna().sum()/len(accidents),"%")
print("\n Missing or out of range data:\n",np.abs(accidents[accidents==-1].sum())/len(accidents),"%")
#accidents.isna().sum().sum()/len(accidents)

##### **Clean missing values =-1**

In [None]:
#cleaning the missing values = -1
for i,k in enumerate(accidents):
    accidents.drop(index=accidents[accidents[k] == -1].index, inplace=True)

In [None]:
#accidents = accidents.to_csv('Descargas/Stats19-Data1979-2004/accidents_clean.csv',sep=',')

In [None]:
accidents['Date']=pd.to_datetime(accidents['Date'])
accidents['Month']=accidents['Date'].dt.month
accidents['Year']=accidents['Date'].dt.year

In [None]:
accidents['Accident_Index'].count()

In [None]:
accidents['Hour']=  accidents['Time'].str[0:2]
accidents['Hour'] = pd.to_numeric(accidents['Hour'])
accidents = accidents.dropna(subset=['Hour'])
accidents['Hour'] = accidents['Hour'].astype('int')

In [None]:
def daytime(hour):
    if hour >= 5 and hour < 10:
        return "Commuting to work"
    elif hour >= 10 and hour < 15:
        return "Office hours"
    elif hour >= 15 and hour < 19:
        return "Commuting to home"
    elif hour >= 19 and hour < 23:
        return "Evening"
    else:
        return "Night"
accidents['Daytime'] = accidents['Hour'].apply(daytime)
accidents[['Time', 'Hour', 'Daytime']].head(8)


In [None]:
accidents['Daytime'].value_counts()

In [None]:
accidents.columns

In [None]:
accidents.head()
print(accidents.shape)
accidents.columns

In [None]:
accidents.head()

In [None]:
accidents['Accident_Severity'].replace([1,2,3],['Fatal','Serious','Slight'],inplace=True)
accidents.Accident_Severity.value_counts(normalize=True).plot(kind='pie')


In [None]:
days = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
accidents['Day_of_Week'].replace([1,2,3,4,5,6,7],days, inplace=True)
accidents.Day_of_Week.value_counts(normalize=True).sort_values(ascending=True).plot(kind='bar',color='grey');

In [None]:
#Put Motorway and A(M) in the same category
accidents['1st_Road_Class'].replace([1,2,3,4,5,6],['Motorway','A(M)','A', 'B', 'C', 'Unclassified'],inplace=True)

accidents['1st_Road_Class'] = accidents['1st_Road_Class'].replace('A(M)', 'Motorway')
accidents['1st_Road_Class'].value_counts(normalize=True).plot(kind='bar',color='gold');

In [None]:
accidents['Road_Type'].replace([1,2,3,6,7,9,12],['Roundabout','One way street','Dual carriageway','Single carriageway','Slip road','Unknown','One way street/Slip road'],inplace=True)
accidents.Road_Type.value_counts(normalize=True).plot(kind='bar',color='gold');

In [None]:
#accidents['Junction_Detail'].replace([0,1,2,3,5,6,7,8,9],['Not at junction or within 20 metres',
#                                                         'Roundabout', 'Mini-roundabout', 'T or staggered junction', 'Slip road','Crossroads',
#                                                          'More than 4 arms (not roundabout)','Private drive or entrance','Other junction'],inplace=True)
#accidents.Junction_Detail.value_counts(normalize=True).plot(kind='bar',color='gold');

In [None]:
accidents.Light_Conditions = accidents.Light_Conditions.replace([1,4,5,6,7], 
                                                      ['Daylight', 
                                                       'Darkness - lights lit', 
                                                       'Darkness - lights unlit', 
                                                       'Darkness - no lighting', 
                                                       'Darkness - lighting unknown'])


accidents.Light_Conditions.value_counts(normalize=True).plot(kind='bar',color='gold');

In [None]:
accidents.Weather_Conditions = accidents.Weather_Conditions.replace([1,2,3,4,5,6,7,8,9], 
                                                                ['Fine no high winds', 
                                                                 'Raining no high winds', 
                                                                 'Snowing no high winds', 
                                                                 'Fine + high winds', 
                                                                 'Raining + high winds', 
                                                                 'Snowing + high winds', 
                                                                 'Fog or mist', 'Other', 'Unknown', 
                                                                 ])
accidents.Weather_Conditions.value_counts(normalize=True).plot(kind='bar',color='gold');

In [None]:
accidents.Road_Surface_Conditions = accidents.Road_Surface_Conditions.replace([1,2,3,4,5,6,7], 
                                                                                    ['Dry', 
                                                                                     'Wet or damp', 
                                                                                     'Snow', 
                                                                                     'Frost or ice', 
                                                                                     'Flood over 3cm. deep',
                                                                                     'Oil or diesel',
                                                                                     'Mud'])
accidents.Road_Surface_Conditions.value_counts(normalize=True).plot(kind='bar',color='gold');

In [None]:
'''
accidents.Special_Conditions_at_Site = \
accidents.Special_Conditions_at_Site.replace([0,1,2,3,4,5,6,7,-1],  
                                                ['None', 'Auto traffic singal - out', 
                                                 'Auto signal part defective', 
                                                 'Road sign or marking defective or obscured', 
                                                 'Roadworks', 'Road surface defective', 
                                                 'Oil or diesel', 'Mud','None'])
accidents.Special_Conditions_at_Site.value_counts(normalize=True).plot(kind='barh',color='gold');
'''


In [None]:


'''
accidents.Carriageway_Hazards.replace([0,1,2,3],  
                                                ['None', 'Vehicle load on road', 
                                                 'Other object on road', 
                                                 'Previous accident'
])

accidents.Carriageway_Hazards.value_counts(normalize=True).plot(kind='barh',color='gold');

'''

In [None]:
accidents['Month'] = accidents['Month'].astype(int)

accidents['Month'] = accidents['Month'].replace([1,2,3,4,5,6,7,8,9,10,11,12],['January', 'February', 
                                                 'March','April', 'May','June',
                                                 'July', 'August', 'September',
                                                'October','November','December'
])

accidents['Year'] = accidents['Year'].astype(int)

In [None]:
def dark_or_not(value):
    if value == 'Daylight':
        return 'Yes'
    else:
        return 'No'
accidents['Daylight?'] = accidents['Light_Conditions'].apply(dark_or_not)
accidents['Daylight?'].value_counts().plot(kind='bar',color='coral');

In [None]:
accidents.Weather_Conditions.value_counts()

def good_weather(value):
    if value == 'Fine no high winds':
        return 'Yes'
    else:
        return 'No'
accidents['Good_weather_conditions'] =  accidents.Weather_Conditions.apply(good_weather)
accidents['Good_weather_conditions'].value_counts().plot(kind='barh')

In [None]:
accidents.columns

In [None]:
accidents.describe().T

In [None]:
df1 = accidents.groupby('Year')\
.agg({'Accident_Index':'count', 'Number_of_Vehicles': 'sum','Number_of_Casualties': 'sum','Accident_Severity':'count'})\
.reset_index()

In [None]:
'''

mport altair as alt
from vega_datasets import data

source = df1

alt.Chart(source).mark_bar().encode(
    x='Year:O',
    y="Accident_Index:Q",    # The highlight will be set on the result of a conditional statement
    color=alt.condition(
        alt.datum.Year == 1989,  # If the year is 1810 this test returns True,
        alt.value('blue'),     # which sets the bar orange.
        alt.value('steelblue')   # And if it's not true it sets the bar steelblue.
        
    )
).properties(
    width=600,title="Relación peso del cerebro y del cuerpo"
).interactive()
    

'''


# Analysis and Visualization

In [None]:
# Utilizarmos resample de la fecha para ponerla como índice y agruparla por mes, de esta manera
# podemos generar el gráfico con el total mensual y luego poder calcular una media con una ventana=12
# que son lso meses del año.

sns.set_style('white')
fig, ax = plt.subplots(figsize=(28,10))

accidents.set_index('Date').resample('M').size().plot(label='Total por Mes', color='grey', ax=ax)
accidents.set_index('Date').resample('M').size().rolling(window=12).mean()\
                           .plot(color='lightgreen', linewidth=5, label='Media mensual 12 meses', ax=ax)

ax.set_title('Accidents per Month', fontsize=25, fontweight='bold')
ax.set(ylabel='Total Count\n', xlabel='')
ax.legend()

sns.despine(ax=ax, top=True, right=True, left=True, bottom=False);

In [None]:
df1 = accidents.groupby(['Year'])\
.agg({'Accident_Index':'count', 'Number_of_Vehicles': 'sum','Number_of_Casualties': 'sum',})\
.reset_index()
#-------------------------
sns.set_style("white")

x = df1.Year
labels = df1.Year
width = 0.5
Accidentcounts = df1['Accident_Index']
Casualtycounts =  df1['Number_of_Casualties']
fig,ax =  plt.subplots(figsize=(18,7))

bar1 = ax.bar(x - width/2, Accidentcounts, width, label='Accident counts', color = 'lightgrey');
bar2 = ax.bar(x + width/2, Casualtycounts, width, label='Casualty counts', color = 'lightskyblue');
bar1[10].set_color('orange')
bar2[10].set_color('purple')
ax.set_title('\nAccidents / Casualties \n per Year\n', fontsize=25, fontweight='bold')
ax.set_xlabel('Year',fontsize=15)
ax.set_ylabel('Total counts\n',fontsize=15)
ax.legend()
ax.set_xticks(x)
sns.despine(ax=ax, top=True, right=True, left=True, bottom=False);
plt.show();

In [None]:
accidents.head()

# En qué meses hay más accidentes

In [None]:
df2 = accidents.groupby(['Month'])['Accident_Index'].count().reset_index()

months = ['January', 'February','March','April', 'May','June','July','August', 'September','October','November','December']
df2['Month'] = pd.Categorical(df2['Month'], categories=months, ordered=True)
#df2.sort_values(...)  # same as you have now; can use inplace=True
df2 = df2.sort_values(by='Month')


sns.set_style("white")

x = df2['Month']
y = df2['Accident_Index']
fig, ax =  plt.subplots(figsize=(15,8))

ax.plot(x,y,color='lightskyblue',linewidth=4)
ax.set_title('Accidents per Month', fontsize=25, fontweight='bold')
ax.set_xlabel('\n Month',fontsize=15)
ax.set_ylabel('Total Count\n',fontsize=15)
sns.despine(ax=ax, top=True, right=True, left=True, bottom=False);
plt.show();



In [None]:
cas_ratio= accidents.groupby(['Year'])['Number_of_Casualties'].sum().reset_index()

cas_ratio['Casualty_Ratio'] = cas_ratio['Number_of_Casualties'].div(cas_ratio['Number_of_Casualties'].sum())


veh_ratio = accidents.groupby(['Year'])['Number_of_Vehicles'].sum().reset_index()

veh_ratio['Vehicle_Ratio'] = veh_ratio['Number_of_Vehicles'].div(veh_ratio['Number_of_Vehicles'].sum())

#df_census = df_census.groupby('NAME')[['NAME', 'TOTAL_POPULATION']].sum().reset_index()

# Standardizing the values so as to conform the population ratio
#df_census['Population_Ratio'] = df_census['TOTAL_POPULATION'].div(df_census['TOTAL_POPULATION'].sum())
df2
merged = pd.merge(veh_ratio,cas_ratio,on='Year',how='inner')


In [None]:
merged

In [None]:
accidents.head()

In [None]:
df3 = accidents.groupby(['Day_of_Week'])['Accident_Index'].count().sort_values(ascending=False).reset_index()
df3
days = ['Sunday', 'Monday', 'Tuesday','Wednesday', 'Thursday', 'Friday', 'Saturday']
df3['Day_of_Week'] = pd.Categorical(df3['Day_of_Week'], categories=days, ordered=True)

df3 = df3.sort_values(by='Day_of_Week',ascending=True)
df3

## Qué dia de la semana hay más accidentes?

In [None]:
# sns.set_style('white')
fig, ax = plt.subplots(figsize=(10,5))

barlist = plt.bar(df3['Day_of_Week'],df3['Accident_Index'],color='lightgreen')
barlist[5].set_color('r')

ax.set_title('\nAccidents per Weekday\n', fontsize=14, fontweight='bold')
ax.set(ylabel='\nTotal Counts',xlabel='\nWeekDay')


# remove all spines
sns.despine(ax=ax, top=True, right=True, left=True, bottom=True);

## Mapa de calor Weekday vs Daytime

In [None]:
df4 = accidents.groupby(['Day_of_Week','Daytime'])['Accident_Index'].count().reset_index()


days = ['Sunday', 'Monday', 'Tuesday','Wednesday', 'Thursday', 'Friday', 'Saturday']
df4['Day_of_Week'] = pd.Categorical(df4['Day_of_Week'], categories=days, ordered=True)
df4 = df4.pivot(index='Day_of_Week', columns='Daytime', values='Accident_Index')
df4.sort_values(by='Day_of_Week',ascending=True)

In [None]:
cmap = cm.get_cmap('PuBu')
plt.figure(figsize=(15,10))
weekday = ['Sunday', 'Monday', 'Tuesday','Wednesday', 'Thursday', 'Friday', 'Saturday']
number=[1,2,3,4,5,6,7]
sns.heatmap(df4, cmap=cmap)
plt.title('\nAccidents by Weekday and Daytime\n', fontsize=14, fontweight='bold');
plt.yticks(rotation=0)

In [None]:
accidents.columns

In [None]:
df5 = accidents.groupby(['Day_of_Week','Accident_Severity','Daylight?'])['Number_of_Casualties'].sum().reset_index()
dayweek = ['Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday']
df5['Day_of_Week'] = pd.Categorical(df5['Day_of_Week'], categories=dayweek, ordered=True)
#df2.sort_values(...)  # same as you have now; can use inplace=True
df5 = df5.sort_values(by='Day_of_Week',ascending=True)
df5.head()

In [None]:
accidents.head()

import altair as alt
from vega_datasets import data

source = df5

alt.Chart(source).mark_circle().encode(
    alt.X('Number_of_Casualties',scale=alt.Scale(zero=False)),
    alt.Y('Day_of_Week',scale=alt.Scale(zero=False, padding=1)),
    color='Accident_Severity',
    size='Daylight?'
).properties(
    width=1000,
    height=500).interactive()
#por mas que lo intento no me ordena por weekday

In [None]:
sns.set_style('white')
fig, ax = plt.subplots(figsize=(15,8))


accidents.Hour.hist(bins=24, ax=ax, color='lightpink')
ax.set_title('\nAccidents by Time\n', fontsize=14, fontweight='bold')
ax.set(xlabel='\nHour of the Day', ylabel='\n Total Count of Accidents')

# remove all spines
sns.despine(top=True, right=True, left=True, bottom=False);

## Accident Severity vs Daytime

In [None]:
#df6 = accidents.groupby(['Daytime','Accident_Severity'])['Accident_Index'].count().reset_index()
fig, ax = plt.subplots(1, figsize=(30,10))
accidents.groupby('Daytime')['Accident_Severity'].value_counts(normalize=True).unstack().plot(kind='barh', stacked=True, color=['red', 'orange', 'lightgrey'], ax=ax)
ax.legend(loc='best', bbox_to_anchor=(1,1))

In [None]:
fig, ax = plt.subplots(1, figsize=(30,10))

accidents.groupby('Speed_limit')['Accident_Severity'].value_counts(normalize=True).unstack().plot(kind='bar', stacked=True, color=['black', 'orange', 'lightgrey'], ax=ax)

ax.legend(loc='best', bbox_to_anchor=(1,1))

ax.set_title('Accident severity proportions at different speed limits')
ax.set_xlabel('Road speed limit (mph)',rotation=0)
ax.set_ylabel('Proportion of accidents')
plt.xticks(rotation=0)
plt.show()

In [None]:
#HIST DE LAS VARIABLES NUMERICAS


accidents.hist(figsize=(15,12));
plt.tight_layout()
#PARTE DE CONCATENAR LOS DATAFRAMES

In [None]:
accidents.head()

## VEHICLES

In [None]:
%reset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
cmap = cm.get_cmap
import seaborn as sns
pd.set_option('display.max_columns', None)
%matplotlib inline
import io

In [None]:
veh=pd.read_csv('D:/Descargas/Stats19-Data1979-2004/Vehicles7904.csv',delimiter=',',encoding='UTF-8-SIG',usecols=[
    'Acc_Index', 'Vehicle_Reference', 'Vehicle_Type',
    'Was_Vehicle_Left_Hand_Drive?','Sex_of_Driver',
    'Age_Band_of_Driver','Engine_Capacity_(CC)', 
    'Propulsion_Code', 'Age_of_Vehicle'
])

In [None]:
print("NaN data: \n \n"," \n ",veh.isna().sum()/len(veh),"%")
print("\n Missing or out of range data:\n",np.abs(veh[veh==-1].sum())/len(veh),"%")

In [None]:
# drop missing or out of range values
veh.drop(labels='Was_Vehicle_Left_Hand_Drive?',inplace=True,axis=1)
veh.shape

In [None]:
# drop missing or out of range values
for i,k in enumerate(veh):
    veh.drop(index=veh[veh[k] == -1].index, inplace=True,axis=1)

#veh.drop(index=veh[veh['Sex_of_Driver'] == -1].index, inplace=True)

In [None]:
veh.describe().T

In [None]:
veh.shape

In [None]:
veh.head()

## CASUALTIES

In [None]:
cas=pd.read_csv('D:/Descargas/Stats19-Data1979-2004/Casualty7904.csv',delimiter=',',encoding='UTF-8-SIG')

In [None]:
print("NaN data: \n \n"," \n ",veh.isna().sum()/len(veh),"%")
print("\n Missing or out of range data:\n",np.abs(veh[veh==-1].sum())/len(veh),"%")

In [None]:
print(cas.describe().T)
cas.shape
cas.columns

In [None]:
%who
df_merge = pd.merge(cas,veh,how='inner',on='Acc_Index')

In [None]:
del veh,cas
df_merge.columns
df_merge.shape

In [None]:
print(df_merge.shape)
df_merge.head()

In [None]:
print("NaN data: \n \n"," \n ",df_merge.isna().sum()/len(df_merge),"%")
print("\n Missing or out of range data:\n",np.abs(df_merge[df_merge==-1].sum())/len(df_merge),"%")

In [None]:
df_merge.drop(labels=['Casualty_Home_Area_Type','Pedestrian_Road_Maintenance_Worker','Pedestrian_Location',
              'Casualty_Class','Vehicle_Reference_y','Vehicle_Reference_x','Casualty_Home_Area_Type',
              'Pedestrian_Road_Maintenance_Worker','Car_Passenger','Pedestrian_Movement','Pedestrian_Location',
                'Casualty_Severity','Casualty_Reference','Bus_or_Coach_Passenger'],inplace=True,axis=1)


In [None]:
df_merge.head()

In [None]:
for i,k in enumerate(df_merge):
    df_merge.drop(index=df_merge[df_merge[k] == -1].index, inplace=True,axis=1)

In [None]:
#Drop unknown values of Sex_of_Driver
df_merge.drop(index=df_merge[df_merge['Sex_of_Driver'] == 3].index, inplace=True,axis=1)

In [None]:
df_merge.Sex_of_Casualty = df_merge.Sex_of_Casualty.replace([1,2],['Male','Female'])

df_merge.Age_Band_of_Casualty = df_merge.Age_Band_of_Casualty.replace([1,2,3,4,5,6,7,8,9,10,11],['0 - 5','6 - 10','11 - 15',
                                                                                                 '16 - 20','21 - 25','26 - 35',
                                                                                              '36 - 45','46 - 55','56 - 65',
                                                                                              '66 - 75','Over 75'
                                                                                                ])

values1=['Pedestrian',
'Cyclist',
'Motorcycle 50cc and under rider or passenger',
'Motorcycle 125cc and under rider or passenger',
'Motorcycle over 125cc and up to 500cc rider or  passenger',
'Motorcycle over 500cc rider or passenger',
'Taxi/Private hire car occupant',
'Car occupant',
'Minibus (8 - 16 passenger seats) occupant',
'Bus or coach occupant (17 or more pass seats)',
'Horse rider',
'Agricultural vehicle occupant',
'Tram occupant',
'Van / Goods vehicle (3.5 tonnes mgw or under) occupant',
'Goods vehicle (over 3.5t. and under 7.5t.) occupant',
'Goods vehicle (7.5 tonnes mgw and over) occupant',
'Mobility scooter rider',
'Electric motorcycle rider or passenger',
'Other vehicle occupant',
'Motorcycle - unknown cc rider or passenger',
'Goods vehicle (unknown weight) occupant',
'Motorcycle - Scooter rider or passenger',
'Motorcycle rider or passenger',
'Motorcycle - Combination rider or passenger',
'Motorcycle over 125cc rider or passenger',
'Taxi (excluding private hire cars) occupant',
'Car occupant (including private hire cars)',
'Minibus/Motor caravan occupant',
'Goods vehicle (over 3.5 tonnes) occupant']
values2 = [0,1,2,3,4,5,8,9,10,11,16,17,18,19,
20,21,22,23,90,97,98,103,104,105,
106,108,109,110,113]

values3 = [1,2,3,10,11,17,19,20,
21,90,103,104,105,
106,108,109,110,113]

values4 = ['Pedal cycle',
'Motorcycle 50cc and under',
'Motorcycle 125cc and under',
'Minibus (8 - 16 passenger seats)',
'Bus or coach (17 or more pass seats)',
'Agricultural vehicle',
'Van / Goods 3.5 tonnes mgw or under',
'Goods over 3.5t. and under 7.5t',
'Goods 7.5 tonnes mgw and over',
'Other vehicle',
'Motorcycle - Scooter',
'Motorcycle',
'Motorcycle - Combination',
'Motorcycle over 125cc',
'Taxi (excluding private hire cars)',
'Car (including private hire cars)',
'Minibus/Motor caravan',
'Goods vehicle over 3.5 tonnes']



df_merge.Casualty_Type = df_merge.Casualty_Type.replace(values2,values1)


df_merge.Vehicle_Type = df_merge.Vehicle_Type.replace(values3,values4)

df_merge.Sex_of_Driver = df_merge.Sex_of_Driver.replace([1,2],['Male','Female'])

df_merge.Age_Band_of_Driver = df_merge.Age_Band_of_Driver.replace([1,2,3,4,5,6,7,8,9,10,11],['0 - 5','6 - 10','11 - 15',
                                                                                                 '16 - 20','21 - 25','26 - 35',
                                                                                              '36 - 45','46 - 55','56 - 65',
                                                                                              '66 - 75','Over 75'
                                                                                                ])
#df_merge.Engine_Capacity_(CC) = df_merge.Engine_Capacity_(CC).replace([1,2],['Male','Female']) -->drop column

df_merge.Propulsion_Code = df_merge.Propulsion_Code.replace([1,2,4,5,6,7,8,9],['Petrol',
                                                                    'Heavy oil',
                                                                    'Steam',
                                                                    'Gas',
                                                                    'Petrol/Gas (LPG)',
                                                                    'Gas/Bi-fuel',
                                                                    'Hybrid electric',
                                                                    'Fuel cells'])


In [None]:
df_merge.head()

In [None]:
all_sex_driver = df_merge.groupby(['Age_Band_of_Driver','Sex_of_Driver']).size().reset_index()

female = all_sex_driver[all_sex_driver['Sex_of_Driver'] == 'Female']
male = all_sex_driver[all_sex_driver['Sex_of_Driver'] == 'Male']
male['percentage'] = (male[0]/male[0].sum())*100
female = all_sex_driver[all_sex_driver['Sex_of_Driver'] == 'Female']
female['percentage'] = (female[0]/female[0].sum())*100
female['percentage']=female['percentage'].astype(str)
male['percentage']=male['percentage'].astype(str)
female['percentage'] = female['percentage'].str[:4]+'%'
male['percentage'] = male['percentage'].str[:4]+'%'


all_sex_casualty = df_merge.groupby(['Age_Band_of_Casualty','Sex_of_Casualty'])['Acc_Index'].count().reset_index()
female_cs = all_sex_casualty[all_sex_casualty['Sex_of_Casualty'] == 'Female']
male_cs = all_sex_casualty[all_sex_casualty['Sex_of_Casualty'] == 'Male']

female_cs['Percentage'] = (female_cs['Acc_Index']/(female_cs['Acc_Index'].sum())*100).astype(str).str[:4]+'%'
male_cs['Percentage'] = (male_cs['Acc_Index']/(male_cs['Acc_Index'].sum())*100).astype(str).str[:4]+'%'
gender_driver =  pd.concat([male,female])
gender_casualty = pd.concat([male_cs,female_cs])

In [None]:
'''
#FALTA PARSEAR LO DE LOS RANGOS DE EDAD EN EL EJE Y
fig, ax = plt.subplots(1, figsize=(15,10))
df_merge.groupby('Age_Band_of_Driver')['Sex_of_Driver'].value_counts(normalize=True).unstack().plot(kind='barh', stacked=False, color=['grey', 'orange'], ax=ax)
ax.legend(loc='best', bbox_to_anchor=(1,1),labels=['Male','Female'])


'''
''''''
from plotly.subplots import make_subplots
import plotly.graph_objects as go
fig = make_subplots(1,1)
import plotly.express as px

fig1 = px.sunburst(gender_driver, path=['Sex_of_Driver','Age_Band_of_Driver','percentage'], color='Sex_of_Driver')
fig1.update_layout(height=600, width=600, title_text="Age_Band_of_Driver vs Sex")
fig1.show()
#PORCENTAJE DRIVERS IMPLICADOS



In [None]:
df_merge.head()
prueba = df_merge['Age_Band_of_Casualty'].sort_values().reset_index()
import plotly.express as px
fig2 = px.histogram(prueba, x="Age_Band_of_Casualty")
fig2.update_layout(height=700, width=700, title_text="Age_Band_of_Casualty")
fig2.show()

In [None]:
#df_merge.Engine_Capacity_(CC) = df_merge.Engine_Capacity_(CC).replace([1,2],['Male','Female']) -->drop column
veh_type = df_merge.groupby(['Vehicle_Type','Sex_of_Driver'])['Acc_Index'].count().reset_index()
veh_type['Percentage'] = (veh_type['Acc_Index']/veh_type['Acc_Index'].sum()*100).sort_values(ascending=True).astype(str).str[:4]+'%'

veh_type

In [None]:
df_merge.head()

In [None]:
df_merge.hist()

In [None]:
%reset

### **Feature Engineering**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
cmap = cm.get_cmap
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder,LabelEncoder
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.metrics import mean_squared_error, mean_absolute_error, \
explained_variance_score,recall_score,f1_score,precision_score,roc_curve,roc_auc_score
from sklearn.linear_model import LogisticRegression  #regresión logistica en clasificación
from sklearn.tree import DecisionTreeClassifier




%matplotlib inline

In [3]:
dfa = pd.read_csv('D:\Descargas\Stats19-Data1979-2004\Accidents7904.csv',delimiter=',',encoding='UTF-8-SIG',index_col=0,nrows=40000)

In [4]:
dfa.head()

Unnamed: 0_level_0,Location_Easting_OSGR,Location_Northing_OSGR,Longitude,Latitude,Police_Force,Accident_Severity,Number_of_Vehicles,Number_of_Casualties,Date,Day_of_Week,...,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Special_Conditions_at_Site,Carriageway_Hazards,Urban_or_Rural_Area,Did_Police_Officer_Attend_Scene_of_Accident,LSOA_of_Accident_Location
Accident_Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
197901A11AD14,,,,,1,3,2,1,18/01/1979,5,...,-1,-1,1,8,1,-1,0,-1,-1,
197901A1BAW34,198460.0,894000.0,,,1,3,1,1,01/01/1979,2,...,-1,-1,4,8,3,-1,0,-1,-1,
197901A1BFD77,406380.0,307000.0,,,1,3,2,3,01/01/1979,2,...,-1,-1,4,8,3,-1,0,-1,-1,
197901A1BGC20,281680.0,440000.0,,,1,3,2,2,01/01/1979,2,...,-1,-1,4,8,3,-1,0,-1,-1,
197901A1BGF95,153960.0,795000.0,,,1,2,2,1,01/01/1979,2,...,-1,-1,4,3,3,-1,0,-1,-1,


In [5]:
dfa.shape

(40000, 31)

In [6]:
dfa.dtypes

Location_Easting_OSGR                          float64
Location_Northing_OSGR                         float64
Longitude                                      float64
Latitude                                       float64
Police_Force                                     int64
Accident_Severity                                int64
Number_of_Vehicles                               int64
Number_of_Casualties                             int64
Date                                            object
Day_of_Week                                      int64
Time                                            object
Local_Authority_(District)                       int64
Local_Authority_(Highway)                        int64
1st_Road_Class                                   int64
1st_Road_Number                                  int64
Road_Type                                        int64
Speed_limit                                      int64
Junction_Detail                                  int64
Junction_C

#### **Convert categorical variables to str**

In [7]:
dfa.isna().sum()/len(dfa)

Location_Easting_OSGR                          0.004150
Location_Northing_OSGR                         0.004150
Longitude                                      1.000000
Latitude                                       1.000000
Police_Force                                   0.000000
Accident_Severity                              0.000000
Number_of_Vehicles                             0.000000
Number_of_Casualties                           0.000000
Date                                           0.000000
Day_of_Week                                    0.000000
Time                                           0.000325
Local_Authority_(District)                     0.000000
Local_Authority_(Highway)                      0.000000
1st_Road_Class                                 0.000000
1st_Road_Number                                0.000000
Road_Type                                      0.000000
Speed_limit                                    0.000000
Junction_Detail                                0

In [8]:
dfa['Hour'] = pd.to_datetime(dfa['Time']).dt.hour
dfa.Hour.fillna(dfa.Hour.median(), inplace=True)

In [9]:
#for i,k in enumerate(accidents):
np.abs(dfa[dfa == -1].sum())/len(dfa)
    
#for i,k in enumerate(accidents):
   # accidents.drop(index=accidents[accidents[k] == -1].index, inplace=True)

Location_Easting_OSGR                               0.0
Location_Northing_OSGR                              0.0
Longitude                                           0.0
Latitude                                            0.0
Police_Force                                        0.0
Accident_Severity                                   0.0
Number_of_Vehicles                                  0.0
Number_of_Casualties                                0.0
Date                                                0.0
Day_of_Week                                         0.0
Time                                                0.0
Local_Authority_(District)                          0.0
Local_Authority_(Highway)                           0.0
1st_Road_Class                                      0.0
1st_Road_Number                                     0.0
Road_Type                                      0.055125
Speed_limit                                         0.0
Junction_Detail                                 

In [10]:
dfa.drop(['Did_Police_Officer_Attend_Scene_of_Accident','Urban_or_Rural_Area',
                'Junction_Control','2nd_Road_Class',
                'Latitude','Longitude','Location_Easting_OSGR',
                'Location_Northing_OSGR','LSOA_of_Accident_Location','Road_Type',
          'Junction_Detail','Junction_Control','2nd_Road_Class',
          'Pedestrian_Crossing-Physical_Facilities',
          'Pedestrian_Crossing-Human_Control','Urban_or_Rural_Area',
         'Special_Conditions_at_Site'
         ],
               inplace=True,axis=1)




In [11]:
dfa.drop(['Date'],inplace=True,axis=1)

In [12]:
#DROP MISSING VALUES =  -1
for i,k in enumerate(dfa):
    dfa.drop(index=dfa[dfa[k] == -1].index, inplace=True)

In [13]:
dfa.drop(['Time'],axis=1,inplace=True)

In [14]:
#dropna's
dfa.dropna(inplace=True)

In [15]:
'''dfa['Local_Authority_(Highway)'].value_counts(normalize=True)
# the 9999 it's not a fiable data, it's reference cannot be find at the legend excel and the percentage it's consiredable.
# we decided drop that variable.
dfa.drop(['Local_Authority_(Highway)'],inplace=True,axis=1)'''

"dfa['Local_Authority_(Highway)'].value_counts(normalize=True)\n# the 9999 it's not a fiable data, it's reference cannot be find at the legend excel and the percentage it's consiredable.\n# we decided drop that variable.\ndfa.drop(['Local_Authority_(Highway)'],inplace=True,axis=1)"

In [16]:
dfa.Hour = dfa.Hour.astype('int64')
dfa.columns

Index(['Police_Force', 'Accident_Severity', 'Number_of_Vehicles',
       'Number_of_Casualties', 'Day_of_Week', 'Local_Authority_(District)',
       'Local_Authority_(Highway)', '1st_Road_Class', '1st_Road_Number',
       'Speed_limit', '2nd_Road_Number', 'Light_Conditions',
       'Weather_Conditions', 'Road_Surface_Conditions', 'Carriageway_Hazards',
       'Hour'],
      dtype='object')

In [17]:
dfa.dtypes

Police_Force                  int64
Accident_Severity             int64
Number_of_Vehicles            int64
Number_of_Casualties          int64
Day_of_Week                   int64
Local_Authority_(District)    int64
Local_Authority_(Highway)     int64
1st_Road_Class                int64
1st_Road_Number               int64
Speed_limit                   int64
2nd_Road_Number               int64
Light_Conditions              int64
Weather_Conditions            int64
Road_Surface_Conditions       int64
Carriageway_Hazards           int64
Hour                          int64
dtype: object

In [119]:
cat_vars = ['Police_Force', 'Accident_Severity','Day_of_Week','1st_Road_Class',
       'Speed_limit','Light_Conditions',
       'Weather_Conditions', 'Road_Surface_Conditions', 'Carriageway_Hazards',
       'Hour']

dfa[cat_vars] = dfa[cat_vars].astype(str)
dfa.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12601 entries, 197901A1BGF95 to 197901JRNBW79
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Police_Force                12601 non-null  object
 1   Accident_Severity           12601 non-null  object
 2   Number_of_Vehicles          12601 non-null  int64 
 3   Number_of_Casualties        12601 non-null  int64 
 4   Day_of_Week                 12601 non-null  object
 5   Local_Authority_(District)  12601 non-null  object
 6   1st_Road_Class              12601 non-null  object
 7   Speed_limit                 12601 non-null  object
 8   2nd_Road_Number             12601 non-null  object
 9   Light_Conditions            12601 non-null  object
 10  Weather_Conditions          12601 non-null  object
 11  Road_Surface_Conditions     12601 non-null  object
 12  Carriageway_Hazards         12601 non-null  object
 13  Hour                        126

In [19]:
dfa.dtypes

Police_Force                  object
Accident_Severity             object
Number_of_Vehicles             int64
Number_of_Casualties           int64
Day_of_Week                   object
Local_Authority_(District)    object
Local_Authority_(Highway)     object
1st_Road_Class                object
1st_Road_Number               object
Speed_limit                   object
2nd_Road_Number               object
Light_Conditions              object
Weather_Conditions            object
Road_Surface_Conditions       object
Carriageway_Hazards           object
Hour                          object
dtype: object

In [120]:
encoder = OneHotEncoder(handle_unknown='ignore') # allows handling new values

dfa2 = encoder.fit_transform(dfa[cat_vars])

In [121]:
dfa2.todense()

matrix([[1., 0., 1., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]])

In [123]:
num_vars =['Number_of_Vehicles','Number_of_Casualties']
scaler = StandardScaler()
num_scaler = dfa[num_vars]=scaler.fit_transform(dfa[num_vars])

In [None]:
pd.set_option('display.max_columns', None)
# acc_clean.hist(figsize=(20,20))

In [None]:
# instantiate labelencoder object
le = LabelEncoder()

# apply le on categorical feature columns
dfa[cat_vars] = dfa[cat_vars].apply(lambda col: le.fit_transform(col))    
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()

#One-hot-encode the categorical columns.
#Unfortunately outputs an array instead of dataframe.
array_hot_encoded = ohe.fit_transform(dfa[cat_vars])

#Convert it to df
data_hot_encoded = pd.DataFrame(array_hot_encoded, index=dfa.index)

#Extract only the columns that didnt need to be encoded
data_other_cols = dfa.drop(columns=cat_vars)

#Concatenate the two dataframes : 
data_out = pd.concat([data_hot_encoded, data_other_cols], axis=1)

In [None]:
dfadata_hot_encoded

In [None]:
target = acc_clean['Accident_Severity']
features = acc_clean.drop(['Accident_Severity'],axis=1)

In [None]:
encoder = OneHotEncoder(handle_unknown='ignore') # allows handling new values

sparse = encoder.fit_transform(#HERE THE DATAFRAME.dropna().values.reshape(-1,1))
sparse

In [None]:
acc_clean['Local_Authority_(Highway)'].value_counts(normalize=True)

In [None]:
acc_clean.dropna(inplace=True)

In [None]:
acc_clean.describe().T

In [None]:
acc_clean.columns

In [None]:
acc_clean['Date'] = pd.to_datetime(acc_clean['Date'])

In [None]:
acc_clean.drop(['Time'],axis=1,inplace=True)

In [None]:
acc_clean.isna().sum()

In [None]:
dfa.columns[2:]

In [None]:
target = dfa[dfa.columns[1:2]]
features = dfa[dfa.columns[2:]]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(features, target)

In [None]:
lr = LogisticRegression()
tree = DecisionTreeClassifier()

lr.fit(X_train, y_train)
predictions = lr.predict(X_test)


In [None]:
print("Precision Score : ",precision_score(y_test, predictions, 
                                           pos_label='positive'
                                           average='micro'))
print("Recall Score : ",recall_score(y_test, predictions, 
                                           pos_label='positive'
                                           average='micro'))

In [116]:
def evaluate(X_test, ys_test, ys_hat):
    #plt.scatter(X_test, ys_test)
    #plt.scatter(X_test, ys_hat)

    mse = mean_squared_error(ys_test, ys_hat)
    mae = mean_absolute_error(ys_test, ys_hat)
    mape = np.mean(np.abs(ys_test - ys_hat) / ys_test)
    ev = explained_variance_score(ys_test, ys_hat)
    f1s = f1_score(y_test,ys_hat)

    return mse, mae, mape, ev, f1s

In [117]:
from numpy import mean
from numpy import std
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score,recall_score,f1_score,\
precision_score,roc_auc_score,roc_curve,mean_absolute_percentage_error
# define the location of the dataset
# load the dataset
dataset = read_csv(url, header=None)
# retrieve the array of data
data = dfa.values
# separate into input and output columns
X = data[:, 2:].astype(str)
y = data[:,1:2].astype(str)
# split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
#one-hot encode input variables
'''onehot_encoder = OneHotEncoder()
onehot_encoder.fit(X_train)
X_train = onehot_encoder.transform(X_train)
X_test = onehot_encoder.transform(X_test)'''
#ordinal encode target variable
'''label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)'''
# define the model
model = LogisticRegression()
# fit on the training set
model.fit(X_train, y_train)
# predict on test set
yhat = model.predict(X_test)
# evaluate predictions
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.2f' % (accuracy*100))

evaluate(X_test,y_test,yhat)

  return f(*args, **kwargs)


Accuracy: 81.29


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


UFuncTypeError: ufunc 'subtract' did not contain a loop with signature matching types (dtype('<U1'), dtype('<U1')) -> dtype('<U1')

In [128]:
!git add .

The file will have its original line endings in your working directory
The file will have its original line endings in your working directory


array([2, 2, 2, ..., 2, 2, 2])

## FRONT END

In [None]:
categorical_cols = ['Local_Authority_(Highway)'] 

# instantiate labelencoder object
le = LabelEncoder()

# apply le on categorical feature columns
acc_clean[categorical_cols] = acc_clean[categorical_cols].apply(lambda col: le.fit_transform(col))    
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()

#One-hot-encode the categorical columns.
#Unfortunately outputs an array instead of dataframe.
array_hot_encoded = ohe.fit_transform(data[categorical_cols])

#Convert it to df
data_hot_encoded = pd.DataFrame(array_hot_encoded, index=data.index)

#Extract only the columns that didnt need to be encoded
data_other_cols = data.drop(columns=categorical_cols)

#Concatenate the two dataframes : 
data_out = pd.concat([data_hot_encoded, data_other_cols], axis=1

In [None]:
# Define which columns should be encoded vs scaled
 
columns_to_encode = ['Police_Force', 'Accident_Severity', 'Number_of_Vehicles',
       'Number_of_Casualties', 'Day_of_Week', 'Local_Authority_(District)', 
        '1st_Road_Class', '1st_Road_Number',
       'Road_Type', 'Speed_limit', 'Junction_Detail', '2nd_Road_Number',
       'Pedestrian_Crossing-Human_Control',
       'Pedestrian_Crossing-Physical_Facilities', 'Light_Conditions',
       'Weather_Conditions', 'Road_Surface_Conditions',
       'Special_Conditions_at_Site', 'Carriageway_Hazards', 'Hour']

columns_to_scale  = ['Police_Force', 'Accident_Severity', 'Number_of_Vehicles',
       'Number_of_Casualties', 'Day_of_Week', 'Local_Authority_(District)', 
        '1st_Road_Class', '1st_Road_Number',
       'Road_Type', 'Speed_limit', 'Junction_Detail', '2nd_Road_Number',
       'Pedestrian_Crossing-Human_Control',
       'Pedestrian_Crossing-Physical_Facilities', 'Light_Conditions',
       'Weather_Conditions', 'Road_Surface_Conditions',
       'Special_Conditions_at_Site', 'Carriageway_Hazards', 'Hour']

# Instantiate encoder/scaler
scaler = StandardScaler()
ohe    = OneHotEncoder(sparse=False)
label_encoder = LabelEncoder()

x = acc_clean['Local_Authority_(Highway)']
y = label_encoder.fit_transform(x)
print(y)

encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
encoder.fit(data)
encoder.transform(new_data)


#Scale and Encode Separate Columns
scaled_columns  = scaler.fit_transform(acc_clean.columns) 
encoded_columns =    ohe.fit_transform(acc_clean.columns)

# Concatenate (Column-Bind) Processed Columns Back Together
#processed_data = np.concatenate([scaled_columns, encoded_columns], axis=1)

In [None]:
#Front END

In [None]:
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import plotly.graph_objects as go

app = dash.Dash(__name__)

app.layout = html.Div([
    html.P("Color:"),
    dcc.Dropdown(
        id="dropdown",
        options=[
            {'label': x, 'value': x}
            for x in ['Gold', 'MediumTurquoise', 'LightGreen']
        ],
        value='Gold',
        clearable=False,
    ),
    dcc.Graph(id="graph"),
])

@app.callback(
    Output("graph", "figure"), 
    [Input("dropdown", "value")])
def display_color(color):
    fig = go.Figure(
        data=go.Bar(y=[2, 3, 1], marker_color=color))
    return fig
app.run_server(host= '127.0.0.1',debug=False)

In [None]:
app.layout = html.Div(

    children=[

        html.H1(children="Avocado Analytics",),

        html.P(

            children="Analyze the behavior of avocado prices"

            " and the number of avocados sold in the US"

            " between 2015 and 2018",

        ),

        dcc.Graph(

            figure={

                "data": [

                    {

                        "x": data["Date"],

                        "y": data["AveragePrice"],

                        "type": "lines",

                    },

                ],

                "layout": {"title": "Average Price of Avocados"},

            },

        ),

        dcc.Graph(

            figure={

                "data": [

                    {

                        "x": data["Date"],

                        "y": data["Total Volume"],

                        "type": "lines",

                    },

                ],

                "layout": {"title": "Avocados Sold"},

            },

        ),

    ]

)