In [None]:
# Analysis and 

# *Exploratory Data Analysis*

#### **Import libraries**

In [None]:
# Notebook-wide imports and display configuration.
import plotly.tools as tls
import plotly as py
import plotly.graph_objs as go

# NOTE(review): wildcard import pulls every public matplotlib name into the
# notebook namespace; nothing in the visible cells appears to need it — confirm.
from matplotlib import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
# NOTE(review): `cmap` is bound to the function itself (not a colormap) and is
# not read by any visible cell; `cm.get_cmap` may be missing on recent
# Matplotlib releases — confirm against the installed version.
cmap = cm.get_cmap
import seaborn as sns
# NOTE(review): `modules` is not referenced in the visible cells — confirm it is needed.
import modules

%matplotlib inline
import io
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import streamlit as st
import cufflinks as cf
# Configure cufflinks (the `.iplot` accessor used below) for offline rendering.
cf.go_offline()
cf.set_config_file(theme='pearl',sharing='public',offline=True)

#### **Check encoding file**

In [None]:
import chardet

# Guess the file encoding from the first 10,000 bytes of the raw CSV.
with open("../../Accidents7904.csv", 'rb') as rawdata:
    head_bytes = rawdata.read(10000)
result = chardet.detect(head_bytes)

print(result)

#### **Import Data**

In [None]:
accidents=pd.read_csv('../../Accidents7904.csv',delimiter=',',encoding='UTF-8-SIG',low_memory=False)

In [None]:
accidents.shape

In [None]:
accidents.head()

In [None]:
accidents.info()

In [None]:
accidents.info()

In [None]:
accidents.describe().T

En general, podemos ver que en muchas de las variables del dataset Accidents una sola subcategoría acumula la mayor parte de los registros; por lo tanto, podemos afirmar que es un dataset desbalanceado. Es el caso, por ejemplo, de Accident_Severity, Road_Type, Junction_Control, Light_Conditions y Weather_Conditions, entre otras.

#### **Cleaning and Process the data**

##### **Checking percentage of missing or NaN values**

In [None]:
# Percentage of NaN cells in each column.
nan_pct = accidents.isna().mean() * 100
# Percentage of cells equal to the -1 sentinel in each column. Matches are
# counted with a boolean mask instead of summing the masked values, so the
# figure does not depend on the column dtype.
sentinel_pct = (accidents == -1).mean() * 100

print("NaN data: \n \n"," \n ",nan_pct,"%")
print("\n Missing or out of range data:\n",sentinel_pct,"%")

##### **Drop the useless columns**

In [None]:
columns_to_drop = ['Location_Easting_OSGR', 'Location_Northing_OSGR',
                    'Police_Force','Local_Authority_(District)',
                    'Local_Authority_(Highway)','1st_Road_Number', 
                    '2nd_Road_Number','Pedestrian_Crossing-Human_Control', 
                    'Pedestrian_Crossing-Physical_Facilities',
                    'Did_Police_Officer_Attend_Scene_of_Accident',
                    'LSOA_of_Accident_Location','Longitude','Latitude',
                    'Urban_or_Rural_Area','Junction_Control','2nd_Road_Class',
                    'Special_Conditions_at_Site','Carriageway_Hazards',
                    'Junction_Detail']

In [None]:
accidents.drop(labels=columns_to_drop,inplace=True,axis=1)

##### **Clean NaN values**

In [None]:
accidents.dropna(inplace=True)

Values equal to -1 denote missing or incorrect data.

##### **Clean missing values = - 1**

In [None]:
# Remove, in a single pass, every row that holds the -1 sentinel in any
# column (the same rows a column-by-column drop would remove).
rows_with_sentinel = (accidents == -1).any(axis=1)
accidents.drop(index=accidents[rows_with_sentinel].index, inplace=True)

In [None]:
accidents.shape

#### **Process the data**

The data is processed to make the analysis easier and to reduce bias.
This involves replacing numeric category codes with their original labels, generating new columns, dropping the columns we do not need, and creating helper functions that support these steps.


##### **Date, Month, Year, Hour**

In [None]:
# Parse the date strings and derive Month / Year / Hour columns from them.
# NOTE(review): no explicit format or dayfirst flag is passed; if the CSV
# stores dates as day/month/year, ambiguous dates could be read month-first —
# confirm against the raw file.
accidents['Date'] = pd.to_datetime(accidents['Date'])
date_accessor = accidents['Date'].dt

# English month names ('January' ... 'December') instead of 1-12.
accidents['Month'] = date_accessor.month_name()
# Calendar year as a plain integer column.
accidents['Year'] = date_accessor.year.astype(int)
# Hour of day taken from the HH:MM time string.
accidents['Hour'] = pd.to_datetime(accidents['Time'], format='%H:%M').dt.hour

##### **Drop Time column**

In [None]:
accidents.drop(['Time'],axis=1,inplace=True)

##### *Let's create the Daytime column with these values:*
- Commuting to work (05:00–09:59)
- Office hours (10:00–14:59)
- Commuting to home (15:00–18:59)
- Evening (19:00–22:59)
- Night (23:00–04:59)



In [None]:
# Map an hour of the day onto one of five named day periods.
def daytime(hour):
    """Return the day-period label for ``hour``.

    Periods are half-open ranges: 5-10 "Commuting to work", 10-15
    "Office hours", 15-19 "Commuting to home", 19-23 "Evening".
    Anything that falls in none of them is labelled "Night".
    """
    periods = (
        (5, 10, "Commuting to work"),
        (10, 15, "Office hours"),
        (15, 19, "Commuting to home"),
        (19, 23, "Evening"),
    )
    for start, stop, label in periods:
        if start <= hour < stop:
            return label
    return "Night"
accidents['Daytime'] = accidents['Hour'].apply(daytime)


In [None]:
accidents['Daytime'].value_counts(normalize=True)

In [None]:
print("\n",accidents['Light_Conditions'].value_counts(normalize=True))
print("\n",accidents['Road_Surface_Conditions'].value_counts(normalize=True))
print("\n",accidents['Weather_Conditions'].value_counts(normalize=True))

Como podemos ver el problema aquí es que muchas de las subcategorías apenas tienen datos e información que nos puedan aportar algo.
Por ello se crean nuevas columnas intentando cambiar un poco la estrategia y acumulando más porcentaje para cada categoría nueva creada.

As we can see, the problem here is that many of the subcategories contain hardly any data, so they contribute little information. For this reason, new columns are created that change the strategy slightly, grouping subcategories so that each new category accumulates a larger share of the records.


##### **Accident Conditions**

In [None]:
# Swap the integer codes of the three condition columns for readable labels.
light_labels = {
    1: 'Daylight',
    4: 'Darkness - lights lit',
    5: 'Darkness - lights unlit',
    6: 'Darkness - no lighting',
    7: 'Darkness - lighting unknown',
}
weather_labels = {
    1: 'Fine no high winds',
    2: 'Raining no high winds',
    3: 'Snowing no high winds',
    4: 'Fine + high winds',
    5: 'Raining + high winds',
    6: 'Snowing + high winds',
    7: 'Fog or mist',
    8: 'Other',
    9: 'Unknown',
}
road_surface_labels = {
    1: 'Dry',
    2: 'Wet or damp',
    3: 'Snow',
    4: 'Frost or ice',
    5: 'Flood over 3cm. deep',
    6: 'Oil or diesel',
    7: 'Mud',
}

accidents['Light_Conditions'] = accidents['Light_Conditions'].replace(light_labels)
accidents['Weather_Conditions'] = accidents['Weather_Conditions'].replace(weather_labels)
accidents['Road_Surface_Conditions'] = accidents['Road_Surface_Conditions'].replace(road_surface_labels)
# Helpers that collapse the detailed condition labels into Good / Bad.
def road_conditions(cond):
    """Return 'Good' for a 'Dry' road surface and 'Bad' for anything else."""
    return 'Good' if cond == 'Dry' else 'Bad'
accidents['Road_Surface_Conditions_2'] = accidents['Road_Surface_Conditions'].apply(road_conditions)

def weather_conditions(value1):
    """Return 'Good' only for 'Fine no high winds'; every other value is 'Bad'."""
    return 'Good' if value1 == 'Fine no high winds' else 'Bad'
accidents['Weather_Conditions_2'] =  accidents.Weather_Conditions.apply(weather_conditions)


def light_conditions(value2):
    """Return 'Good' for 'Daylight' and 'Bad' for every other value."""
    return 'Good' if value2 == 'Daylight' else 'Bad'
accidents['Light_Conditions_2'] = accidents['Light_Conditions'].apply(light_conditions)

In [None]:
print(accidents['Light_Conditions_2'].value_counts(normalize=True))
print(accidents['Road_Surface_Conditions_2'].value_counts(normalize=True))
print(accidents['Weather_Conditions_2'].value_counts(normalize=True))

In [None]:
accidents.drop(['Light_Conditions','Weather_Conditions','Road_Surface_Conditions'],inplace=True,axis=1)

##### **Speed**

In [None]:
def speed_groups(speed):
    """Return the 10-wide band label ("0-10" ... "80-90") containing ``speed``.

    Everything below 10 falls in "0-10". Values of 90 and above, or values
    that match no band, return None.
    """
    if speed < 10:
        return "0-10"
    for lower in range(10, 90, 10):
        upper = lower + 10
        if lower <= speed < upper:
            return f"{lower}-{upper}"
    return None
    
accidents['Speed_limit_2'] =  accidents['Speed_limit'].apply(speed_groups)
accidents['Speed_limit_2'].value_counts()

##### **Accident_Severity**

In [None]:
# Replace the severity codes with labels. The result is assigned back to the
# column: an in-place replace on `accidents[col]` acts on an intermediate
# Series and is not guaranteed to reach the DataFrame.
severity_labels = {1: 'Fatal', 2: 'Serious', 3: 'Slight'}
accidents['Accident_Severity'] = accidents['Accident_Severity'].replace(severity_labels)

accidents['Accident_Severity'].value_counts(normalize=True)

##### **Day_of_Week**

In [None]:
# Codes 1-7 map to Sunday-Saturday. The result is assigned back to the column
# because an in-place replace on `accidents[col]` may not reach the DataFrame.
days = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
accidents['Day_of_Week'] = accidents['Day_of_Week'].replace(dict(zip(range(1, 8), days)))
accidents.Day_of_Week.value_counts(normalize=True).sort_values(ascending=True)


##### **1st_Road_Class**

In [None]:
# Label the road-class codes, putting Motorway (1) and A(M) (2) in the same
# category. The result is assigned back to the column because an in-place
# replace on `accidents[col]` may not reach the DataFrame.
road_class_labels = {1: 'Motorway', 2: 'Motorway', 3: 'A', 4: 'B', 5: 'C', 6: 'Unclassified'}
accidents['1st_Road_Class'] = accidents['1st_Road_Class'].replace(road_class_labels)

accidents['1st_Road_Class'].value_counts(normalize=True)

##### **Road_Type**

In [None]:
# Label the road-type codes. The result is assigned back to the column because
# an in-place replace on `accidents[col]` may not reach the DataFrame.
road_type_labels = {
    1: 'Roundabout',
    2: 'One way street',
    3: 'Dual carriageway',
    6: 'Single carriageway',
    7: 'Slip road',
    9: 'Unknown',
    12: 'One way street/Slip road',
}
accidents['Road_Type'] = accidents['Road_Type'].replace(road_type_labels)
accidents.Road_Type.value_counts(normalize=True)

In [None]:
#accidents.to_csv('../../accidents_clean.csv',sep=',')

# Analysis and Visualization

In [None]:
# Casualties vs vehicles per accident; marker size encodes severity and
# colour encodes the speed-limit band.
sns.set_context("talk", font_scale=1.1)
fig, ax = plt.subplots(figsize=(14, 8))
sns.scatterplot(
    x="Number_of_Casualties",
    y="Number_of_Vehicles",
    size="Accident_Severity",
    sizes=(20, 500),
    alpha=0.5,
    hue="Speed_limit_2",
    data=accidents,
    ax=ax,
)
# One legend call only: a second plt.legend() would rebuild the legend and
# drop the anchor that places it outside the axes.
ax.legend(bbox_to_anchor=(1.01, 1), loc="upper left", fontsize=12)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
ax.set_xlabel("Number_of_Casualties", fontsize=15)
ax.set_ylabel("Number_of_Vehicles", fontsize=15)
ax.set_title("Scatter plot - Numerical Variables", fontsize=18, fontweight="bold")
plt.tight_layout()
sns.despine(ax=ax, top=True, right=True, left=True, bottom=False);
plt.show()

- Podemos ver que predominan los accidentes leves en vías con límites de velocidad comprendidos entre 70-80 mph (naranja), 60-70 mph (violeta) y 30-40 mph (azul).
- En cuanto a los vehículos implicados, se observa que los datos se acumulan entre 0 y 2; en cambio, en las víctimas los datos se reparten desde 0 hasta 40, con mayor concentración entre 0 y 10.

In [None]:
# Share of each severity class. The two columns are named explicitly because
# the names produced by a bare `value_counts().reset_index()` differ between
# pandas versions.
severity_share = (
    accidents['Accident_Severity']
    .value_counts(normalize=True)
    .rename_axis('Severity')
    .reset_index(name='Share')
)
fig = severity_share.iplot(
    kind='pie', dimensions=(750, 400),
    labels='Severity', values='Share',
    textinfo='percent+label', hole=0.4,
    color=['lightgreen', 'orange', 'red'], title='Accident Severity Chart',
    asFigure=True,
)
fig.update_layout(
    legend=dict(yanchor="top", y=1.15, xanchor="left", x=0.85),
    title_x=0.53, title_y=0.95,
)
fig.show()

- En este Gráfico vemos como la variable que nos hemos definido como target, está muy desbalanceada, sólo con un aprox. 2% de datos fatales, 73% en leves y 24% en graves.

In [None]:
# Index the data by date and resample by month to get the monthly total;
# a 12-month rolling mean (one window per year) is drawn on top of it.
# NOTE(review): recent pandas releases may prefer the 'ME' alias over 'M' —
# confirm against the installed version.

sns.set_style('white')
fig, ax = plt.subplots(figsize=(16,6))

# Resample once and reuse the result for both lines.
monthly_counts = accidents.set_index('Date').resample('M').size()
monthly_counts.plot(label='Total por Mes', color='grey', ax=ax)
monthly_counts.rolling(window=12).mean()\
              .plot(color='lightgreen', linewidth=5, label='Media mensual 12 meses', ax=ax)

ax.set_title('Accidents per Month', fontsize=18, fontweight='bold')
ax.legend(fontsize=10)
ax.set_xlabel('Year',fontsize=15)
ax.set_ylabel('Total counts\n',fontsize=15)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
sns.despine(ax=ax, top=True, right=True, left=True, bottom=False);

In [None]:
accidents.dtypes

- Los accidentes tienden a bajar a medida que van pasando los años, lo que podría indicar una clara mejora en los sistemas de seguridad de los automóviles.

- Se puede apreciar un pico alrededor de los años 90, que probablemente se deba a que fue una época en la que las compañías de vehículos comercializaron muchos coches accesibles al usuario medio, con gran cilindrada y con sistemas de seguridad menos eficientes que los actuales.

In [None]:
# Yearly totals: accident count plus summed vehicles and casualties.
df1 = accidents.groupby(['Year'])\
.agg({'Accident_Index':'count', 'Number_of_Vehicles': 'sum','Number_of_Casualties': 'sum',})\
.reset_index()
#-------------------------
sns.set_style("white")

x = df1.Year
labels = df1.Year
width = 0.5
Accidentcounts = df1['Accident_Index']
Casualtycounts =  df1['Number_of_Casualties']
fig,ax =  plt.subplots(figsize=(16,6))

# Two bars per year, side by side.
bar1 = ax.bar(x - width/2, Accidentcounts, width, label='Accident counts', color = 'paleturquoise');
bar2 = ax.bar(x + width/2, Casualtycounts, width, label='Casualty counts', color = 'slategrey');
# Highlight the peak year of each series. The position comes from the data
# instead of a fixed index, so it stays correct (and in range) if the set of
# years changes.
bar1[int(Accidentcounts.to_numpy().argmax())].set_color('moccasin')
bar2[int(Casualtycounts.to_numpy().argmax())].set_color('lightcoral')
ax.set_title('\nAccidents / Casualties \n per Year\n', fontsize=18, fontweight='bold')
ax.set_xlabel('\nYear',fontsize=15)
ax.set_ylabel('Total counts\n',fontsize=15)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
ax.legend()
ax.set_xticks(x)
sns.despine(ax=ax, top=True, right=True, left=True, bottom=False);
plt.show();

 - Como hemos visto previamente el año con más accidentes y víctimas es 1989.

### **En qué meses hay más accidentes**

In [None]:
# Accident count per calendar month, shown in calendar order.
df2 = accidents.groupby(['Month'])['Accident_Index'].count().reset_index()

months = ['January', 'February','March','April', 'May','June','July','August', 'September','October','November','December']
# An ordered categorical makes sort_values follow the calendar, not the alphabet.
df2['Month'] = pd.Categorical(df2['Month'], categories=months, ordered=True)
df2 = df2.sort_values(by='Month')


sns.set_style("white")

x = df2['Month']
y = df2['Accident_Index']
fig, ax =  plt.subplots(figsize=(16,6))
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
bar1 = ax.bar(x,y,color='cornflowerblue',linewidth=4)
# Highlight the month with the most accidents, located from the data rather
# than by a fixed bar position.
bar1[int(y.to_numpy().argmax())].set_color('tomato')
ax.set_title('Accidents per Month', fontsize=18, fontweight='bold')
ax.set_xlabel('\n Month',fontsize=15)
ax.set_ylabel('Total Count\n',fontsize=15)
sns.despine(ax=ax, top=True, right=True, left=True, bottom=False);
plt.show();

- El mes de octubre es el que más accidentes acumula, junto con noviembre. Podríamos pensar que se debe a la proximidad de la Navidad, ya que es un patrón que se repite todos los años.

## Qué dia de la semana hay más accidentes?

In [None]:
# Accident count per weekday, shown Sunday to Saturday.
df3 = accidents.groupby(['Day_of_Week'])['Accident_Index'].count().sort_values(ascending=False).reset_index()
days = ['Sunday', 'Monday', 'Tuesday','Wednesday', 'Thursday', 'Friday', 'Saturday']
# An ordered categorical makes sort_values follow the week, not the alphabet.
df3['Day_of_Week'] = pd.Categorical(df3['Day_of_Week'], categories=days, ordered=True)

df3 = df3.sort_values(by='Day_of_Week',ascending=True)


fig, ax = plt.subplots(figsize=(10,6))

barlist = ax.bar(df3['Day_of_Week'],df3['Accident_Index'],color='midnightblue')
# Highlight the weekday with the most accidents, located from the data rather
# than by a fixed bar position.
barlist[int(df3['Accident_Index'].to_numpy().argmax())].set_color('plum')

ax.set_title('\nAccidents per Weekday\n', fontsize=14, fontweight='bold')
ax.set_xlabel('\n Weekday',fontsize=15)
ax.set_ylabel('Total Count\n',fontsize=15)
# remove all spines except the bottom one
sns.despine(ax=ax, top=True, right=True, left=True, bottom=False);
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show();

> Claramente el viernes tiene sentido que sea cuando suceden más accidentes, cuando empieza el fin de semana.

### **Mapa de calor Weekday vs Daytime**

In [None]:
# Weekday x daytime matrix of accident counts, with rows in Sunday-Saturday order.
days = ['Sunday', 'Monday', 'Tuesday','Wednesday', 'Thursday', 'Friday', 'Saturday']
df4 = (
    accidents.groupby(['Day_of_Week', 'Daytime'])['Accident_Index']
    .count()
    .reset_index()
    .assign(Day_of_Week=lambda counts: pd.Categorical(counts['Day_of_Week'], categories=days, ordered=True))
    .pivot(index='Day_of_Week', columns='Daytime', values='Accident_Index')
)

In [None]:
# Render the weekday x daytime matrix `df4` built in the previous cell;
# recomputing it here would only repeat that cell's work.
fig = df4.iplot(kind="heatmap",colorscale="Greens",dimensions=(670,520),title='Heatmap Daytime vs Weekday',asFigure=True)

fig.update_layout(
    legend=dict(yanchor="top", y=1.15, xanchor="left", x=0.85),
    title_x=0.50,
)

In [None]:
import altair as alt
from vega_datasets import data

# Casualties summed per weekday, severity and light condition.
days = ['Sunday', 'Monday', 'Tuesday','Wednesday', 'Thursday', 'Friday', 'Saturday']
df5 = (
    accidents.groupby(['Day_of_Week','Accident_Severity','Light_Conditions_2'])['Number_of_Casualties']
    .sum()
    .reset_index()
)
df5['Day_of_Week'] = pd.Categorical(df5['Day_of_Week'], categories=days, ordered=True)

source = df5

# One circle per (weekday, severity, light) group: x is the casualty total,
# colour is the severity and size is the light condition.
casualty_chart = alt.Chart(source).mark_circle().encode(
    x=alt.X('Number_of_Casualties', scale=alt.Scale(zero=False)),
    y=alt.Y('Day_of_Week', sort=days, scale=alt.Scale(zero=False, padding=1)),
    color='Accident_Severity',
    size='Light_Conditions_2',
).properties(
    width=900,
    height=400,
    title='Acc_Severity by Daytime and Daylight',
)
casualty_chart.interactive()


In [None]:
# Histogram of accidents by hour of the day.
hour_hist_options = dict(
    kind='histogram',
    bins=40,
    theme="white",
    title="Accidents by Time",
    dimensions=(900,400),
    xTitle='Hour of the Day',
    yTitle='Count',
    colors="darkseagreen",
    asFigure=True,
)
fig = accidents['Hour'].iplot(**hour_hist_options)
fig.update_layout(title_x=0.5, title_y=0.85)
fig.show()

## Accident Severity vs Daytime

In [None]:
# Share of each severity class within every daytime period (each row sums to 1).
severity_by_daytime = (
    accidents.groupby('Daytime')['Accident_Severity']
    .value_counts(normalize=True)
    .unstack()
)
# No `ax=` is passed: iplot builds a plotly figure, so a leftover matplotlib
# axes from an earlier cell does not belong here.
fig = severity_by_daytime.iplot(
    title="Accident Severity  vs Daytime", kind='barh', dimensions=(750,400),
    barmode='stack', color=['red', 'orange', 'lightgreen'], asFigure=True,
)
fig.layout.xaxis.title = "Accidents Percentage"
fig.layout.yaxis.title = "DayTime"
fig.update_layout(title_x=0.5,title_y=0.85)
fig.show()

In [None]:
# Severity mix per speed-limit band, as horizontal bars: x is the share of
# accidents and y is the band.
severity_by_speed = (
    accidents.groupby(['Speed_limit_2'])['Accident_Severity']
    .value_counts(normalize=True)
    .unstack()
)
fig1 = severity_by_speed.iplot(kind='barh',dimensions=(750,400),color=['yellow', 'lightskyblue', 'lightgrey'],asFigure=True,title="Speed limit vs Accident Severity")
fig1.layout.xaxis.title = "Accidents - Percentage"
# Unit follows the earlier matplotlib draft of this chart, which labelled the
# axis in mph. NOTE(review): confirm the unit of Speed_limit in the source data.
fig1.layout.yaxis.title = "Speed Limit (mph)"
fig1.show()

# Total casualties per speed-limit band, as vertical bars: the band is on the
# x axis and the count on the y axis.
fig2 = accidents.groupby(['Speed_limit_2'])['Number_of_Casualties'].sum().iplot(kind='bar',dimensions=(750,400),color=['blue', 'grey', 'yellow'],asFigure=True,title="Speed limit vs Casualties")
fig2.layout.xaxis.title = "Speed Limit (mph)"
fig2.layout.yaxis.title = "Casualty count"
fig2.show()

In [None]:
# Subplots setup and layout
#figs = cf.subplots([fig1, fig2],shape=(1,2))
#figs['layout'].update(height=400, width=1000,
#                     title='Test',title_x=0.45,title_y=0.9)
#iplot(figs)

In [None]:
# Disabled draft: the whole cell is a string literal, so none of it runs.
# It sketches a seaborn countplot and an altair bar chart of severity per daytime.
'''fig,ax=plt.subplots(figsize=(12,8))
sns.countplot(data=accidents, x='Daytime',hue='Accident_Severity');

import altair as alt

dfg= accidents.groupby(['Daytime','Accident_Severity'])['Accident_Index'].count().reset_index()
source = dfg

alt.Chart(source).mark_bar().encode(
    x='Daytime:O',
    y='Accident_Index:Q',
    color='Accident_Severity:N',
    column=':N'
).properties(
    width=900,
    height=400).interactive()'''

In [None]:
accidents['Road_Type'].value_counts(normalize=True)

## VEHICLES

In [None]:
%reset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
cmap = cm.get_cmap
import seaborn as sns
pd.set_option('display.max_columns', None)
%matplotlib inline
import io

In [None]:
veh=pd.read_csv('../../Vehicles7904.csv',delimiter=',',encoding='UTF-8-SIG',usecols=[
    'Acc_Index','Vehicle_Type','Sex_of_Driver',
    'Age_Band_of_Driver','Engine_Capacity_(CC)','Age_of_Vehicle'
])

In [None]:
veh.head()

In [None]:
veh.describe().T

In [None]:
# Percentage of NaN cells and of -1 sentinel cells per column. Sentinel
# matches are counted with a boolean mask rather than by summing the masked
# values, so the text column does not affect the figure.
veh_nan_pct = veh.isna().mean() * 100
veh_sentinel_pct = (veh == -1).mean() * 100
print("NaN data: \n \n"," \n ",veh_nan_pct,"%")
print("\n Missing or out of range data:\n",veh_sentinel_pct,"%")

In [None]:
# drop missing or out of range values
#veh.drop(labels='Was_Vehicle_Left_Hand_Drive?',inplace=True,axis=1)
veh.shape

In [None]:
# Drop missing or out-of-range values: remove, in one pass, every row that
# holds the -1 sentinel in any column. Rows are addressed through `index=`
# only; a column axis argument has no place in a row drop.
veh_sentinel_rows = (veh == -1).any(axis=1)
veh.drop(index=veh[veh_sentinel_rows].index, inplace=True)

In [None]:
veh.describe().T

In [None]:
veh.shape

In [None]:
veh.head()

## CASUALTIES

In [None]:
cas=pd.read_csv('../../Casualty7904.csv',delimiter=',',encoding='UTF-8-SIG')

In [None]:
# Percentage of NaN cells and of -1 sentinel cells per column. Sentinel
# matches are counted with a boolean mask rather than by summing masked values.
cas_nan_pct = cas.isna().mean() * 100
cas_sentinel_pct = (cas == -1).mean() * 100
print("NaN data: \n \n"," \n ",cas_nan_pct,"%")
print("\n Missing or out of range data:\n",cas_sentinel_pct,"%")

In [None]:
cas.shape

In [None]:
print(cas.describe().T)

In [None]:
cas.columns

In [None]:
# Join casualties with vehicles on the accident identifier.
# NOTE(review): if one accident has several casualties and several vehicles,
# joining on Acc_Index alone pairs every casualty with every vehicle and
# inflates later counts. Joining on the vehicle reference as well would need
# that column in the vehicle `usecols` — confirm the intended grain.
df_merged = pd.merge(cas,veh,how='inner',on='Acc_Index')

In [None]:
del veh,cas
print(df_merged.columns)

In [None]:
print(df_merged.shape)
df_merged.head()

In [None]:
# Percentage of NaN cells and of -1 sentinel cells per column. Sentinel
# matches are counted with a boolean mask rather than by summing masked values.
merged_nan_pct = df_merged.isna().mean() * 100
merged_sentinel_pct = (df_merged == -1).mean() * 100
print("NaN data: \n \n"," \n ",merged_nan_pct,"%")
print("\n Missing or out of range data:\n",merged_sentinel_pct,"%")

In [None]:
# Columns removed from the merged frame before the analysis (each listed once).
merged_columns_to_drop = [
    'Casualty_Home_Area_Type',
    'Pedestrian_Road_Maintenance_Worker',
    'Pedestrian_Location',
    'Casualty_Class',
    'Car_Passenger',
    'Pedestrian_Movement',
    'Casualty_Severity',
    'Casualty_Reference',
    'Bus_or_Coach_Passenger',
    'Vehicle_Reference',
]
df_merged.drop(columns=merged_columns_to_drop, inplace=True)


In [None]:
df_merged.head()

In [None]:
# Remove, in one pass, every row that holds the -1 sentinel in any column.
# Rows are addressed through `index=` only; a column axis argument has no
# place in a row drop.
merged_sentinel_rows = (df_merged == -1).any(axis=1)
df_merged.drop(index=df_merged[merged_sentinel_rows].index, inplace=True)

In [None]:
# Drop unknown values of Sex_of_Driver (rows where the code is 3).
unknown_sex_rows = df_merged.loc[df_merged['Sex_of_Driver'] == 3].index
df_merged.drop(index=unknown_sex_rows, inplace=True)

In [None]:
df_merged.shape

In [None]:
df_merged.to_csv('../../df_merged_cat.csv',sep=',',index = True)

In [None]:
# Lookup tables that turn the integer codes of the merged frame into labels.
sex_labels = {1: 'Male', 2: 'Female'}

# Age-band codes 1-11, shared by the casualty and the driver columns.
age_band_labels = dict(zip(
    range(1, 12),
    ['0 - 5', '6 - 10', '11 - 15', '16 - 20', '21 - 25', '26 - 35',
     '36 - 45', '46 - 55', '56 - 65', '66 - 75', 'Over 75'],
))

# Casualty-type labels (values1), listed in the order of their codes (values2).
values1 = [
    'Pedestrian',
    'Cyclist',
    'Motorcycle 50cc and under rider or passenger',
    'Motorcycle 125cc and under rider or passenger',
    'Motorcycle over 125cc and up to 500cc rider or  passenger',
    'Motorcycle over 500cc rider or passenger',
    'Taxi/Private hire car occupant',
    'Car occupant',
    'Minibus (8 - 16 passenger seats) occupant',
    'Bus or coach occupant (17 or more pass seats)',
    'Horse rider',
    'Agricultural vehicle occupant',
    'Tram occupant',
    'Van / Goods vehicle (3.5 tonnes mgw or under) occupant',
    'Goods vehicle (over 3.5t. and under 7.5t.) occupant',
    'Goods vehicle (7.5 tonnes mgw and over) occupant',
    'Mobility scooter rider',
    'Electric motorcycle rider or passenger',
    'Other vehicle occupant',
    'Motorcycle - unknown cc rider or passenger',
    'Goods vehicle (unknown weight) occupant',
    'Motorcycle - Scooter rider or passenger',
    'Motorcycle rider or passenger',
    'Motorcycle - Combination rider or passenger',
    'Motorcycle over 125cc rider or passenger',
    'Taxi (excluding private hire cars) occupant',
    'Car occupant (including private hire cars)',
    'Minibus/Motor caravan occupant',
    'Goods vehicle (over 3.5 tonnes) occupant',
]
values2 = [
    0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 16, 17, 18, 19,
    20, 21, 22, 23, 90, 97, 98, 103, 104, 105,
    106, 108, 109, 110, 113,
]

# Vehicle-type codes (values3) and their labels (values4), in matching order.
values3 = [
    1, 2, 3, 10, 11, 17, 19, 20,
    21, 90, 103, 104, 105,
    106, 108, 109, 110, 113,
]
values4 = [
    'Pedal cycle',
    'Motorcycle 50cc and under',
    'Motorcycle 125cc and under',
    'Minibus (8 - 16 passenger seats)',
    'Bus or coach (17 or more pass seats)',
    'Agricultural vehicle',
    'Van / Goods 3.5 tonnes mgw or under',
    'Goods over 3.5t. and under 7.5t',
    'Goods 7.5 tonnes mgw and over',
    'Other vehicle',
    'Motorcycle - Scooter',
    'Motorcycle',
    'Motorcycle - Combination',
    'Motorcycle over 125cc',
    'Taxi (excluding private hire cars)',
    'Car (including private hire cars)',
    'Minibus/Motor caravan',
    'Goods vehicle over 3.5 tonnes',
]

casualty_type_labels = dict(zip(values2, values1))
vehicle_type_labels = dict(zip(values3, values4))

# Apply each lookup table to its column.
df_merged['Sex_of_Casualty'] = df_merged['Sex_of_Casualty'].replace(sex_labels)
df_merged['Age_Band_of_Casualty'] = df_merged['Age_Band_of_Casualty'].replace(age_band_labels)
df_merged['Casualty_Type'] = df_merged['Casualty_Type'].replace(casualty_type_labels)
df_merged['Vehicle_Type'] = df_merged['Vehicle_Type'].replace(vehicle_type_labels)
df_merged['Sex_of_Driver'] = df_merged['Sex_of_Driver'].replace(sex_labels)
df_merged['Age_Band_of_Driver'] = df_merged['Age_Band_of_Driver'].replace(age_band_labels)

In [None]:
# Count rows per (vehicle type, casualty type) pair and show them as a heatmap.
dfh = df_merged.groupby(['Vehicle_Type','Casualty_Type'])['Acc_Index'].count().reset_index()
vehicle_casualty_matrix = dfh.pivot(index='Vehicle_Type', columns=['Casualty_Type'], values='Acc_Index')
heatmap_options = dict(
    kind="heatmap",
    colorscale="Blues",
    dimensions=(1000,800),
    title='Heatmap Vehicle type vs Casualty type',
)
vehicle_casualty_matrix.iplot(**heatmap_options)



In [None]:
df_merged['Vehicle_Type'].value_counts(normalize=True)

In [None]:
import plotly.express as px

# Rows per (driver age band, driver sex) and their share of the total.
driver = df_merged.groupby(['Age_Band_of_Driver','Sex_of_Driver'])['Acc_Index'].count().reset_index()
# Format as a one-decimal percentage; slicing the string form of a float can
# cut numbers in the wrong place.
driver['%'] = (driver['Acc_Index'] / driver['Acc_Index'].sum() * 100).map('{:.1f}%'.format)

# `values` sizes each sector by its row count; without it every leaf would
# get the same size regardless of how many rows it represents.
fig1 = px.sunburst(driver, path=['Sex_of_Driver','Age_Band_of_Driver','%'],
                   values='Acc_Index', color='Sex_of_Driver')
fig1.update_layout(height=500, width=550, title_text="Age_Band_of_Driver vs Sex of Driver")
fig1.show()
# Percentage of drivers involved

In [None]:
'''casualty = df_merged.groupby(['Age_Band_of_Casualty','Sex_of_Casualty'])['Acc_Index'].count().reset_index
casualty['Percentage'] = (casualty['Acc_Index']/casualty['Acc_Index'].sum()*100).sort_values(ascending=True)


#y de esto
#casualty = casualty.sort_values(by='Percentage',ascending=True)'''

In [None]:
# Rows per (casualty age band, casualty sex). Built here so the plot does not
# depend on a variable that no executed cell defines.
casualty = df_merged.groupby(['Age_Band_of_Casualty','Sex_of_Casualty'])['Acc_Index'].count().reset_index()

fig,ax=plt.subplots(figsize=(12,6))
sns.barplot(data=casualty,x='Age_Band_of_Casualty',y='Acc_Index',hue='Sex_of_Casualty',ax=ax)
ax.set_title('Age_Band_of_Casualty', fontsize=21, fontweight='bold')
# The x axis shows age bands, so it is labelled as such.
ax.set_xlabel('\n Age band',fontsize=15)
ax.set_ylabel('Total Count\n',fontsize=15);

In [None]:
df_merged['Engine_Capacity_(CC)'].value_counts()

In [None]:
# 'Propulsion_Code' is not among the vehicle columns requested at load time,
# so it is only present if the casualty file supplies it; guard the lookup so
# the cell does not stop a full run.
if 'Propulsion_Code' in df_merged.columns:
    display(df_merged['Propulsion_Code'].value_counts())
else:
    print("Propulsion_Code is not available in df_merged")

In [None]:
df_merged['Age_of_Vehicle'].value_counts()

In [None]:
# Rows per (driver sex, vehicle type) and their share of the total, in percent.
# Reads `df_merged`, the merged frame built earlier in the notebook.
veh_type = df_merged.groupby(['Sex_of_Driver','Vehicle_Type'])['Acc_Index'].count().reset_index()
veh_type['Percentage'] = veh_type['Acc_Index'] / veh_type['Acc_Index'].sum() * 100

# TODO: the plot for this table is still missing.

In [None]:
#Front END

In [None]:
%%writefile app.py

accidents = 'accidents_clean.csv'
cas_veh = 'df_merged_cat.csv'
path ='../../'

import streamlit as st
import matplotlib.pyplot as plt
import altair as alt
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
cmap = cm.get_cmap
import seaborn as sns

import io
pd.set_option('display.max_columns', None)
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import streamlit as st
import cufflinks as cf
cf.set_config_file(theme='pearl',sharing='public',offline=True)

st.title("Accidents Visualization and Predictions")
st.markdown(
"""
This app is for visualizing the Accidents data for UK which is collected 
from the Academictorrents site https://academictorrents.com/details/c7d2d7a91ae3fd0256dd2ba2d7344960cb3c4dbb.

User can view EDA and predictions for every year and also for a global view.
"""
)

menu=['HOME','EDA','MODELLING']
choice=st.sidebar.selectbox('Menu',menu)
if choice=='HOME':
    #st.subheader('OTRO SUBHEADER')

    # Mostrar tablas de datos
    st.subheader("Datos utilizados")

    dfa = pd.read_csv(path + accidents,delimiter=',',encoding='UTF-8-SIG',index_col=0,nrows=100000)
    dfm = pd.read_csv(path + cas_veh,delimiter=',',encoding='UTF-8-SIG',index_col=0,nrows=100000)
    st.write('Accidents_csv')
    st.dataframe(dfa.head(5))
    st.write('Vehicles_Casualties_csv')
    st.dataframe(dfm.head(5))

fig = dfa['Accident_Severity'].value_counts(normalize=True)\
                    .reset_index().iplot(kind='pie',
                    labels='index',values='Accident_Severity',
                    textinfo='percent+label',hole=0.4,
                    color = ['lightgreen', 'orange','red'],title='Accident Severity Chart',
                    asFigure=True,   )                
                                        
fig.update_layout(legend=dict()

                 
                 )
st.plotly_chart(fig)




In [None]:
!streamlit run app.py

In [None]:


%%writefile app2.py

import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import altair as alt

st.title('Awesome streamlit app for kschool')

# Mostrar tablas de datos

brain=pd.read_csv("https://raw.githubusercontent.com/KaonToPion/datasets/main/brain.csv")

st.dataframe(brain)

# Texto:
st.markdown("""
## texto de ejemplo


otras cosas....
""")

st.text('solo soy un text')


fig,ax=plt.subplots(figsize=(3,3))
ax.scatter(brain['Body Weight'],brain['Brain Weight'])

st.pyplot(fig)


######ALTAIR#####


hist_brain = alt.Chart(brain).mark_bar().encode(
    x=alt.X('Brain Weight',bin=alt.Bin(maxbins=100)),
    y="count()"
).properties(
    width=300,
    height=150,
    title="Relación peso del cerebro y del cuerpo"
).interactive()

hist_body = alt.Chart(brain).mark_bar().encode(
    x=alt.X('Body Weight',bin=alt.Bin(maxbins=100)),
    y="count()"
).properties(
    width=300,
    height=150,
    title="Relación peso del cerebro y del cuerpo"
).interactive()

scatter_brain_body = alt.Chart(brain).mark_circle().encode(
    x='Body Weight',
    y='Brain Weight'
).properties(
    width=700,
    height=300,
    title="Relación peso del cerebro y del cuerpo"
).interactive()

comp_brain = (hist_brain|hist_body)&scatter_brain_body
comp_brain




In [None]:
!streamlit run app2.py