In [None]:
'''
ΙΩΑΝΝΗΣ ΚΩΝΣΤΑΝΤΑΚΟΣ mpked2215 
ΓΕΩΡΓΙΟΣ ΛΕΒΑΝΤΗΣ mpked2216

Η παρακάτω εργασία είναι η δεύτερη που δουλεύουμε και στέλνουμε και πραγματοποιήθηκε διότι στην 
πρώτη δεν βρήκαμε ουσία στο τρίτο ερώτημα (Classification) και στην ουσία όπως περιγράφουμε
αντι για classification κάναμε ένα recommendation system το οποίο δεν ξέρουμε με σιγουριά αν
καλύπτει το ερώτημα στο 100%.
Για την παρούσα εργασία θελήσαμε να εργαστούμε σε ολόκληρο το dataset για να προσεγγισουμε την
πραγματικότητα αλλά βρήκαμε αρκετές δυσκολίες λόγω αυτού.

'''

### IMPORT LIBRARIES

In [None]:
import numpy as np
import pandas as pd
import csv
import seaborn
from matplotlib import pyplot as plt
from matplotlib.ticker import StrMethodFormatter
plt.style.use('dark_background')
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN

### READ DATA

In [None]:
data = pd.read_csv('/Users/levantisgio/Desktop/accidents.csv')#, nrows=100000)

In [None]:
### Random Sampling
#subset = data.sample(n=100000)

### Export this DataFrame to a csv for easy access.
#subset.to_csv(r'/Users/levantisgio/Desktop/Sample_US_Accidents_Dec21.csv', index=False, header=True)  


###  DATA PREPROCESSING / EXPLORATION 

In [None]:
data.head(5)

In [None]:
data.tail()

In [None]:
# Report the dataset dimensions; the row count is printed with thousands separators.
print(f'The Sample Dataset Contains, Rows: {data.shape[0]:,d} & Columns: {data.shape[1]}')

In [None]:

data.columns

In [None]:
# Next, we need an overview of the data types for each column-
data.info()

In [None]:
# The above list ends by saying that there are 3 bool, 13 float64, 1 int64, 20 objects.

##### SUMMARY STATISTICS

In [None]:
# Summary statistics of the numeric columns.
# Authors' note: the data has 14 numeric columns while the rest are
# categorical variables.
# describe() must be the last expression of the cell, otherwise the notebook
# renders the trailing note instead of the statistics table.
data.describe()

In [None]:
# We need the total numbers for each column in descending order
data.isna().sum().sort_values(ascending = False)

In [None]:
# Fraction of missing values per column, largest first.
missing_percentages = data.isna().sum().sort_values(ascending=False).div(len(data))

# Display only the columns that actually contain missing values.
missing_percentages.loc[missing_percentages.ne(0)]



In [None]:
# Horizontal bar chart of the fraction of missing values per column
# (columns without missing values are left out).
# The former x='lab' / y='val' arguments were placeholder names that a Series
# plot does not use, so they are dropped and the axes are labelled instead.
missing_ax = missing_percentages[missing_percentages != 0].plot.barh()
missing_ax.set_xlabel('Fraction of rows with a missing value')
missing_ax.set_ylabel('Column')
plt.show()

In [None]:
# Annotated correlation heat-map of Severity against the location, distance
# and weather measurements.
seaborn.heatmap(
    data[[
        'Severity', 'Start_Lat', 'End_Lat', 'Distance(mi)', 'Temperature(F)',
        'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)',
        'Wind_Speed(mph)',
    ]].corr(),
    annot=True,
    cmap='RdYlGn',
    linewidths=0.2,
    annot_kws={'size': 15},
)
# Enlarge the current figure and its tick labels so the annotations stay readable.
fig = plt.gcf()
fig.set_size_inches(18, 15)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()

In [None]:
# Columns that are not used in the rest of the analysis.
exclude = [
    "End_Lat",
    "End_Lng",
    "Precipitation(in)",
    "Description",
    "Number",
    "Street",
    "Timezone",
    "Airport_Code",
    "Weather_Timestamp",
    "Civil_Twilight",
    "Nautical_Twilight",
    "Astronomical_Twilight",
]
# drop() returns a new frame; `data` is rebound to it.
data = data.drop(columns=exclude)

In [None]:
# Derive calendar features from the accident start time so that trends over
# years, months, hours, etc. can be examined later in the notebook.
#
#   acc_year   : year of Start_Time
#   acc_month  : month of Start_Time
#   acc_hr_day : hour of day of Start_Time
#   new_date   : calendar date of Start_Time
#
# (The earlier note also listed Time_Diff and Day, which this cell never created.)
#
# Start_Time is parsed once and reused; parsing the same string column for
# every derived feature repeats the most expensive step of this cell.
start_time = pd.to_datetime(data['Start_Time'])

data['acc_year'] = start_time.dt.year
data['acc_month'] = start_time.dt.month
data['acc_hr_day'] = start_time.dt.hour

data['new_date'] = start_time.dt.date

In [None]:
data['day_name'] = pd.to_datetime(data['Start_Time']).dt.day_name()

In [None]:
# Density plot of each numeric feature, one subplot per feature.
# 'Humidity(%)' used to be listed twice, which drew the same panel two times.
numeric_feats = ['Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)',
                 'Visibility(mi)', 'Wind_Speed(mph)', 'Severity']

fig, ax = plt.subplots(3, 3, figsize=(20, 20))
ax_flat = ax.flatten()

for idx, feat in enumerate(numeric_feats):
    data[feat].plot.density(ax=ax_flat[idx])
    ax_flat[idx].set_title(feat)

# The 3x3 grid has more cells than features; hide the empty ones.
for unused_ax in ax_flat[len(numeric_feats):]:
    unused_ax.set_visible(False)

In [None]:
data.Severity.unique()

In [None]:
data.Severity.value_counts(sort=True)

In [None]:
# Severity distribution: pie chart of the shares (left) and count plot (right).
severity_counts = data['Severity'].value_counts()

f, ax = plt.subplots(1, 2, figsize=(18, 8))

# Pull every wedge except the largest one out of the pie. The list is sized
# from the data instead of assuming exactly four severity levels.
explode = ([0] + [0.1] * len(severity_counts))[:len(severity_counts)]
severity_counts.plot.pie(explode=explode, autopct='%1.1f%%', ax=ax[0], shadow=True)
ax[0].set_title('Percentage Severity Distribution')
ax[0].set_ylabel('Count')

# The column is passed as x=...: current seaborn releases accept only `data`
# positionally, so countplot('Severity', data=data) raises a TypeError.
seaborn.countplot(x='Severity', data=data, ax=ax[1], order=severity_counts.index)
ax[1].set_title('Count of Severity')
plt.show()

In [None]:
# Correlation of every numeric/boolean column (the first column is skipped, as
# before) with Severity. Non-numeric columns are filtered out explicitly:
# pandas >= 2.0 no longer drops them silently inside corr() and raises instead.
data[data.columns[1:]].select_dtypes(include=['number', 'bool']).corr()['Severity']

##### Seeing which counties have the highest number of accidents

In [None]:
# Number of accident IDs per county, largest first.
(
    data.groupby('County')['ID']
    .count()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

##### Getting mean weather conditions for the counties with top accident counts

In [None]:
# Per-county accident count (ID) together with the mean of each weather
# measurement, sorted by accident count in descending order.
# The aggregation dict already names the columns to use; selecting them with a
# bare tuple (groupby(...)['a', 'b']) is no longer accepted by pandas >= 2.0.
temp = (
    data.groupby('County')
    .agg({
        'ID': 'count',
        'Temperature(F)': 'mean',
        'Wind_Chill(F)': 'mean',
        'Humidity(%)': 'mean',
        'Pressure(in)': 'mean',
        'Visibility(mi)': 'mean',
        'Wind_Speed(mph)': 'mean',
    })
    .reset_index()
    .sort_values(by='ID', ascending=False)
)

In [None]:
temp[temp['ID']>20000].head(20)


#### Checking the number of accidents per state

In [None]:
'''
- Data Analysis and Visualization:

Once the data is cleaned and prepped for analysis, we take some of the columns
and apply statistical methods to see the underlying picture come to the surface. 
The following columns have been analyzed below:

State, City, Start Time, Temperature, Weather Condition, 
Visibility, Crossing, Traffic Signal and Bump.

'''

In [None]:
# Accidents per state: horizontal bars for the ten states with the most
# accidents (left) and a pie of those ten plus an aggregated 'Other' slice (right).
# value_counts() is computed once and shared, so both panels use the same top ten.
count = data['State'].value_counts()
top_states = count[:10]

fig, ax = plt.subplots(1, 2, figsize=(15, 8))
clr = ("blue", "green", "grey", "red", "purple", 'pink', 'yellow', 'orange', 'darkblue', 'maroon')
top_states.sort_values().plot(kind='barh', color=clr, ax=ax[0])
ax[0].set_title("Top 10 Accident Prone States", size=20)
# In a horizontal bar chart the x axis carries the counts and the y axis the states.
ax[0].set_xlabel('Number of Accidents', size=18)
ax[0].set_ylabel('State', size=18)

groups = list(top_states.index)
counts = list(top_states)
# Everything outside the top ten is shown as a single 'Other' slice.
counts.append(count.sum() - top_states.sum())
groups.append('Other')
type_dict = pd.DataFrame({"group": groups, "counts": counts})
clr1 = ('brown', 'darksalmon', 'orange', 'hotpink', 'cadetblue', 'purple', 'red', 'gold', 'forestgreen', 'blue', 'plum')
qx = type_dict.plot(kind='pie', y='counts', labels=groups, colors=clr1, autopct='%1.1f%%',
                    pctdistance=0.9, radius=1.2, ax=ax[1])
ax[1].legend(loc=0, bbox_to_anchor=(1.15, 0.4))
plt.subplots_adjust(wspace=0.5, hspace=0)
plt.ioff()
ax[1].set_ylabel('')

#### Checking the number of accidents per City

In [None]:
# Bar chart of the 20 cities with the highest accident counts.
fig, ax = plt.subplots(figsize=(16, 7))
(
    data['City']
    .value_counts()
    .sort_values(ascending=False)
    .head(20)
    .plot.bar(ax=ax, width=0.5, edgecolor='k', align='center', linewidth=2)
)
ax.set_xlabel('Cities', fontsize=20)
ax.set_ylabel('Number of Accidents', fontsize=20)
ax.tick_params(labelsize=20)
ax.set_title('Top 20 Cities with the most accidents', fontsize=25)
ax.grid()
plt.ioff()

#### Checking the weather conditions that are most common

In [None]:
# Bar chart of the five weather conditions with the highest accident counts.
fig, ax = plt.subplots(figsize=(16, 7))
(
    data['Weather_Condition']
    .value_counts()
    .sort_values(ascending=False)
    .head(5)
    .plot.bar(ax=ax, width=0.5, edgecolor='k', align='center', linewidth=2)
)
ax.set_xlabel('Weather_Condition', fontsize=20)
ax.set_ylabel('Number of Accidents', fontsize=20)
ax.tick_params(labelsize=20)
ax.set_title('5 Top Weather Condition for accidents', fontsize=25)
ax.grid()
plt.ioff()

### Exploring the target feature which is Severity


#### At what time are the accidents most common along with severity

In [None]:
# Accident counts per (hour of day, severity level).
# The hours come from the already-derived acc_hr_day column (no extra parse of
# Start_Time) and are sorted explicitly instead of relying on set ordering.
hours = sorted(data['acc_hr_day'].dropna().unique().tolist())

# One grouped count replaces a full-frame filter per hour and severity level.
# Missing (hour, severity) combinations are filled with 0.
hour_severity = (
    data.groupby(['acc_hr_day', 'Severity'])['ID']
    .count()
    .unstack(fill_value=0)
    .reindex(index=hours, columns=[1, 2, 3, 4], fill_value=0)
)

# Plain lists aligned with `hours`, as consumed by the plotting cell.
severity_1_hours = hour_severity[1].tolist()
severity_2_hours = hour_severity[2].tolist()
severity_3_hours = hour_severity[3].tolist()
severity_4_hours = hour_severity[4].tolist()

In [None]:
# Overlay the per-hour accident counts of each severity level in one bar chart.
# The series are drawn in this order, later ones on top of earlier ones.
plt.figure(figsize=(20, 15))

for _label, _heights in (
    ('Severity 2', severity_2_hours),
    ('Severity 3', severity_3_hours),
    ('Severity 4', severity_4_hours),
    ('Severity 1', severity_1_hours),
):
    plt.bar(hours, _heights, label=_label)

plt.legend()

#### Severity with state

In [None]:
states = data.State.unique()

In [None]:
# Number of accident IDs per state, in the order returned by State.unique().
# A single grouped count replaces one full-frame filter per state.
count_by_state = (
    data.groupby('State')['ID']
    .count()
    .reindex(data.State.unique(), fill_value=0)
    .tolist()
)

In [None]:
# Accident counts per (state, severity level), aligned with `states`.
# One grouped count replaces a full-frame filter per state and severity level.
# Missing (state, severity) combinations are filled with 0.
state_severity = (
    data.groupby(['State', 'Severity'])['ID']
    .count()
    .unstack(fill_value=0)
    .reindex(index=states, columns=[1, 2, 3, 4], fill_value=0)
)

severity_1_state = state_severity[1].tolist()
severity_2_state = state_severity[2].tolist()
severity_3_state = state_severity[3].tolist()
severity_4_state = state_severity[4].tolist()

In [None]:
# Authors' observation: in almost all of the states, accidents of severity 2
# are the most frequent, followed by severity 3.
# The series are overlaid in this order, later ones on top of earlier ones.
plt.figure(figsize=(20, 15))

for _label, _heights in (
    ('Severity 2', severity_2_state),
    ('Severity 3', severity_3_state),
    ('Severity 4', severity_4_state),
    ('Severity 1', severity_1_state),
):
    plt.bar(states, _heights, label=_label)

plt.legend()

#### Severity with temperature

In [None]:
# Mean temperature of the accidents at each severity level (1 to 4).
Temperature = data['Temperature(F)']
severity_1_data, severity_2_data, severity_3_data, severity_4_data = (
    data.loc[data['Severity'] == level, 'Temperature(F)'].mean()
    for level in (1, 2, 3, 4)
)

# Labels and values in matching order for the bar plot.
severity_labels = ['Severity 1', 'Severity 2', 'Severity 3', 'Severity 4']
Mean_temp_by_severity = [severity_1_data, severity_2_data, severity_3_data, severity_4_data]

In [None]:
# Bar plot of the mean temperature per severity level.
plt.figure(figsize=(16, 6))
# x and y are passed by keyword: current seaborn releases accept only `data`
# positionally, so barplot(labels, values) no longer works.
seaborn.barplot(x=severity_labels, y=Mean_temp_by_severity)
plt.grid(color='black', linestyle='-', linewidth=1, alpha=0.3)

#### Severity with weather

In [None]:
Weather = data.Weather_Condition.value_counts()

In [None]:
# Accident counts per (weather condition, severity level), aligned with
# Weather.index (conditions ordered by total accident count).
# One grouped count replaces a full-frame filter per condition and severity.
# Missing (condition, severity) combinations are filled with 0.
weather_severity = (
    data.groupby(['Weather_Condition', 'Severity'])['ID']
    .count()
    .unstack(fill_value=0)
    .reindex(index=Weather.index, columns=[1, 2, 3, 4], fill_value=0)
)

severity_1_Weather = weather_severity[1].tolist()
severity_2_Weather = weather_severity[2].tolist()
severity_3_Weather = weather_severity[3].tolist()
severity_4_Weather = weather_severity[4].tolist()

In [None]:
# Overlaid severity counts for the first ten entries of Weather.index.
plt.figure()
plt.xticks(rotation=90)
for _label, _heights in (
    ('Severity 2', severity_2_Weather),
    ('Severity 3', severity_3_Weather),
    ('Severity 4', severity_4_Weather),
    ('Severity 1', severity_1_Weather),
):
    plt.bar(Weather.index[:10], _heights[:10], label=_label)
plt.legend()

In [None]:
# Overlaid severity counts for entries 10 to 19 of Weather.index.
plt.figure()
plt.xticks(rotation=90)
for _label, _heights in (
    ('Severity 2', severity_2_Weather),
    ('Severity 3', severity_3_Weather),
    ('Severity 4', severity_4_Weather),
    ('Severity 1', severity_1_Weather),
):
    plt.bar(Weather.index[10:20], _heights[10:20], label=_label)
plt.legend()

In [None]:
# Share (in percent) of each severity level within every weather condition.
# Weather is indexed by condition name, so Weather[i] with an integer i relied
# on the deprecated positional fallback; the totals are taken positionally
# from the underlying array instead.
weather_totals = Weather.to_numpy()

percentage_severity_1 = [(n / total) * 100 for n, total in zip(severity_1_Weather, weather_totals)]
percentage_severity_2 = [(n / total) * 100 for n, total in zip(severity_2_Weather, weather_totals)]
percentage_severity_3 = [(n / total) * 100 for n, total in zip(severity_3_Weather, weather_totals)]
percentage_severity_4 = [(n / total) * 100 for n, total in zip(severity_4_Weather, weather_totals)]

In [None]:
# Authors' observation: extreme weather conditions such as light snow showers,
# thunder, light ice pellets, heavy thunderstorms etc. show a much higher share
# of severe accidents than relatively normal conditions.
plt.figure(figsize=(20, 10))
plt.xticks(fontsize=11, rotation=90)
for _label, _shares in (
    ('Severity 2', percentage_severity_2),
    ('Severity 3', percentage_severity_3),
    ('Severity 4', percentage_severity_4),
    ('Severity 1', percentage_severity_1),
):
    plt.bar(Weather.index, _shares, label=_label)
plt.legend(loc=1, prop={'size': 11})

#### Severity by Year

In [None]:
# Examine data
data.groupby(["acc_year","Severity"]).size().unstack()

In [None]:
Years = data.acc_year.unique()

In [None]:
# Number of accident IDs per year, in the order returned by acc_year.unique().
# A single grouped count replaces one full-frame filter per year.
count_by_acc_year = (
    data.groupby('acc_year')['ID']
    .count()
    .reindex(data.acc_year.unique(), fill_value=0)
    .tolist()
)

In [None]:
# Accident counts per (year, severity level), aligned with `Years`.
# One grouped count replaces a full-frame filter per year and severity level.
# Missing (year, severity) combinations are filled with 0.
year_severity = (
    data.groupby(['acc_year', 'Severity'])['ID']
    .count()
    .unstack(fill_value=0)
    .reindex(index=Years, columns=[1, 2, 3, 4], fill_value=0)
)

severity_1 = year_severity[1].tolist()
severity_2 = year_severity[2].tolist()
severity_3 = year_severity[3].tolist()
severity_4 = year_severity[4].tolist()

# Alternative kept from the original cell (stacked bars straight from pandas):
#data.groupby(["acc_year","Severity"]).size().unstack().plot(kind='bar',stacked=True)

In [None]:
# Overlay the per-year accident counts of each severity level in one bar chart.
# The series are drawn in this order, later ones on top of earlier ones.
plt.figure(figsize=(15, 10))

for _label, _heights in (
    ('Severity 2', severity_2),
    ('Severity 3', severity_3),
    ('Severity 4', severity_4),
    ('Severity 1', severity_1),
):
    plt.bar(Years, _heights, label=_label)

plt.legend()

### CLUSTERING

In [None]:
acc = pd.read_csv('/Users/levantisgio/Desktop/accidents.csv')#, nrows=100000)#sbise to nrows

In [None]:
# Remove every column that is not used as a clustering/classification input.
acc = acc.drop(
    columns=[
        'ID', 'Start_Lat', 'Start_Lng', 'End_Lat', 'Start_Time', 'End_Lng', 'End_Time',
        'Precipitation(in)', 'Description', 'Number', 'Street', 'Side', 'City',
        'Country', 'State', 'Zipcode', 'County', 'Timezone', 'Airport_Code',
        'Weather_Timestamp', 'Temperature(F)', 'Wind_Chill(F)', 'Wind_Direction',
        'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'Traffic_Calming',
        'Traffic_Signal', 'No_Exit', 'Railway', 'Turning_Loop', 'Roundabout',
        'Station', 'Stop', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
        'Astronomical_Twilight',
    ]
)

In [None]:
acc 
# TODO: should the NaN values be filled with the column average instead?

In [None]:
acc['Weather_Condition'].unique() # not usable as a feature in this form

In [None]:
acc=acc.drop('Weather_Condition',axis=1)

In [None]:
print(acc['Severity'].value_counts())


In [None]:
acc.info()

In [None]:
seaborn.countplot(x='Severity',data=acc)

In [None]:
acc=acc.dropna() 

In [None]:
myVal=acc['Severity']

In [None]:
myVal

In [None]:
acc=acc.drop('Severity',axis=1)

In [None]:
acc

#### NORMALIZATION (MIN-MAX SCALING)

In [None]:
scaler = MinMaxScaler()

In [None]:
# Fit the MinMaxScaler on the feature frame and scale it.
# NOTE(review): the next cell fits the scaler again and rebinds `mydata` to a
# DataFrame, so this assignment looks redundant.
mydata=scaler.fit_transform(acc)

In [None]:
mydata = pd.DataFrame(scaler.fit_transform(acc.values), columns=acc.columns, index=acc.index)

#### K-MEANS

In [None]:
# K-Means with one cluster per severity level (authors' note: must be 4 once
# the full dataset is loaded).
# random_state makes the cluster assignment reproducible between runs (same
# seed as the train/test split). n_init is set explicitly so the number of
# restarts does not depend on the installed scikit-learn version.
machine = KMeans(n_clusters=4, n_init=10, random_state=42)

In [None]:
machine.fit(mydata)

In [None]:
machine.labels_

In [None]:
machine.cluster_centers_

In [None]:

plt.scatter(mydata['Distance(mi)'],mydata['Humidity(%)'],c=machine.labels_,cmap='rainbow')
plt.title('K-MEANS')

In [None]:
mydata['Severity']=myVal

In [None]:
plt.scatter(mydata['Distance(mi)'],mydata['Humidity(%)'],c=mydata['Severity'],cmap='rainbow')
plt.title('Original')

#### DBSCAN

In [None]:
machine = DBSCAN(eps=0.3, min_samples=5)

In [None]:
mydata=mydata.drop('Severity',axis=1)

In [None]:
machine.fit(mydata)

In [None]:
DBlabels = machine.labels_

In [None]:
DBlabels

In [None]:
# DBSCAN cluster labels on the Distance / Humidity plane.
# The notebook imports pyplot only as `plt`; the name `matplotlib` is never
# bound, so matplotlib.pyplot.* raised a NameError here.
plt.scatter(mydata['Distance(mi)'], mydata['Humidity(%)'], c=DBlabels, cmap='rainbow')
plt.title('DB-SCAN')

In [None]:
# DBSCAN cluster labels on the Visibility / Distance plane.
# The notebook imports pyplot only as `plt`; the name `matplotlib` is never
# bound, so matplotlib.pyplot.* raised a NameError here.
plt.scatter(mydata['Visibility(mi)'], mydata['Distance(mi)'], c=DBlabels, cmap='rainbow')
plt.title('DB-SCAN')

### CLASSIFICATION

##### Train/Test Split (67% train / 33% test)

In [None]:
# Split the scaled features (mydata) and the Severity target (myVal):
# 33% of the rows are held out for testing, the rest is used for training.
# random_state=42 makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(mydata, myVal, test_size=0.33, random_state=42)

In [None]:
# The final dataset is split and a logistic-regression model is built,
# because what we want is a classifier (Severity has 4 classes).

#### LOGISTIC REGRESSION

In [None]:
machine = LogisticRegression()

In [None]:
machine.fit(X_train,y_train)

In [None]:
predictions = machine.predict(X_test)

In [None]:
print(classification_report(y_test,predictions))

In [None]:
print(confusion_matrix(y_test,predictions))

#### K_NEAREST (DOES NOT RUN)

In [None]:
# machine = KNeighborsClassifier(n_neighbors = 15)

In [None]:
# machine.fit(X_train,y_train)

In [None]:
# predictions  = machine.predict(X_test)

In [None]:
# print(classification_report(y_test,predictions))

In [None]:
# print(confusion_matrix(y_test,predictions))

#### DECISION TREE

In [None]:
# Decision tree classifier for the Severity target.
# random_state is fixed (same seed as the train/test split) so that the
# reported metrics are reproducible between runs.
machine = DecisionTreeClassifier(random_state=42)

In [None]:
machine.fit(X_train,y_train)

In [None]:
predictions = machine.predict(X_test)

In [None]:
print(classification_report(y_test,predictions))

In [None]:
print(confusion_matrix(y_test,predictions))

#### RANDOM FOREST (DOES NOT RUN)

In [None]:
# machine = RandomForestClassifier(n_estimators=100)

In [None]:
# machine.fit(X_train, y_train)

In [None]:
# predictions = machine.predict(X_test)

In [None]:
# print(classification_report(y_test,predictions))

In [None]:
# print(confusion_matrix(y_test,predictions))

#### SVM (DOES NOT RUN)

In [None]:
#machine = SVC()

In [None]:
#machine.fit(X_train,y_train)

In [None]:
#predictions = machine.predict(X_test)

In [None]:
#print(classification_report(y_test,predictions))

In [None]:
#print(confusion_matrix(y_test,predictions))