# Train Data Cleaning

In [66]:
import pandas as pd

In [67]:
def trainClean(path):
    """Read an accident CSV from ``path`` and return a cleaned DataFrame.

    The returned frame has:
      * the ``Bump`` and ``Roundabout`` columns removed;
      * ``Side`` replaced by ``RightLane`` (1 when ``Side == 'R'``, else 0);
      * every boolean column converted to integers (True -> 1, False -> 0);
      * ``timestamp`` replaced by two string columns, ``date`` and ``time``,
        holding its first and second whitespace-separated fields.
    """
    frame = pd.read_csv(path).drop(columns=['Bump', 'Roundabout'])

    # Encode the road side as a 0/1 flag, then discard the original column.
    frame['RightLane'] = frame['Side'].apply(lambda side: int(side == 'R'))
    frame = frame.drop(columns='Side')

    # Multiplying a boolean column by 1 turns it into an integer column.
    bool_columns = [name for name in frame.columns if frame[name].dtype == 'bool']
    for name in bool_columns:
        frame[name] = frame[name] * 1

    # Split each timestamp once and reuse the pieces for both new columns.
    stamp_parts = frame.timestamp.apply(lambda stamp: stamp.split())
    frame['date'] = stamp_parts.apply(lambda parts: parts[0])
    frame['time'] = stamp_parts.apply(lambda parts: parts[1])
    return frame.drop(columns='timestamp')

In [68]:
def fact_time(x):
    """Map a clock string such as ``'15:13:02'`` or ``'7:00'`` to a part-of-day code.

    The hour is taken from the first ``:``-separated field and rounded up by
    one when the minute field is strictly greater than 30. The resulting hour
    is bucketed as:

        0 -> 3 <= hour < 9
        1 -> 9 <= hour < 15
        2 -> 15 <= hour < 21
        3 -> anything else (including a rounded-up hour of 24)
    """
    fields = x.split(':')
    minute = int(fields[1])
    hour = int(fields[0])
    if minute > 30:
        hour += 1

    # The three daytime buckets are consecutive six-hour windows starting at 03:00.
    if 3 <= hour < 21:
        return (hour - 3) // 6
    return 3

# Load the training accidents and replace the clock string with its part-of-day code.
train_df = trainClean('train.csv')
train_df = train_df.assign(time=train_df['time'].map(fact_time))
train_df.head()

Unnamed: 0,ID,Lat,Lng,Distance(mi),Crossing,Give_Way,Junction,No_Exit,Railway,Stop,Amenity,Severity,RightLane,date,time
0,0,37.76215,-122.40566,0.044,0,0,0,0,0,0,1,2,1,2016-03-25,2
1,1,37.719157,-122.448254,0.0,0,0,0,0,0,0,0,2,1,2020-05-05,2
2,2,37.808498,-122.366852,0.0,0,0,0,0,0,1,0,3,1,2016-09-16,2
3,3,37.78593,-122.39108,0.009,0,0,1,0,0,0,0,1,1,2020-03-29,2
4,4,37.719141,-122.448457,0.0,0,0,0,0,0,0,0,2,1,2019-10-09,1


# Weather Data Cleaning

In [69]:
def single_n(n):
    """Return ``n`` as a string, prefixed with ``'0'`` when it is shorter than two characters."""
    text = str(n)
    return text if len(text) >= 2 else '0' + text

def weatherClean(path):
    """Load the hourly weather CSV at ``path`` and return one row per (Date, Time) bucket.

    Processing steps:
      * build a ``Date`` string ``<Year>-<MM>-<DD>`` (month and day zero-padded
        with ``single_n``) and a ``Time`` string ``<Hour>:00``;
      * drop the ``Month``, ``Day``, ``Year``, ``Hour`` and ``Selected`` columns;
      * fill missing values in numeric columns with the column mean;
      * collapse ``Weather_Condition`` text into the groups Rain / Fog / Cloud /
        Clear (values matching no keyword are left unchanged) and replace the
        column with the integer codes from ``pd.factorize``;
      * convert ``Time`` to its part-of-day code with ``fact_time``;
      * keep the first row for each (Date, Time) pair.

    Returns:
        The cleaned weather DataFrame.
    """
    weather_df = pd.read_csv(path)
    weather_df['Date'] = (
        weather_df['Year'].astype(str)
        + '-' + weather_df['Month'].apply(single_n)
        + '-' + weather_df['Day'].apply(single_n)
    )
    weather_df['Time'] = weather_df['Hour'].astype(str) + ':00'

    weather_df = weather_df.drop(columns=['Month', 'Day', 'Year', 'Hour', 'Selected'])

    # numeric_only=True: the frame still holds string columns (Date, Time,
    # Weather_Condition). Averaging them emits a FutureWarning on older pandas
    # and raises TypeError on pandas >= 2.0.
    weather_df = weather_df.fillna(weather_df.mean(numeric_only=True))

    # Checked in order, first match wins (Rain before Fog before Cloud before Clear).
    condition_groups = (
        ('Rain', ('Rain', 'Drizzle', 'Squalls')),
        ('Fog', ('Fog', 'Haze', 'Smoke', 'Mist')),
        ('Cloud', ('Cloud', 'Overcast')),
        ('Clear', ('Clear', 'Fair')),
    )

    def group_condition(value):
        """Return the coarse group for a raw condition, or the value itself if none matches."""
        text = str(value)
        for group, keywords in condition_groups:
            if any(keyword in text for keyword in keywords):
                return group
        return value

    weather_df['Weather_Condition'] = weather_df['Weather_Condition'].apply(group_condition)
    labels, _ = pd.factorize(weather_df['Weather_Condition'])
    weather_df['Weather_Condition'] = labels

    weather_df['Time'] = weather_df['Time'].apply(fact_time)

    # De-duplicate on the same key the accident frames are merged on. Keeping
    # only one row per Date would discard every other time bucket of that day,
    # so most accidents could never find a matching weather row.
    weather_df = weather_df.drop_duplicates(subset=['Date', 'Time'])

    return weather_df

# Clean the weather observations that will be joined onto the training accidents.
train_weather_df = weatherClean('weather-data.csv')
train_weather_df.head()

  weather_df.fillna(weather_df.mean(),inplace=True)


Unnamed: 0,Weather_Condition,Wind_Chill(F),Precipitation(in),Temperature(F),Humidity(%),Wind_Speed(mph),Visibility(mi),Date,Time
0,0,64.0,0.0,64.0,70.0,20.0,10.0,2020-07-27,2
1,1,59.762515,0.006444,71.1,57.0,9.2,10.0,2017-09-30,2
2,1,59.762515,0.006444,57.9,87.0,15.0,9.0,2017-06-27,0
3,0,59.762515,0.006444,66.9,73.0,4.6,10.0,2016-09-07,1
4,0,52.0,0.0,52.0,89.0,0.0,9.0,2019-10-19,3


# Holiday Data

In [70]:
# def holidayClean(path):
#     holiday_df = pd.read_xml(path)
#     holiday_df['description'] = 1
#     holiday_df = holiday_df.rename(columns={'description': 'holiday'})
#     return holiday_df

# train_holiday_df = holidayClean('holidays.xml')

In [71]:
def trainFinalMerge(df, wdf):
    """Join accidents with weather and return the training feature table.

    ``df`` is left-joined to ``wdf`` on (date, time) == (Date, Time). The join
    keys, the weather measurements, a handful of road-feature columns and
    ``ID`` are then removed, and any remaining missing values are replaced by
    the column mean. The holiday merge is currently disabled.
    """
    merged = df.merge(
        wdf,
        how='left',
        left_on=['date', 'time'],
        right_on=['Date', 'Time'],
    )

    unused_columns = [
        # join keys
        'Date', 'date', 'time', 'Time',
        # weather measurements
        'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Wind_Speed(mph)',
        'Precipitation(in)', 'Visibility(mi)', 'Weather_Condition',
        # road attributes left out of the model
        'Give_Way', 'No_Exit', 'Distance(mi)', 'Railway',
        # row identifier
        'ID',
    ]
    features = merged.drop(columns=unused_columns)
    return features.fillna(features.mean())

# Build the training feature table (the holiday frame is currently not merged in).
train_final_df = trainFinalMerge(train_df,train_weather_df)#,train_holiday_df)
# Correlation of every remaining feature with the target column.
print(train_final_df.corr()['Severity'])
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" to the cell output.
train_final_df.info()

Lat          0.099581
Lng          0.145313
Crossing    -0.090314
Junction    -0.068328
Stop         0.229269
Amenity     -0.078915
Severity     1.000000
RightLane    0.060545
Name: Severity, dtype: float64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6407 entries, 0 to 6406
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Lat        6407 non-null   float64
 1   Lng        6407 non-null   float64
 2   Crossing   6407 non-null   int32  
 3   Junction   6407 non-null   int32  
 4   Stop       6407 non-null   int32  
 5   Amenity    6407 non-null   int32  
 6   Severity   6407 non-null   int64  
 7   RightLane  6407 non-null   int64  
dtypes: float64(2), int32(4), int64(2)
memory usage: 350.4 KB
None


In [72]:
def testFinalMerge(df,wdf):#,hdf):
    final_df = df.merge(wdf,how='left',left_on=['date','time'],right_on=['Date','Time'])
    # final_df = final_df.merge(hdf,how='left',left_on=['Date'],right_on=['date'])
    # final_df['holiday'] = final_df['holiday'].fillna(0)
    final_df.fillna(final_df.mean(),inplace=True)
#     final_df.drop_duplicates(inplace=True)
    dropped_columns = ['Date','date','Give_Way','No_Exit','time','Temperature(F)','Time'
                       ,'Wind_Chill(F)','Humidity(%)','Wind_Speed(mph)','Distance(mi)','Precipitation(in)','Visibility(mi)',
                      'Railway','Weather_Condition']
    final_df = final_df.drop(columns=dropped_columns)
    final_df = final_df.drop_duplicates('ID')
    final_df.fillna(final_df.mean(),inplace=True)
    return final_df

# Model Evaluation


In [73]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation. The split gets its own
# name: rebinding `train_df` here would overwrite the cleaned accident frame
# that the trainFinalMerge cell reads, so that cell would fail if re-run
# after this one.
train_split_df, val_df = train_test_split(train_final_df, test_size=0.2, random_state=42) # Try adding `stratify` here

x_train = train_split_df.drop(columns=['Severity'])
y_train = train_split_df['Severity']

# NOTE: x_test / y_test hold the validation split, not the rows of test.csv.
x_test = val_df.drop(columns=['Severity'])
y_test = val_df['Severity']


In [74]:
from sklearn.ensemble import RandomForestClassifier

# Shallow random-forest baseline with a fixed seed; fit() hands back the
# fitted estimator, so construction and training are chained.
classifier = RandomForestClassifier(max_depth=2, random_state=42).fit(x_train, y_train)

In [75]:
# Score the classifier on the held-out validation split (x_test / y_test come from val_df).
print(classifier.score(x_test, y_test))

0.7425897035881436


In [76]:
# Run the unlabeled test accidents through the same cleaning steps as the training data.
test_df = trainClean('test.csv')
test_df = test_df.assign(time=test_df['time'].map(fact_time))

test_weather_df = weatherClean('weather-data.csv')
# test_holiday_df = holidayClean('holidays.xml')

test_final_df = testFinalMerge(test_df, test_weather_df)#,test_holiday_df)
test_final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1601 entries, 0 to 1600
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ID         1601 non-null   int64  
 1   Lat        1601 non-null   float64
 2   Lng        1601 non-null   float64
 3   Crossing   1601 non-null   int32  
 4   Junction   1601 non-null   int32  
 5   Stop       1601 non-null   int32  
 6   Amenity    1601 non-null   int32  
 7   RightLane  1601 non-null   int64  
dtypes: float64(2), int32(4), int64(2)
memory usage: 87.6 KB


  weather_df.fillna(weather_df.mean(),inplace=True)
  final_df.fillna(final_df.mean(),inplace=True)


In [77]:
# Feed the classifier exactly the columns it was fitted on, in the same order.
# Selecting by x_train.columns (instead of dropping only 'ID') keeps this cell
# re-runnable: after the first run test_final_df also carries 'Severity',
# which must not be passed to predict(). It also tracks any future change to
# the training features automatically.
test = test_final_df[x_train.columns]

y_test_predicted = classifier.predict(test)

test_final_df['Severity'] = y_test_predicted

test_final_df.head()

Unnamed: 0,ID,Lat,Lng,Crossing,Junction,Stop,Amenity,RightLane,Severity
0,6407,37.78606,-122.3909,0,1,0,0,1,2
1,6408,37.769609,-122.415057,0,0,0,0,1,2
2,6409,37.807495,-122.476021,0,0,0,0,1,2
3,6410,37.761818,-122.405869,0,1,0,0,1,2
4,6411,37.73235,-122.4141,0,0,0,0,1,2


In [78]:
# Write the ID / predicted Severity pairs to the submission file, without the DataFrame index.
test_final_df[['ID', 'Severity']].to_csv('submission.csv', index=False)