In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input mount and echo the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root, name) for name in files]
    for path in full_paths:
        print(path)

# Any results you write to the current directory are saved as output.

In [None]:
# Load the competition's training and test tables (train first, then test).
train_csv = '/kaggle/input/bigquery-geotab-intersection-congestion/train.csv'
test_csv = '/kaggle/input/bigquery-geotab-intersection-congestion/test.csv'
X_train, X_test = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))

In [None]:
# Preview the first rows of the training table.
X_train.head()

In [None]:
# List the column names of the raw training table.
X_train.columns

In [None]:
# Number of distinct values in each training column.
X_train.nunique()

In [None]:
# Number of missing values in each training column.
X_train.isna().sum()

**Add feature:**

Encode compass headings (N, NE, E, SE, S, SW, W, NW) as $\frac{\theta}{\pi}$, where $\theta$ is the angle measured clockwise from north, in radians.



In [None]:
# Lookup from compass label to heading, expressed as a fraction of pi
# (angle grows clockwise starting at north, in steps of a quarter).
directions = dict(
    N=0,
    NE=1/4,
    E=1/2,
    SE=3/4,
    S=1,
    SW=5/4,
    W=3/2,
    NW=7/4,
)

In [None]:
# Swap the compass labels in both heading columns for their numeric encoding from
# `directions`; the same lookup is applied to the training and the test table.
# Note: this overwrites the label columns, so the cell must run exactly once per load.
for frame in (X_train, X_test):
    for heading_col in ('EntryHeading', 'ExitHeading'):
        frame[heading_col] = frame[heading_col].map(directions)

In [None]:
def _signed_turn(entry, exit_):
    """Return ``entry - exit_`` wrapped into the half-open range [-1, 1).

    Both inputs are headings in units of pi, as produced by the `directions`
    lookup (values in [0, 2)). A plain subtraction ignores that headings are
    circular: entering NW (7/4) and leaving N (0) would give 7/4 although the
    two headings are only a quarter apart. Shifting by 1, taking the remainder
    modulo 2 (one full circle) and shifting back yields the equivalent angle
    of smallest magnitude, with the original sign convention (entry minus
    exit) preserved. Opposite headings always map to -1. Missing headings
    propagate as NaN.
    """
    return (entry - exit_ + 1) % 2 - 1


X_train['diffHeading'] = _signed_turn(X_train['EntryHeading'], X_train['ExitHeading'])
X_test['diffHeading'] = _signed_turn(X_test['EntryHeading'], X_test['ExitHeading'])

# Show the distinct (exit, entry, difference) combinations as a sanity check.
X_train[['ExitHeading','EntryHeading','diffHeading']].drop_duplicates().head(20)

In [None]:
# Flag (1/0) rows whose entry and exit street names are exactly equal strings.
for frame in (X_train, X_test):
    street_unchanged = frame["EntryStreetName"].eq(frame["ExitStreetName"])
    frame["same_street_exact"] = street_unchanged.astype(int)

In [None]:
# Encoder that assigns an integer code to each distinct label.
le = preprocessing.LabelEncoder()
# Disabled alternative: a one-hot encoder that emits all zeros for novel categories.
# (Dropping the first level to avoid collinearity is not possible there because of the NaN issue.)
# le = preprocessing.OneHotEncoder(handle_unknown="ignore")

In [None]:
# Build a composite key: the IntersectionId rendered as text, immediately followed by the city name.
# NOTE(review): presumably IntersectionId values repeat across cities, hence the combined key — confirm.
for frame in (X_train, X_test):
    id_text = frame["IntersectionId"].astype(str)
    frame["Intersection"] = id_text + frame["City"]

# Spot-check six randomly drawn keys from the training table.
sample_keys = X_train["Intersection"].sample(6)
print(sample_keys.values)

In [None]:
# Distinct intersection keys occurring in either the training or the test table.
intersection_keys = pd.concat([X_train["Intersection"], X_test["Intersection"]], axis=0)
intersection_keys.drop_duplicates().values

In [None]:
# Fit the encoder on the union of train and test keys so that transforming
# either table never encounters a label the encoder has not seen.
combined_keys = pd.concat([X_train["Intersection"], X_test["Intersection"]])
le.fit(combined_keys.drop_duplicates().values)

# Overwrite the string keys with their integer codes in both tables.
for frame in (X_train, X_test):
    frame["Intersection"] = le.transform(frame["Intersection"])

In [None]:
# Preview the one-hot indicator columns that City expands into (all levels kept, no NaN column).
city_indicator_preview = pd.get_dummies(X_train["City"], dummy_na=False, drop_first=False)
city_indicator_preview.head()

In [None]:
# One-hot encode City using ONE shared set of indicator columns for both tables.
# Encoding each table independently yields only the cities present in that table,
# so the train and test columns could diverge and the later feature selection
# (which names the city columns explicitly) would fail with a KeyError.
city_levels = sorted(set(X_train["City"].dropna()) | set(X_test["City"].dropna()))


def _with_city_indicators(frame):
    """Return `frame` with City replaced by one indicator column per entry of `city_levels`.

    Cities that do not occur in `frame` still get a column, filled with 0.
    Missing City values produce all-zero indicator rows (no NaN column is added).
    """
    indicators = pd.get_dummies(frame["City"], dummy_na=False, drop_first=False)
    indicators = indicators.reindex(columns=city_levels, fill_value=0)
    return pd.concat([frame, indicators], axis=1).drop(["City"], axis=1)


X_train = _with_city_indicators(X_train)
X_test = _with_city_indicators(X_test)

In [None]:
# Column names after feature engineering (City is replaced by its indicator columns).
X_train.columns

In [None]:
# Columns fed to the model. The concatenation order below is the column order
# of the design matrix, so it must stay the same for training and prediction.
FEAT_COLS = (
    ["IntersectionId", "Intersection"]                    # raw id and encoded id+city key
    + ["diffHeading", "same_street_exact"]                # engineered turn / street features
    + ["Hour", "Weekend", "Month"]                        # time of observation
    + ["Latitude", "Longitude"]                           # location
    + ["EntryHeading", "ExitHeading"]                     # numeric headings
    + ["Atlanta", "Boston", "Chicago", "Philadelphia"]    # city indicator columns
)

In [None]:
# X = train[["IntersectionId","Hour","Weekend","Month",'en_E',
#        'en_N', 'en_NE', 'en_NW', 'en_S', 'en_SE', 'en_SW', 'en_W', 'ex_E',
#        'ex_N', 'ex_NE', 'ex_NW', 'ex_S', 'ex_SE', 'ex_SW', 'ex_W']]

# Names of the six regression targets, in the order y1..y6.
target_columns = (
    "TotalTimeStopped_p20",
    "TotalTimeStopped_p50",
    "TotalTimeStopped_p80",
    "DistanceToFirstStop_p20",
    "DistanceToFirstStop_p50",
    "DistanceToFirstStop_p80",
)

# Design matrix shared by all targets, followed by one target series per name above.
X = X_train[FEAT_COLS]
y1, y2, y3, y4, y5, y6 = (X_train[column] for column in target_columns)

In [None]:
# Test design matrix: the same columns, in the same order, as the training matrix.
X_test_final = X_test[FEAT_COLS]

In [None]:
## Kaggle kernel performance can be very unstable when trying to use multiprocessing,
## so n_jobs is not set here.
# NOTE: the name `lr` appears to be a leftover from the disabled linear-regression
# baseline below; the estimator actually used is a random forest.
# lr = LinearRegression()
lr = RandomForestRegressor(
    n_estimators=128,
    min_samples_split=3,  # different from the default hyperparams, not necessarily any better
    random_state=42,      # fixed seed so refitting reproduces the same forest and submission
)

In [None]:
# Refit the single shared estimator once per target, in order y1..y6, and keep
# the test-set predictions from each fit. `fit` returns the estimator itself,
# so the prediction can be chained directly onto it.
pred1, pred2, pred3, pred4, pred5, pred6 = [
    lr.fit(X, target).predict(X_test_final)
    for target in (y1, y2, y3, y4, y5, y6)
]

In [None]:
# Appending all predictions
all_preds = []
for i in range(len(pred1)):
    for j in [pred1,pred2,pred3,pred4,pred5,pred6]:
        all_preds.append(j[i])

In [None]:
# Load the sample submission, used as the template for the output file.
sub  = pd.read_csv('/kaggle/input/bigquery-geotab-intersection-congestion/sample_submission.csv')

In [None]:
# Positional assignment: all_preds must have one value per template row, in the template's row order.
sub["Target"] = all_preds

In [None]:
# Write the submission file to the working directory, without the index column.
sub.to_csv("submission.csv",index = False)