In [58]:
from pathlib import Path

import numpy as np
import pandas as pd
# DATA_PATH = Path.cwd().parent / "ML project" 
pd.set_option("display.max_columns", 100)
from sklearn.metrics import f1_score

In [59]:
train_df = pd.read_csv(
     "./train.csv", 
    index_col="tripid"
)



train_df['label'] = train_df['label'].map({"correct":1, "incorrect":0})
train_df_cols = train_df.columns.values
li = list(train_df_cols)
features_df = train_df[li[0:-1]]
labels_df = train_df[li[-1]]

In [60]:
from math import sin, cos, sqrt, atan2, radians
def calculate_distance(**kwargs):
    R = 6373.0

    lat1 = radians(kwargs['pick_lat'])
    lon1 = radians(kwargs['pick_lon'])
    lat2 = radians(kwargs['drop_lat'])
    lon2 = radians(kwargs['drop_lon'])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return R * c

In [61]:
def format_df(dataFrame):
    
    df = dataFrame
    
    #calculatre the distance
    df['distance'] = [calculate_distance(**df[['pick_lat', 'pick_lon', 'drop_lat', 'drop_lon']].iloc[i].to_dict()) for i in range(df.shape[0])]
    
    #calculate the time difference between pickup and drop time in seconds
    df['time_difference'] = (pd.DatetimeIndex(df['drop_time']) - pd.DatetimeIndex(df['pickup_time'])).seconds
    
    #fill the missing values for the duration  
    df.loc[(pd.isnull(df.duration)), 'duration'] = df.time_difference

    #fill the missing values for additional_fare
    df.loc[(pd.isnull(df.additional_fare)), 'additional_fare'] = 10.5
    
    #fill the missing values for meter_waiting_till_pickup
    df.loc[(pd.isnull(df.meter_waiting_till_pickup)), 'meter_waiting_till_pickup'] = 60
    
    #fill the missing values for meter_waiting
    df.loc[(pd.isnull(df.meter_waiting)), 'meter_waiting'] = 120
    
    #fill the missing values for meter_waiting_fare
    df.loc[(pd.isnull(df.meter_waiting_fare)), 'meter_waiting_fare'] = df.meter_waiting * 1.75/60

    #fill the missing values for fare
    df.loc[(pd.isnull(df.fare)), 'fare'] = df.meter_waiting_fare + df.additional_fare + df.distance

    df = df.drop(['pickup_time','drop_time','pick_lat','pick_lon','drop_lat','drop_lon'], axis = 1) 
    


    
    return df

In [62]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_curve, roc_auc_score

RANDOM_SEED = 6  

In [64]:
format_df(train_df)

# numeric_cols = features_df.columns[features_df.dtypes != "object"].values


ValueError: cannot reindex from a duplicate axis

In [32]:
# chain preprocessing into a Pipeline object
# each step is a tuple of (name you chose, sklearn transformer)
numeric_preprocessing_steps = Pipeline([
    ('standard_scaler', StandardScaler()),
    ('simple_imputer', SimpleImputer(strategy='median'))
])

# create the preprocessor stage of final pipeline
# each entry in the transformer list is a tuple of
# (name you choose, sklearn transformer, list of columns)
preprocessor = ColumnTransformer(
    transformers = [
        ("numeric", numeric_preprocessing_steps, numeric_cols)
    ],
    remainder = "drop"
)

In [33]:
estimators = estimator=LogisticRegression(penalty="l2", C=1)

In [34]:
full_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", estimators),
])

In [35]:
full_pipeline
None

In [36]:
X_train, X_eval, y_train, y_eval = train_test_split(
    features_df,
    labels_df,
    test_size=0.33,
    shuffle=True,
    stratify=labels_df,
    random_state=RANDOM_SEED
)

In [54]:
%%time

# Train model
full_pipeline.fit(X_train, y_train)

# Predict on evaluation set
# This competition wants probabilities, not labels
preds = full_pipeline.predict(X_eval)
y_preds = pd.DataFrame(
    preds,
    index = y_eval.index
)
f1 = f1_score(y_eval, y_preds)
print(f1)

0.9501865671641792
Wall time: 316 ms


Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
205667210,10.5,4439.0,1875.0,108.416715,48.0,12/31/2019 22:03,12/31/2019 23:17,7.30277,80.6332,7.27153,80.6334,783.39
193277385,10.5,347.0,2.0,0.000000,118.0,11/19/2019 22:50,11/19/2019 22:55,6.84718,79.9267,6.83017,79.9200,122.44
201668211,10.5,1133.0,121.0,0.000000,128.0,12/18/2019 13:33,12/18/2019 13:52,6.91656,79.8715,6.95076,79.8751,208.80
204263911,,,,,,12/26/2019 14:30,12/26/2019 15:05,7.33119,80.6522,7.27138,80.6748,
206929715,10.5,1280.0,80.0,0.000000,138.0,1/4/2020 10:30,1/4/2020 10:51,6.79248,79.9241,6.84997,79.8958,270.14
...,...,...,...,...,...,...,...,...,...,...,...,...
194581383,10.5,375.0,6.0,0.354339,105.0,11/24/2019 14:44,11/24/2019 14:50,6.91486,79.9785,6.90898,79.9669,102.96
210262618,10.5,707.0,40.0,2.332000,56.0,1/16/2020 20:52,1/16/2020 21:03,6.88960,79.9039,6.89729,79.8926,170.37
193567198,10.5,2081.0,396.0,23.086800,109.0,11/21/2019 8:23,11/21/2019 8:57,6.82230,79.8672,6.89065,79.8743,349.20
195794965,10.5,3728.0,2412.0,139.413600,108.0,11/28/2019 21:13,11/28/2019 22:15,6.71234,79.9060,6.70685,79.9267,362.08


In [38]:
print("test_probas[0].shape", preds.shape)
#print("test_probas[1].shape", preds[1].shape)

test_probas[0].shape (5669,)


In [39]:
print("y_preds.shape:", y_preds.shape)
y_preds.head()

y_preds.shape: (5669, 1)


Unnamed: 0_level_0,0
tripid,Unnamed: 1_level_1
207281621,1
209841908,1
207398166,1
205322558,1
194991764,1


In [40]:
test_features = pd.read_csv("./test.csv", 
                               index_col="tripid")

In [41]:
features_df

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
189123628,10.5,834.0,56.0,0.0000,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.90330,79.8783,270.32
189125358,10.5,791.0,47.0,0.0000,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85
189125719,10.5,1087.0,80.0,0.0000,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64
189127273,10.5,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.92570,79.8895,6.92748,79.8971,82.30
189128020,,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.9290,358.39
...,...,...,...,...,...,...,...,...,...,...,...,...
213803193,10.5,838.0,93.0,5.4219,451.0,1/31/2020 22:07,1/31/2020 22:21,7.29073,80.6367,7.28891,80.6557,198.26
213812756,10.5,2151.0,428.0,0.0000,39.0,1/31/2020 23:07,1/31/2020 23:43,6.90569,79.8516,6.95089,79.9389,581.23
213813930,10.5,263.0,9.0,0.0000,110.0,1/31/2020 23:21,1/31/2020 23:25,7.09210,79.9000,7.10135,79.9017,76.20
213815405,10.5,858.0,115.0,0.0000,317.0,1/31/2020 23:39,1/31/2020 23:53,6.94540,79.8768,6.93574,79.9010,133.31


In [42]:
#test_features = test_features.drop(['tripid'], axis = 1) 
test_features

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
213284604,10.5,924,42,2.44860,148,2/1/2020 0:38,2/1/2020 0:53,6.83454,79.8750,6.77490,79.8840,289.27
213286352,10.5,4249,20,0.00000,91,2/1/2020 1:02,2/1/2020 2:13,6.91168,79.8723,6.55091,79.9706,1912.70
213293973,10.5,1552,255,2.65880,23,2/1/2020 5:02,2/1/2020 5:28,6.92145,79.8478,6.90539,79.8989,394.00
213294622,10.5,462,16,0.00000,198,2/1/2020 5:30,2/1/2020 5:38,6.77433,79.9416,6.80401,79.9407,154.32
213298687,10.5,814,392,12.36920,69,2/1/2020 7:00,2/1/2020 7:14,6.97968,79.9130,6.98875,79.8914,147.47
...,...,...,...,...,...,...,...,...,...,...,...,...
222856243,10.5,1723,429,24.83332,3,3/16/2020 21:28,3/16/2020 21:56,6.85103,79.9567,6.85588,79.9214,388.48
222857785,10.5,1378,80,0.00000,125,3/16/2020 21:59,3/16/2020 22:22,6.91293,79.9656,6.92112,79.8980,379.85
222858416,10.5,418,56,3.28440,93,3/16/2020 22:02,3/16/2020 22:09,6.85718,79.9081,6.83868,79.9083,112.79
222858691,10.5,1604,548,31.67440,17,3/16/2020 22:07,3/16/2020 22:34,6.91289,79.8846,6.93159,79.9145,248.46


In [43]:
test_probas = full_pipeline.predict(test_features)
test_preds = pd.DataFrame(
    {'prediction':test_probas},
    index = test_features.index
)
print(test_preds)

           prediction
tripid               
213284604           1
213286352           0
213293973           1
213294622           1
213298687           1
...               ...
222856243           1
222857785           1
222858416           1
222858691           1
222860703           1

[8576 rows x 1 columns]


In [44]:
test_preds.to_csv('./my_new_submission123.csv', index=True)

In [47]:
# submission_df.head()


In [48]:
# # Make sure we have the rows in the same order
# np.testing.assert_array_equal(test_features.index.values, submission_df.index.values)

# # Save predictions to submission data frame
# submission_df["prediction"] = test_probas[0][:, 1]

# submission_df.head()

In [49]:
# test_features.index.values

In [50]:
# submission_df.index.values

In [51]:
# submission_df.to_csv('my_new_submission.csv', index=True)


In [52]:
# submission_df