In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
from sklearn import preprocessing

from math import radians, cos, sin, asin, sqrt

from datetime import datetime

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [5]:
pd.options.display.float_format = "{:.18f}".format

In [7]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.98215484619139,40.76793670654297,-73.96463012695312,40.765602111816406,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.98041534423825,40.738563537597656,-73.99948120117188,40.731151580810554,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.97902679443358,40.763938903808594,-74.00533294677734,40.710086822509766,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004028320312,40.719970703125,-74.01226806640625,40.70671844482422,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.97305297851561,40.79320907592773,-73.9729232788086,40.782520294189446,N,435


In [8]:
# quitar valores vacíos
df.dropna(inplace=True)

In [9]:
mean, std_deviation = np.mean(df['trip_duration']), np.std(df['trip_duration'])
df = df[df['trip_duration'] <= mean + 2 * std_deviation]
df = df[df['trip_duration'] >= mean - 2 * std_deviation]

In [10]:
# crear variables más cortas para las columnas con nombres largos
plg, plt = 'pickup_longitude', 'pickup_latitude'
dlg, dlt = 'dropoff_longitude', 'dropoff_latitude'
pdt, ddt = 'pickup_datetime', 'dropoff_datetime'

In [11]:
# función para calcular la distancia desde la recogida hasta la llegada
def haversine(lon1, lat1, lon2, lat2):
    # convertir grados decimal a radianes 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # formula haversine 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    # radio de la tierra en kilometros es 6371
    km = 6371* c
    return km

def euclidian_distance(x):
    x1, y1 = np.float64(x[plg]), np.float64(x[plt])
    x2, y2 = np.float64(x[dlg]), np.float64(x[dlt])    
    return haversine(x1, y1, x2, y2)

In [12]:
# crear una columna con la distancia calculada desde la recogida hasta la llegada
df['distance'] = df[[plg, plt, dlg, dlt]].apply(euclidian_distance, axis=1)
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,distance
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.98215484619139,40.76793670654297,-73.96463012695312,40.765602111816406,N,455,1.4985207796458555
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.98041534423825,40.738563537597656,-73.99948120117188,40.731151580810554,N,663,1.8055071687968436
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.97902679443358,40.763938903808594,-74.00533294677734,40.710086822509766,N,2124,6.385098495252205
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004028320312,40.719970703125,-74.01226806640625,40.70671844482422,N,429,1.4854984227709382
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.97305297851561,40.79320907592773,-73.9729232788086,40.782520294189446,N,435,1.1885884593331777


In [13]:
# convertir de string a datetime
df[pdt] = df[pdt].apply(lambda x : datetime.strptime(x, "%Y-%m-%d %H:%M:%S"))
df[ddt] = df[ddt].apply(lambda x : datetime.strptime(x, "%Y-%m-%d %H:%M:%S"))

In [14]:
# crear columnas a partir de la hora de recogida
df['month'] = df[pdt].apply(lambda x : x.month)
df['weekDay'] = df[pdt].apply(lambda x : x.weekday())
df['dayMonth'] = df[pdt].apply(lambda x : x.day)
df['pickupTimeMinutes'] = df[pdt].apply(lambda x : x.hour * 60.0 + x.minute)
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,distance,month,weekDay,dayMonth,pickupTimeMinutes
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.98215484619139,40.76793670654297,-73.96463012695312,40.765602111816406,N,455,1.4985207796458555,3,0,14,1044.0
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.98041534423825,40.738563537597656,-73.99948120117188,40.731151580810554,N,663,1.8055071687968436,6,6,12,43.0
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.97902679443358,40.763938903808594,-74.00533294677734,40.710086822509766,N,2124,6.385098495252205,1,1,19,695.0
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004028320312,40.719970703125,-74.01226806640625,40.70671844482422,N,429,1.4854984227709382,4,2,6,1172.0
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.97305297851561,40.79320907592773,-73.9729232788086,40.782520294189446,N,435,1.1885884593331777,3,5,26,810.0


In [15]:
# eliminar columnas innecesarias
df.drop(['id', pdt, ddt, dlg, dlt, 'store_and_fwd_flag'], inplace=True, axis=1)
df.head()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,trip_duration,distance,month,weekDay,dayMonth,pickupTimeMinutes
0,2,1,-73.98215484619139,40.76793670654297,455,1.4985207796458555,3,0,14,1044.0
1,1,1,-73.98041534423825,40.738563537597656,663,1.8055071687968436,6,6,12,43.0
2,2,1,-73.97902679443358,40.763938903808594,2124,6.385098495252205,1,1,19,695.0
3,2,1,-74.01004028320312,40.719970703125,429,1.4854984227709382,4,2,6,1172.0
4,2,1,-73.97305297851561,40.79320907592773,435,1.1885884593331777,3,5,26,810.0


In [16]:
# reordenar columnas
df = df[[plg, plt, 'distance', 'month', 'dayMonth', 'weekDay', 'pickupTimeMinutes', 'passenger_count', 'vendor_id', 'trip_duration']]
df.head()

Unnamed: 0,pickup_longitude,pickup_latitude,distance,month,dayMonth,weekDay,pickupTimeMinutes,passenger_count,vendor_id,trip_duration
0,-73.98215484619139,40.76793670654297,1.4985207796458555,3,14,0,1044.0,1,2,455
1,-73.98041534423825,40.738563537597656,1.8055071687968436,6,12,6,43.0,1,1,663
2,-73.97902679443358,40.763938903808594,6.385098495252205,1,19,1,695.0,1,2,2124
3,-74.01004028320312,40.719970703125,1.4854984227709382,4,6,2,1172.0,1,2,429
4,-73.97305297851561,40.79320907592773,1.1885884593331777,3,26,5,810.0,1,2,435


In [17]:
# data de entrenamiento
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [18]:
# normalizar input
scaler = preprocessing.StandardScaler().fit(X)
X = scaler.transform(X)

  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until


In [19]:
# Kfold para dividir la data de entrenamiento y la de test
kf = KFold(n_splits=3, shuffle=True, random_state=4305)

In [20]:
# crear modelos de Perceptrón multicapa
mlp = MLPRegressor()
param_grid = {'hidden_layer_sizes': [(9, 9), (8, 8), (7, 7), (6, 6), (5, 5), (4, 4), (3, 3), (9, 9, 9), (8, 8, 8), (7, 7, 7), (6, 6, 6), (5, 5, 5), (4, 4, 4), (3, 3, 3)],
              'activation': ['relu'],
              'solver': ['adam'],
              'nesterovs_momentum' : [True],
              'momentum' : [0.9, 0.8, 0.7, 0.6, 0.5],                          
              'learning_rate_init': [0.001, 0.01, 0.1, 0.3, 0.5, 1, 2],              
              'learning_rate' : ['adaptive'],
              'max_iter': [1000],
              'early_stopping': [False],
              'warm_start': [True]
             }

In [21]:
# crear una función para calcular el error
def rmsle(y_true, y_pred):
    assert len(y_true) == len(y_pred)
    return np.sqrt(np.mean(np.square(np.subtract(np.log1p(y_true), np.log1p(y_pred)))))

In [22]:
# crear SearchCV
search_cv = RandomizedSearchCV(mlp, param_grid, scoring=make_scorer(rmsle, greater_is_better=False),
                   cv=kf, verbose=3, pre_dispatch='2*n_jobs')

In [None]:
search_cv.fit(X, y)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] warm_start=True, solver=adam, nesterovs_momentum=True, momentum=0.5, max_iter=1000, learning_rate_init=2, learning_rate=adaptive, hidden_layer_sizes=(9, 9), early_stopping=False, activation=relu 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  warm_start=True, solver=adam, nesterovs_momentum=True, momentum=0.5, max_iter=1000, learning_rate_init=2, learning_rate=adaptive, hidden_layer_sizes=(9, 9), early_stopping=False, activation=relu, score=-0.8201186739977941, total=  22.1s
[CV] warm_start=True, solver=adam, nesterovs_momentum=True, momentum=0.5, max_iter=1000, learning_rate_init=2, learning_rate=adaptive, hidden_layer_sizes=(9, 9), early_stopping=False, activation=relu 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   22.4s remaining:    0.0s


[CV]  warm_start=True, solver=adam, nesterovs_momentum=True, momentum=0.5, max_iter=1000, learning_rate_init=2, learning_rate=adaptive, hidden_layer_sizes=(9, 9), early_stopping=False, activation=relu, score=-0.8242772938494425, total=  29.7s
[CV] warm_start=True, solver=adam, nesterovs_momentum=True, momentum=0.5, max_iter=1000, learning_rate_init=2, learning_rate=adaptive, hidden_layer_sizes=(9, 9), early_stopping=False, activation=relu 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   52.4s remaining:    0.0s


[CV]  warm_start=True, solver=adam, nesterovs_momentum=True, momentum=0.5, max_iter=1000, learning_rate_init=2, learning_rate=adaptive, hidden_layer_sizes=(9, 9), early_stopping=False, activation=relu, score=-0.8230644638794341, total=  44.3s
[CV] warm_start=True, solver=adam, nesterovs_momentum=True, momentum=0.8, max_iter=1000, learning_rate_init=0.5, learning_rate=adaptive, hidden_layer_sizes=(7, 7), early_stopping=False, activation=relu 
[CV]  warm_start=True, solver=adam, nesterovs_momentum=True, momentum=0.8, max_iter=1000, learning_rate_init=0.5, learning_rate=adaptive, hidden_layer_sizes=(7, 7), early_stopping=False, activation=relu, score=-0.5049055811175778, total= 1.9min
[CV] warm_start=True, solver=adam, nesterovs_momentum=True, momentum=0.8, max_iter=1000, learning_rate_init=0.5, learning_rate=adaptive, hidden_layer_sizes=(7, 7), early_stopping=False, activation=relu 
[CV]  warm_start=True, solver=adam, nesterovs_momentum=True, momentum=0.8, max_iter=1000, learning_rate_in

  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


[CV]  warm_start=True, solver=adam, nesterovs_momentum=True, momentum=0.9, max_iter=1000, learning_rate_init=0.1, learning_rate=adaptive, hidden_layer_sizes=(4, 4), early_stopping=False, activation=relu, score=-0.4924720883131459, total=  28.1s
[CV] warm_start=True, solver=adam, nesterovs_momentum=True, momentum=0.9, max_iter=1000, learning_rate_init=0.1, learning_rate=adaptive, hidden_layer_sizes=(4, 4), early_stopping=False, activation=relu 
[CV]  warm_start=True, solver=adam, nesterovs_momentum=True, momentum=0.9, max_iter=1000, learning_rate_init=0.1, learning_rate=adaptive, hidden_layer_sizes=(4, 4), early_stopping=False, activation=relu, score=-0.4956480468375843, total=  39.4s
[CV] warm_start=True, solver=adam, nesterovs_momentum=True, momentum=0.6, max_iter=1000, learning_rate_init=0.1, learning_rate=adaptive, hidden_layer_sizes=(5, 5), early_stopping=False, activation=relu 
[CV]  warm_start=True, solver=adam, nesterovs_momentum=True, momentum=0.6, max_iter=1000, learning_rate_

In [None]:
# ver Top5 modelos
results = pd.DataFrame(search_cv.cv_results_).sort_values(by='mean_test_score', ascending=False)
results.head()

In [None]:
# mejor modelo
search_cv.best_estimator_

In [None]:
# probar en data de test
df_test = pd.read_csv('test.csv')
df_test.head()

In [None]:
# preprocesar la data de test
df_test['distance'] = df_test[[plg, plt, dlg, dlt]].apply(euclidian_distance, axis=1)
df_test[pdt] = df_test[pdt].apply(lambda x : datetime.strptime(x, "%Y-%m-%d %H:%M:%S"))
df_test['month'] = df_test[pdt].apply(lambda x : x.month)
df_test['weekDay'] = df_test[pdt].apply(lambda x : x.weekday())
df_test['dayMonth'] = df_test[pdt].apply(lambda x : x.day)
df_test['pickupTimeMinutes'] = df_test[pdt].apply(lambda x : x.hour * 60.0 + x.minute)
df_test.drop(['pickup_datetime', dlg, dlt, 'store_and_fwd_flag'], inplace=True, axis=1)
df_test = df_test[['id', plg, plt, 'distance', 'month', 'dayMonth', 'weekDay', 'pickupTimeMinutes', 'passenger_count', 'vendor_id']]
df_test.head()

In [None]:
# split test data
X_id, X_test = df_test.iloc[:, 0], df_test.iloc[:, 1:]
X_id.shape, X_test.shape

In [None]:
# normalizar el input
scaler = preprocessing.StandardScaler().fit(X_test)
X_test = scaler.transform(X_test)

In [None]:
# predecir
y_pred = search_cv.best_estimator_.predict(X_test)
df_output = pd.DataFrame({'id' : X_id, 'trip_duration': y_pred})
df_output