In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import folium

# Oversampling
from imblearn.over_sampling import SMOTE

# Machine learning libraries
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, make_scorer
from xgboost import XGBClassifier
from yellowbrick.cluster import KElbowVisualizer
from sklearn.mixture import GaussianMixture

import joblib
from datetime import datetime
import warnings

warnings.filterwarnings('ignore')


In [4]:
# Load the trips to be scored from the working directory and preview the
# first three rows as the cell output.
df_test = pd.read_csv(filepath_or_buffer='./test_set.csv')
df_test.head(n=3)

Unnamed: 0,trip_id,duration,start_time,end_time,start_lat,start_lon,end_lat,end_lon,bike_id,trip_route_category,start_station,end_station
0,17059130,12,1/1/2017 0:24,1/1/2017 0:36,34.058319,-118.246094,34.058319,-118.246094,6351,Round Trip,3028,3028
1,17059129,17,1/1/2017 0:28,1/1/2017 0:45,34.04998,-118.247162,34.043732,-118.260139,5836,One Way,3027,3018
2,17059126,20,1/1/2017 0:39,1/1/2017 0:59,34.063389,-118.23616,34.044159,-118.251579,6529,One Way,3066,3055


In [None]:
# Build the working frame from a copy so the raw `df_test` stays untouched.
# `assign` evaluates its keyword arguments in order, so the calendar features
# below are derived from the already-parsed `start_time` column.
df_test_copy = df_test.copy().assign(
    # Parse the timestamp strings into datetimes (replaces the columns in place).
    start_time=lambda frame: pd.to_datetime(frame['start_time']),
    end_time=lambda frame: pd.to_datetime(frame['end_time']),
    # Calendar features taken from the trip start time.
    year=lambda frame: frame['start_time'].dt.year,
    month=lambda frame: frame['start_time'].dt.month,
    day_of_week=lambda frame: frame['start_time'].dt.day_of_week,
    hour_of_day=lambda frame: frame['start_time'].dt.hour,
)

# --- Station coordinate cleaning -------------------------------------------
# All updates below assign back through `df_test_copy.loc[...]` /
# `df_test_copy[col] = ...` instead of `df[col].where(..., inplace=True)`.
# The chained in-place form mutates a temporary Series and does not write
# through to the frame once pandas Copy-on-Write is active.

# Hand-specified (lat, lon) for a few station ids.
# NOTE(review): presumably these stations have missing or unreliable
# coordinates in the raw data -- confirm against the training notebook.
station_coord_overrides = {
    4496: (33.972980, -118.423943),
    3039: (34.024479, -118.393867),
    3000: (34.025841, -118.238213),
    4285: (34.021756, -118.440741),
    4286: (34.011992, -118.453243),
}

for station_id, (fixed_lat, fixed_lon) in station_coord_overrides.items():
    for side in ('start', 'end'):
        at_station = df_test_copy[f'{side}_station'] == station_id
        df_test_copy.loc[at_station, f'{side}_lat'] = fixed_lat
        df_test_copy.loc[at_station, f'{side}_lon'] = fixed_lon

# Give every station one canonical coordinate: the minimum start_lat/start_lon
# observed among trips that START at that station. The same reference is then
# applied to both the start and the end side of each trip.
station_coords = (
    df_test_copy
    .groupby('start_station')[['start_lat', 'start_lon']]
    .min()
)

for side in ('start', 'end'):
    station = df_test_copy[f'{side}_station']
    # Stations that never occur as a start station have no reference value;
    # their coordinates are left exactly as they are.
    has_reference = station.isin(station_coords.index)
    for axis in ('lat', 'lon'):
        reference = station.map(station_coords[f'start_{axis}'])
        df_test_copy[f'{side}_{axis}'] = (
            df_test_copy[f'{side}_{axis}'].mask(has_reference, reference)
        )

# --- Duration / distance capping and binning --------------------------------
# Results are assigned back to the frame explicitly; the previous
# `df[col].where(..., inplace=True)` form updates a temporary Series and is
# not propagated to the frame under pandas Copy-on-Write.
# NOTE(review): the bounds (4, 59, 3.7) and the 20 cut-off presumably mirror
# the training-time preprocessing -- keep them in sync with it.

# Force duration into [4, 59]. `where` keeps values that satisfy the condition
# and replaces everything else, so a value that fails the comparison (NaN
# included) becomes the bound.
df_test_copy['duration'] = df_test_copy['duration'].where(df_test_copy['duration'] >= 4., 4)
df_test_copy['duration'] = df_test_copy['duration'].where(df_test_copy['duration'] <= 59., 59.)

# `assign` passes the whole frame to the callable, so the helper receives the
# four coordinate columns as Series (it is defined outside this cell).
df_test_copy = df_test_copy.assign(
    distance=lambda frame: dummy_manhattan_distance(
        frame['start_lat'], frame['start_lon'], frame['end_lat'], frame['end_lon']
    )
)

# Cap distance at 3.7 (same `where` semantics as above).
df_test_copy['distance'] = df_test_copy['distance'].where(df_test_copy['distance'] <= 3.7, 3.7)

# Categorical views of both quantities; `distance` is a binning helper
# defined outside this cell.
df_test_copy['distance_cat'] = df_test_copy['distance'].apply(distance)
df_test_copy['duration_cat'] = df_test_copy['duration'].apply(lambda x: 'little' if x <= 20 else 'much')

# --- Assemble the model input ------------------------------------------------
# Columns to one-hot encode, and identifier / raw columns the model does not see.
categories = ['trip_route_category',  'year', 'day_of_week' , 'distance_cat', 'duration_cat', 'month']
to_drop = ['trip_id', 'start_time', 'end_time', 'start_lat', 'start_lon', 'end_lat', 'end_lon', 'bike_id','start_station', 'end_station']

# Cast all categorical features in one call, then drop the unused columns.
df_test_copy = df_test_copy.astype({name: 'category' for name in categories})
df_model_test = df_test_copy.drop(columns=to_drop)

# Project the four raw coordinates with the already-fitted `pca_geo`
# (defined outside this cell) and attach the first four output columns.
coords = pca_geo.transform(
    df_test_copy[['start_lat', 'start_lon', 'end_lat', 'end_lon']]
)
for component in range(4):
    df_model_test[f'geo_{component + 1}'] = coords[:, component]

# NOTE(review): the dummy columns are derived from the categories present in
# THIS frame only; confirm they match the training matrix in number and order
# before trusting the predictions.
df_model_dummies_test = pd.get_dummies(df_model_test, drop_first=True)

# Plain array for the already-fitted `rfe` selector (defined outside this cell).
X_TEST = df_model_dummies_test.values
X_TEST_RFE = rfe.transform(X_TEST)

# --- Predict and write the submission ---------------------------------------
# `clf` and `le` are fitted objects defined outside this cell.
# NOTE(review): `le` is presumably the LabelEncoder fitted on the training
# target -- confirm.
y_predict = clf.predict(X_TEST_RFE)

# Pair each original trip id with its decoded (human-readable) class label.
df_submit = pd.DataFrame({
    'trip_id': df_test['trip_id'],
    'passholder_type': le.inverse_transform(y_predict)
})

df_submit.to_csv('./submit.csv', index = False)

# Kept as the last expression so the predicted class distribution is actually
# displayed; as a mid-cell expression its result was silently discarded.
df_submit['passholder_type'].value_counts()