In [1]:
import numpy as np
import random
import keras.backend as K
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
import joblib
%matplotlib widget

In [2]:
def set_seed(seed: int = 42) -> None:
    '''
    Sets all random seeds for the program (Python, NumPy) in order to get reproducible results.
    Args:
        seed: an int for the random seed
    '''
    if not isinstance(seed, int):
        raise ValueError(
            'Expected `seed` argument to be an integer. '
            f'Received: seed={seed} (of type {type(seed)})')
    random.seed(seed)
    np.random.seed(seed)
    print(f"Random seed set as {seed}")

In [3]:
set_seed(42)

Random seed set as 42


In [4]:
df_cab = pd.read_csv('cab_rides.csv').dropna()
df_cab['time_stamp'] = pd.to_datetime(df_cab['time_stamp'], unit='ms')

In [5]:
df_cab = df_cab.sort_values('time_stamp')

In [6]:
df_weather = pd.read_csv('weather.csv')
df_weather['time_stamp'] = pd.to_datetime(df_weather['time_stamp'], unit='s')
df_weather['rain'] = df_weather['rain'].fillna(0)
df_weather = df_weather.sort_values('time_stamp')

In [7]:
df = pd.merge_asof(df_cab, df_weather, on='time_stamp')

In [8]:
df.head()

Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,product_id,name,temp,location,clouds,pressure,rain,humidity,wind
0,3.03,Lyft,2018-11-26 03:40:46.318,Theatre District,Boston University,34.0,1.0,ef4771c2-c88d-4730-aaf7-a95751e9d27e,lyft_luxsuv,Lux Black XL,41.02,North End,0.87,1014.4,0.0,0.92,1.59
1,1.3,Uber,2018-11-26 03:40:46.319,Theatre District,South Station,18.5,1.0,00ea74ea-2c49-416c-bfc5-f7877025f6eb,6c84fd89-3f11-4782-9b50-97c468b19529,Black,41.02,North End,0.87,1014.4,0.0,0.92,1.59
2,2.43,Lyft,2018-11-26 03:40:46.320,Beacon Hill,Northeastern University,10.5,1.0,edfc7f44-97e1-48cd-930c-e4fe20e88ac8,lyft,Lyft,41.02,North End,0.87,1014.4,0.0,0.92,1.59
3,2.71,Uber,2018-11-26 03:40:46.320,Fenway,Theatre District,32.0,1.0,6172077a-22de-481b-aae2-b5763c87a6c4,6f72dfc5-27f1-42e8-84db-ccc7a75f6969,UberXL,41.02,North End,0.87,1014.4,0.0,0.92,1.59
4,2.71,Uber,2018-11-26 03:40:46.320,Fenway,Theatre District,19.5,1.0,8682f9bf-5cc0-4dfc-b8fe-4e22070d1684,55c66225-fbe7-4fd5-9072-eab1ece5e23e,UberX,41.02,North End,0.87,1014.4,0.0,0.92,1.59


In [9]:
features = ['cab_type', 'name', 'temp', 'clouds', 'rain', 'humidity', 'wind', 'time_stamp', 'distance']
df_X = df[features].copy()
df_X['hour'] = df_X['time_stamp'].dt.hour
df_X = df_X[(df.name == 'Lyft') | (df.name == 'UberX')].drop(['cab_type', 'name', 'time_stamp'], axis=1)
df_y = df[(df.name == 'Lyft') | (df.name == 'UberX')]['surge_multiplier']

In [10]:
df_X.head()

Unnamed: 0,temp,clouds,rain,humidity,wind,distance,hour
2,41.02,0.87,0.0,0.92,1.59,2.43,3
4,41.02,0.87,0.0,0.92,1.59,2.71,3
7,41.02,0.87,0.0,0.92,1.59,2.19,3
14,41.02,0.87,0.0,0.92,1.59,2.3,3
15,41.02,0.87,0.0,0.92,1.59,0.56,3


In [11]:
df_y.head()

2     1.0
4     1.0
7     1.0
14    1.0
15    1.0
Name: surge_multiplier, dtype: float64

In [12]:
df_y.value_counts()

1.00    102134
1.25      2217
1.50      1013
1.75       484
2.00       398
2.50        77
3.00         6
Name: surge_multiplier, dtype: int64

In [13]:
# clip surge multiplier at 2.5
df_y.loc[df_y == 3.0] = 2.5
# label mapping for XGBoost
mapping = {item: i for i, item in enumerate(df_y.unique())}
df_y = df_y.map(mapping)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_y.loc[df_y == 3.0] = 2.5


In [14]:
X_train, X_test, y_train, y_test = train_test_split(df_X.to_numpy(), 
                                                    df_y.to_numpy(), random_state=42,
                                                    test_size=0.2)

X_val, X_test, y_val, y_test = train_test_split(X_test, 
                                                y_test, random_state=42,
                                                test_size=0.2)

In [15]:
def z_standardize(x, mu, sigma):
    return (x - mu) / sigma

mu = X_train.mean(axis=0).reshape(1, -1)
sigma = X_train.std(axis=0).reshape(1, -1)

X_train = z_standardize(X_train, mu=mu, sigma=sigma)
X_val = z_standardize(X_val, mu=mu, sigma=sigma)
X_test = z_standardize(X_test, mu=mu, sigma=sigma)

In [16]:
scales = np.vstack([mu, sigma])

In [17]:
np.save('scales.npy', scales)

In [18]:
X_train.shape, y_train.shape

((85063, 7), (85063,))

In [19]:
X_val.shape, y_val.shape

((17012, 7), (17012,))

In [20]:
X_test.shape, y_test.shape

((4254, 7), (4254,))

In [21]:
clf = RandomForestClassifier(n_jobs=-1, random_state=42, class_weight='balanced')

clf.fit(X_train, y_train)

In [22]:
accuracy_score(y_test, clf.predict(X_test))

0.925716972261401

In [23]:
f1_score(y_test, clf.predict(X_test), average='weighted')

0.9275277662953022

In [24]:
joblib.dump(clf, 'random_forest.pkl', compress=9)

['random_forest.pkl']

In [25]:
model = joblib.load('random_forest.pkl')