In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Models from Scikit-learn
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor

# Model evaluations
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [2]:
# Import training data
df = pd.read_csv("sigma_cabs.csv", low_memory=False)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131662 entries, 0 to 131661
Data columns (total 14 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Trip_ID                      131662 non-null  object 
 1   Trip_Distance                131662 non-null  float64
 2   Type_of_Cab                  111452 non-null  object 
 3   Customer_Since_Months        125742 non-null  float64
 4   Life_Style_Index             111469 non-null  float64
 5   Confidence_Life_Style_Index  111469 non-null  object 
 6   Destination_Type             131662 non-null  object 
 7   Customer_Rating              131662 non-null  float64
 8   Cancellation_Last_1Month     131662 non-null  int64  
 9   Var1                         60632 non-null   float64
 10  Var2                         131662 non-null  int64  
 11  Var3                         131662 non-null  int64  
 12  Gender                       131662 non-null  object 
 13 

In [4]:
df.isna().sum()

Trip_ID                            0
Trip_Distance                      0
Type_of_Cab                    20210
Customer_Since_Months           5920
Life_Style_Index               20193
Confidence_Life_Style_Index    20193
Destination_Type                   0
Customer_Rating                    0
Cancellation_Last_1Month           0
Var1                           71030
Var2                               0
Var3                               0
Gender                             0
Surge_Pricing_Type                 0
dtype: int64

In [5]:
# Find the columns which contains string
for label, content in df.items():
    if pd.api.types.is_string_dtype(content):
        print(label)

Trip_ID
Type_of_Cab
Confidence_Life_Style_Index
Destination_Type
Gender


In [6]:
# This will change all the string columns to categorical values
for label, content in df.items():
    if pd.api.types.is_string_dtype(content):
        df[label] = content.astype("category").cat.as_ordered()

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131662 entries, 0 to 131661
Data columns (total 14 columns):
 #   Column                       Non-Null Count   Dtype   
---  ------                       --------------   -----   
 0   Trip_ID                      131662 non-null  category
 1   Trip_Distance                131662 non-null  float64 
 2   Type_of_Cab                  111452 non-null  category
 3   Customer_Since_Months        125742 non-null  float64 
 4   Life_Style_Index             111469 non-null  float64 
 5   Confidence_Life_Style_Index  111469 non-null  category
 6   Destination_Type             131662 non-null  category
 7   Customer_Rating              131662 non-null  float64 
 8   Cancellation_Last_1Month     131662 non-null  int64   
 9   Var1                         60632 non-null   float64 
 10  Var2                         131662 non-null  int64   
 11  Var3                         131662 non-null  int64   
 12  Gender                       131662 non-null

In [8]:
(df.isnull().sum()/len(df))*100 # To see in percentage

Trip_ID                         0.000000
Trip_Distance                   0.000000
Type_of_Cab                    15.349911
Customer_Since_Months           4.496362
Life_Style_Index               15.336999
Confidence_Life_Style_Index    15.336999
Destination_Type                0.000000
Customer_Rating                 0.000000
Cancellation_Last_1Month        0.000000
Var1                           53.948748
Var2                            0.000000
Var3                            0.000000
Gender                          0.000000
Surge_Pricing_Type              0.000000
dtype: float64

In [9]:
# Export current temp dataframe
df.to_csv("train_tmp.csv", index=False)

In [10]:
# Import preprocessed data
df = pd.read_csv("train_tmp.csv", low_memory=False)

In [11]:
df.head()

Unnamed: 0,Trip_ID,Trip_Distance,Type_of_Cab,Customer_Since_Months,Life_Style_Index,Confidence_Life_Style_Index,Destination_Type,Customer_Rating,Cancellation_Last_1Month,Var1,Var2,Var3,Gender,Surge_Pricing_Type
0,T0005689460,6.77,B,1.0,2.42769,A,A,3.905,0,40.0,46,60,Female,2
1,T0005689461,29.47,B,10.0,2.78245,B,A,3.45,0,38.0,56,78,Male,2
2,T0005689464,41.58,,10.0,,,E,3.50125,2,,56,77,Male,2
3,T0005689465,61.56,C,10.0,,,A,3.45375,0,,52,74,Male,3
4,T0005689467,54.95,C,10.0,3.03453,B,A,3.4025,4,51.0,49,102,Male,2


In [12]:
df.isna().sum()

Trip_ID                            0
Trip_Distance                      0
Type_of_Cab                    20210
Customer_Since_Months           5920
Life_Style_Index               20193
Confidence_Life_Style_Index    20193
Destination_Type                   0
Customer_Rating                    0
Cancellation_Last_1Month           0
Var1                           71030
Var2                               0
Var3                               0
Gender                             0
Surge_Pricing_Type                 0
dtype: int64

## Fill missing values

### Fill numeric missing value first (Original numeric values)

In [13]:
for label, content in df.items():
    if pd.api.types.is_numeric_dtype(content):
        print(label)

Trip_Distance
Customer_Since_Months
Life_Style_Index
Customer_Rating
Cancellation_Last_1Month
Var1
Var2
Var3
Surge_Pricing_Type


In [14]:
# Check for which numeric values there is NULL values
for label, content in df.items():
    if pd.api.types.is_numeric_dtype(content)&(pd.isna(content).sum()!=0):
        print(label)

Customer_Since_Months
Life_Style_Index
Var1


In [15]:
# Now lets fill both of these columns with median values
for label, content in df.items():
    if pd.api.types.is_numeric_dtype(content)&(pd.isna(content).sum()!=0):
        # Add a binary column for that feature having null values (If the data was missing)
        df[label+"_is_missing"] = pd.isnull(content) # For future it will only show if the column was missing or not by TRUE or False
        #will give TRUE if NULL
        # Fill missing numeric value with median
        df[label] = content.fillna(content.median()) #median is mnore robust than the mean 

In [16]:
# Demonstrating how median is more robust than mean
hundreds = np.full((1000),100)
hundreds_billion = np.append(hundreds, 1000000000)

np.mean(hundreds), np.mean(hundreds_billion),   np.median(hundreds), np.median(hundreds_billion)

(100.0, 999100.8991008991, 100.0, 100.0)

In [17]:
# Lets check if now is there any NULL numeric value
for label, content in df.items():
    if pd.api.types.is_numeric_dtype(content)&(pd.isna(content).sum()!=0):
        print(label)

In [18]:
# Let's check how many examples were missing
df.Customer_Since_Months.value_counts()

10.0    42680
6.0     13295
2.0     11621
3.0     10351
0.0     10169
5.0      8641
1.0      8297
4.0      7726
7.0      7407
8.0      6328
9.0      5147
Name: Customer_Since_Months, dtype: int64

In [19]:
df.isna().sum()

Trip_ID                                 0
Trip_Distance                           0
Type_of_Cab                         20210
Customer_Since_Months                   0
Life_Style_Index                        0
Confidence_Life_Style_Index         20193
Destination_Type                        0
Customer_Rating                         0
Cancellation_Last_1Month                0
Var1                                    0
Var2                                    0
Var3                                    0
Gender                                  0
Surge_Pricing_Type                      0
Customer_Since_Months_is_missing        0
Life_Style_Index_is_missing             0
Var1_is_missing                         0
dtype: int64

In [20]:
# Now lets fill the rest of the missing values (ie categorical we converted)
# so now lets find columns that are not numeric data (Categorical)
for label, content in df.items():
    if not pd.api.types.is_numeric_dtype(content):
        print(label)

Trip_ID
Type_of_Cab
Confidence_Life_Style_Index
Destination_Type
Gender


In [21]:
# Turn categorical variables into numbers and fill missing
for label, content in df.items():
    if not pd.api.types.is_numeric_dtype(content):
        # Add binary column to indicate whether sample had missing value
        df[label+"_is_missing"] = pd.isnull(content)
        # Turn categories into numbers and add +1
        # Adding 1 as when there was a missing value its code is -1 so to make it 0 shown in abov cell
        df[label] = pd.Categorical(content).codes+1

In [22]:
df.isna().sum()

Trip_ID                                   0
Trip_Distance                             0
Type_of_Cab                               0
Customer_Since_Months                     0
Life_Style_Index                          0
Confidence_Life_Style_Index               0
Destination_Type                          0
Customer_Rating                           0
Cancellation_Last_1Month                  0
Var1                                      0
Var2                                      0
Var3                                      0
Gender                                    0
Surge_Pricing_Type                        0
Customer_Since_Months_is_missing          0
Life_Style_Index_is_missing               0
Var1_is_missing                           0
Trip_ID_is_missing                        0
Type_of_Cab_is_missing                    0
Confidence_Life_Style_Index_is_missing    0
Destination_Type_is_missing               0
Gender_is_missing                         0
dtype: int64

### Splitting Data into Train and Validation sets

In [23]:
X = df.drop("Surge_Pricing_Type", axis=1)
y = df["Surge_Pricing_Type"]

In [24]:
#Split into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## So there is no missing values as well as all our data is NUMERIC now we can build a machine learning model

In [25]:
%%time    
# Put Models in a dictionary
models = {"SGDRegressor": SGDRegressor(max_iter=1000, tol=1e-3),
          "Support Vector Machine": SVR(),
          "Random Forest": RandomForestRegressor()}

# Create a function to fit and score models
def fit_and_score(models, X_train, X_test, y_train, y_test):
    """
    Fits and evaluates given machine learning models.
    models : a dict of different Scikit-Learn machine learning models
    X_train : training data (no labels)
    X_test : testing data (no labels)
    y_train : training labels
    y_test : test labels
    """
    
    # Set Random seed
    np.random.seed(42)
    
    # Make a dictionary to keep the model score
    model_score = {}
    
    # Loop through the models
    for name,model in models.items():
        # Fit the model to the data
        model.fit(X_train, y_train)
        # Evaluate the model and apend its score to the model_score
        model_score[name] = model.score(X_test, y_test)
    return model_score

Wall time: 0 ns


In [26]:
model_scores = fit_and_score(models, X_train, X_test, y_train, y_test)

model_scores

{'SGDRegressor': -5.930150392923666e+33,
 'Support Vector Machine': -0.005079598633456195,
 'Random Forest': 0.3986540210782651}

#### As Random Forrest model has the highest R2 value we will use it for hyperparameter tunning

In [27]:
model = RandomForestRegressor(random_state=42)

## Building an evaluation function

In [28]:
# Create evaluation function (It uses RMSLE)
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, r2_score

# We need root mean squared log error (RMSLE) so make a function
def rmsle(y_test, y_preds):
    """
    Calculate Root mean squared log error between predictions and tru labels.
    """
    return np.sqrt(mean_squared_log_error(y_test, y_preds))

# Calculate function to evaluate model on different levels
def show_scores(model):
    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)
    scores = {"Training MAE": mean_absolute_error(y_train, train_preds),
              "Test MAE": mean_absolute_error(y_test, test_preds),
              "Training RMSLE": rmsle(y_train, train_preds),
              "Test RMSLE": rmsle(y_test, test_preds),
              "Training R^2": r2_score(y_train, train_preds),   # or use model.score()
              "Test R^2": r2_score(y_test, test_preds),
              "ACCURACY on training data": model.score(X_train, y_train)}
    
    return scores

In [29]:
model.fit(X_train, y_train)
show_scores(model)

{'Training MAE': 0.1621035991987012,
 'Test MAE': 0.4407207686173243,
 'Training RMSLE': 0.07737441022266309,
 'Test RMSLE': 0.1970501253714191,
 'Training R^2': 0.9153118376660967,
 'Test R^2': 0.399021936587889,
 'ACCURACY on training data': 0.9153118376660967}

As our model have good trainning R2 value but low test R2 value so its a case of overfitting we will try to do hyperparameter tunning

## Hyperparameters tunning with RandomizedSearchCV

we're going to tune:
* Random Forest classifier model() 

using RandomizedSearchCV

Now we've got hyperparameter grids setup for each of our model. let's tune them using RandomizedSearchCV

In [30]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [31]:
np.random.seed(42)

# Setup random hyperparameter search for logistic regression
rs_rf = RandomizedSearchCV(RandomForestRegressor(),
                            param_distributions=random_grid,
                            cv=2,
                            n_iter=5,
                            verbose=True)

# Fit random hyperparameter search for logistic regression
rs_rf.fit(X_train, y_train)

Fitting 2 folds for each of 5 candidates, totalling 10 fits


RandomizedSearchCV(cv=2, estimator=RandomForestRegressor(), n_iter=5,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   verbose=True)

In [32]:
rs_rf.best_params_

{'n_estimators': 600,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 90,
 'bootstrap': False}

In [33]:
# Evaluate the Random Forest
rs_rf.score(X_test, y_test)

0.4128670915058853

In [37]:
model = RandomForestRegressor(n_estimators= 600,
 min_samples_split= 10,
 min_samples_leaf= 4,
 max_features= 'sqrt',
 max_depth= 90,
 bootstrap= False)

model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=False, max_depth=90, max_features='sqrt',
                      min_samples_leaf=4, min_samples_split=10,
                      n_estimators=600)

In [40]:
y_pred = model.predict(X_test)
y_pred, y_test

(array([2.76122993, 1.88704604, 2.38506851, ..., 2.20968938, 2.83313827,
        1.78865175]),
 116985    3
 76043     2
 33613     2
 75756     3
 106084    3
          ..
 72863     3
 84066     2
 118523    2
 116536    2
 116537    2
 Name: Surge_Pricing_Type, Length: 26333, dtype: int64)