In [4]:
# importing modules
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
plt.rc('figure', figsize=(14.0, 7.0))

# Data Wrangling and EDA

In [2]:
# reading the data
df_ult = pd.read_json('ultimate_data_challenge.json')
df_ult.head()

Unnamed: 0,city,trips_in_first_30_days,signup_date,avg_rating_of_driver,avg_surge,last_trip_date,phone,surge_pct,ultimate_black_user,weekday_pct,avg_dist,avg_rating_by_driver
0,King's Landing,4,2014-01-25,4.7,1.1,2014-06-17,iPhone,15.4,True,46.2,3.67,5.0
1,Astapor,0,2014-01-29,5.0,1.0,2014-05-05,Android,0.0,False,50.0,8.26,5.0
2,Astapor,3,2014-01-06,4.3,1.0,2014-01-07,iPhone,0.0,False,100.0,0.77,5.0
3,King's Landing,9,2014-01-10,4.6,1.14,2014-06-29,iPhone,20.0,True,80.0,2.36,4.9
4,Winterfell,14,2014-01-27,4.4,1.19,2014-03-15,Android,11.8,False,82.4,3.13,4.9


In [3]:
# converting dates to datetime objects
df_ult['signup_date'] = pd.to_datetime(df_ult['signup_date'])
df_ult['last_trip_date'] = pd.to_datetime(df_ult['last_trip_date'])

In [4]:
df_ult.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   city                    50000 non-null  object        
 1   trips_in_first_30_days  50000 non-null  int64         
 2   signup_date             50000 non-null  datetime64[ns]
 3   avg_rating_of_driver    41878 non-null  float64       
 4   avg_surge               50000 non-null  float64       
 5   last_trip_date          50000 non-null  datetime64[ns]
 6   phone                   49604 non-null  object        
 7   surge_pct               50000 non-null  float64       
 8   ultimate_black_user     50000 non-null  bool          
 9   weekday_pct             50000 non-null  float64       
 10  avg_dist                50000 non-null  float64       
 11  avg_rating_by_driver    49799 non-null  float64       
dtypes: bool(1), datetime64[ns](2), float64(6), int

In [5]:
df_ult.describe()

Unnamed: 0,trips_in_first_30_days,avg_rating_of_driver,avg_surge,surge_pct,weekday_pct,avg_dist,avg_rating_by_driver
count,50000.0,41878.0,50000.0,50000.0,50000.0,50000.0,49799.0
mean,2.2782,4.601559,1.074764,8.849536,60.926084,5.796827,4.778158
std,3.792684,0.617338,0.222336,19.958811,37.081503,5.707357,0.446652
min,0.0,1.0,1.0,0.0,0.0,0.0,1.0
25%,0.0,4.3,1.0,0.0,33.3,2.42,4.7
50%,1.0,4.9,1.0,0.0,66.7,3.88,5.0
75%,3.0,5.0,1.05,8.6,100.0,6.94,5.0
max,125.0,5.0,8.0,100.0,100.0,160.96,5.0


The user is considered retained if it was active during the preceding 30 days after several months since one signed up.

In [6]:
def retained_user(df):
    '''This function creates a timestamp 3 months from the signup date and checks if the user's last trip was withing 30 days
       prior to that date; if yes it outputs 1 and it outputs 0 otherwise to the retained_users column'''
    retained_users = []
    for i in range(len(df)):
        signup_time = pd.Timestamp(df_ult['signup_date'][i]) # signup date for a user
        end_time = signup_time + pd.Timedelta('90D') # adding 3 months to user's signup date
        retained_time = end_time - pd.Timedelta('30D') # setting the date 30 days prior to end_time
        if retained_time <= pd.Timestamp(df_ult['last_trip_date'][i]) <= end_time: #checkinf if user's last_trip_date falls withing his end_time and retained_time
            retained_users.append([1])
        else:
            retained_users.append([0])
    df_ret = pd.DataFrame(retained_users, columns = ['retained'])
    return df_ret
        

In [7]:
df_ret = retained_user(df_ult)

In [8]:
print(f'There were 4545 retained users out of {len(df_ret)} users which is {round(((4545/45455) * 100), 4)}%, or roughly {round(((4545/45455) * 100))}%')

There were 4545 retained users out of 50000 users which is 9.9989%, or roughly 10%


In [9]:
df = pd.merge(df_ult, df_ret, left_index=True, right_index=True)

In [10]:
df.head()

Unnamed: 0,city,trips_in_first_30_days,signup_date,avg_rating_of_driver,avg_surge,last_trip_date,phone,surge_pct,ultimate_black_user,weekday_pct,avg_dist,avg_rating_by_driver,retained
0,King's Landing,4,2014-01-25,4.7,1.1,2014-06-17,iPhone,15.4,True,46.2,3.67,5.0,0
1,Astapor,0,2014-01-29,5.0,1.0,2014-05-05,Android,0.0,False,50.0,8.26,5.0,0
2,Astapor,3,2014-01-06,4.3,1.0,2014-01-07,iPhone,0.0,False,100.0,0.77,5.0,0
3,King's Landing,9,2014-01-10,4.6,1.14,2014-06-29,iPhone,20.0,True,80.0,2.36,4.9,0
4,Winterfell,14,2014-01-27,4.4,1.19,2014-03-15,Android,11.8,False,82.4,3.13,4.9,0


In [11]:
df['retained'].value_counts()

0    45455
1     4545
Name: retained, dtype: int64

In [12]:
print(f'There were 4545 retained users out of 45455 users which is approximately {round((4545/45455)*100)}%')

There were 4545 retained users out of 45455 users which is approximately 10%


In [13]:
df['ultimate_black_user'].value_counts()

False    31146
True     18854
Name: ultimate_black_user, dtype: int64

In [14]:
df['ultimate_black_user'].dtype

dtype('bool')

# Preprocessing and Modeling

We want to predict if the user will be active on his/her 6th month on the program. This is a classification problem, so we will use several models. We will start with classic model for classification problems - Logistic Regression.

In [15]:
df['ultimate_black_user'] = df['ultimate_black_user'].astype(str).map({'True': 1, 'False': 0})

In [16]:
X = df.drop('retained', axis=1)
y = df['retained']
print(X.shape)
print(y.shape)

(50000, 12)
(50000,)


In [17]:
#creating dummy variables for categorical features
dummy = pd.get_dummies(X[['city', 'phone']])

In [18]:
#mergin original df with dummy df
X = X.merge(dummy, left_index=True, right_index=True)

In [19]:
#dropping redundant features and filling NaNs with 0
X.drop(['city', 'phone', 'signup_date', 'last_trip_date'], axis=1, inplace=True)
X.fillna(0, inplace=True)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [21]:
#scaling the data
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [22]:
y.value_counts()

0    45455
1     4545
Name: retained, dtype: int64

In [23]:
X

Unnamed: 0,trips_in_first_30_days,avg_rating_of_driver,avg_surge,surge_pct,ultimate_black_user,weekday_pct,avg_dist,avg_rating_by_driver,city_Astapor,city_King's Landing,city_Winterfell,phone_Android,phone_iPhone
0,4,4.7,1.10,15.4,1,46.2,3.67,5.0,0,1,0,0,1
1,0,5.0,1.00,0.0,0,50.0,8.26,5.0,1,0,0,1,0
2,3,4.3,1.00,0.0,0,100.0,0.77,5.0,1,0,0,0,1
3,9,4.6,1.14,20.0,1,80.0,2.36,4.9,0,1,0,0,1
4,14,4.4,1.19,11.8,0,82.4,3.13,4.9,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0,5.0,1.00,0.0,0,100.0,5.63,4.2,0,1,0,0,1
49996,1,0.0,1.00,0.0,0,0.0,0.00,4.0,1,0,0,0,1
49997,0,5.0,1.00,0.0,1,100.0,3.86,5.0,0,0,1,1,0
49998,2,3.0,1.00,0.0,0,100.0,4.58,3.5,1,0,0,0,1


## Logistic Regression

In [24]:
lrc_noparams = LogisticRegression()
lrc_noparams.fit(X_train, y_train)
lrc_noparams_train_ypred = lrc_noparams.predict(X_train)
lrc_noparams_test_ypred = lrc_noparams.predict(X_test)
print('TRAIN SPLIT ACCURACY: ', accuracy_score(y_train, lrc_noparams_train_ypred))
print('TEST SPLIT ACCURACY: ', accuracy_score(y_test, lrc_noparams_test_ypred))

TRAIN SPLIT ACCURACY:  0.9091
TEST SPLIT ACCURACY:  0.9091


In [25]:
print("=== TRAIN SPLIT Classification Report ===")
print(classification_report(y_train, lrc_noparams_train_ypred))
print("=== TEST SPLIT Classification Report ===")
print(classification_report(y_test, lrc_noparams_test_ypred))

=== TRAIN SPLIT Classification Report ===
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     36364
           1       0.00      0.00      0.00      3636

    accuracy                           0.91     40000
   macro avg       0.45      0.50      0.48     40000
weighted avg       0.83      0.91      0.87     40000

=== TEST SPLIT Classification Report ===
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      9091
           1       0.00      0.00      0.00       909

    accuracy                           0.91     10000
   macro avg       0.45      0.50      0.48     10000
weighted avg       0.83      0.91      0.87     10000



In [26]:
lrc_params = LogisticRegression()
C_params = [0.001, 0.01, 0.1, 1, 10, 100]
c = [c for c in C_params]
grid_params = {'C':c}

In [27]:
lrc_grid = GridSearchCV(lrc_params, param_grid = grid_params, cv=5, n_jobs=-1)
lrc_grid.fit(X_train, y_train)
lrc_grid.best_params_

{'C': 0.001}

In [28]:
lrc_params = LogisticRegression(C=0.001, random_state=42)
lrc_params.fit(X_train, y_train)
lrc_params_train_ypred = lrc_params.predict(X_train)
lrc_params_test_ypred = lrc_params.predict(X_test)

In [29]:
print('TRAIN SPLIT ACCURACY: ', accuracy_score(y_train, lrc_params_train_ypred))
print('TEST SPLIT ACCURACY: ', accuracy_score(y_test, lrc_params_test_ypred))

TRAIN SPLIT ACCURACY:  0.9091
TEST SPLIT ACCURACY:  0.9091


In [30]:
print("=== TRAIN SPLIT Classification Report ===")
print(classification_report(y_train, lrc_noparams_train_ypred))
print("=== TEST SPLIT Classification Report ===")
print(classification_report(y_test, lrc_noparams_test_ypred))

=== TRAIN SPLIT Classification Report ===
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     36364
           1       0.00      0.00      0.00      3636

    accuracy                           0.91     40000
   macro avg       0.45      0.50      0.48     40000
weighted avg       0.83      0.91      0.87     40000

=== TEST SPLIT Classification Report ===
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      9091
           1       0.00      0.00      0.00       909

    accuracy                           0.91     10000
   macro avg       0.45      0.50      0.48     10000
weighted avg       0.83      0.91      0.87     10000



## Random Forest Classifier

In [31]:
rfc = RandomForestClassifier(oob_score=True)
rfc.fit(X_train, y_train)
rfcy_pred_np = rfc.predict(X_test)
print('ACCURACY SCORE: ', accuracy_score(rfcy_pred_np, y_test))
print('Out-of-bag SCORE: ', rfc.oob_score_)

ACCURACY SCORE:  0.8873
Out-of-bag SCORE:  0.88765


In [32]:
print("=== Classification Report ===")
print(classification_report(y_test, rfcy_pred_np))

=== Classification Report ===
              precision    recall  f1-score   support

           0       0.91      0.97      0.94      9091
           1       0.21      0.09      0.13       909

    accuracy                           0.89     10000
   macro avg       0.56      0.53      0.53     10000
weighted avg       0.85      0.89      0.87     10000



In [33]:
#number of trees
n_estimators = [int(i) for i in np.linspace(200, 2000, 10)]

#number of features for each split
max_features = ['auto', 'sqrt']

#maximal depth
max_depth = [int(i) for i in np.linspace(100, 500, 11)]

#random grid
random_grid = {'n_estimators':n_estimators, 'max_features':max_features, 'max_depth':max_depth}

In [37]:
#randomized search
rfc_random = RandomizedSearchCV(estimator=rfc, param_distributions=random_grid, n_iter=100, cv=5, random_state=42, n_jobs=-1)

#fitting the model
rfc_random.fit(X_train, y_train)

print(rfc_random.best_params_)

{'n_estimators': 200, 'max_features': 'auto', 'max_depth': 340}


In [38]:
rfc_params = RandomForestClassifier(n_estimators=200, max_features='auto', max_depth=340, oob_score=True)
rfc_params.fit(X_train, y_train)
rfc_ypred_params = rfc_params.predict(X_test)
print('ACCURACY SCORE: ', accuracy_score(rfc_ypred_params, y_test))
print('Out-of-bag SCORE: ', rfc_params.oob_score_)

ACCURACY SCORE:  0.8872
Out-of-bag SCORE:  0.88755


In [39]:
print("=== RANDOM FOREST TEST SET Classification Report ===")
print(classification_report(y_test, rfc_ypred_params))

=== RANDOM FOREST TEST SET Classification Report ===
              precision    recall  f1-score   support

           0       0.91      0.97      0.94      9091
           1       0.21      0.09      0.12       909

    accuracy                           0.89     10000
   macro avg       0.56      0.53      0.53     10000
weighted avg       0.85      0.89      0.87     10000

