In [1]:
import warnings
warnings.filterwarnings('ignore')

import json
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier 
from sklearn import metrics   
from sklearn.preprocessing import LabelEncoder 

# Import Data

In [2]:
# Parse the raw challenge data from the JSON file in the working directory.
with open('ultimate_data_challenge.json') as json_file:
    data = json.load(json_file)

In [3]:
# Build the working DataFrame from the parsed JSON, report its row count,
# and preview the first rows.
df = pd.DataFrame(data)
print(df.shape[0])
df.head()

50000


Unnamed: 0,city,trips_in_first_30_days,signup_date,avg_rating_of_driver,avg_surge,last_trip_date,phone,surge_pct,ultimate_black_user,weekday_pct,avg_dist,avg_rating_by_driver
0,King's Landing,4,2014-01-25,4.7,1.1,2014-06-17,iPhone,15.4,True,46.2,3.67,5.0
1,Astapor,0,2014-01-29,5.0,1.0,2014-05-05,Android,0.0,False,50.0,8.26,5.0
2,Astapor,3,2014-01-06,4.3,1.0,2014-01-07,iPhone,0.0,False,100.0,0.77,5.0
3,King's Landing,9,2014-01-10,4.6,1.14,2014-06-29,iPhone,20.0,True,80.0,2.36,4.9
4,Winterfell,14,2014-01-27,4.4,1.19,2014-03-15,Android,11.8,False,82.4,3.13,4.9


In [4]:
# Show each column's dtype and non-null count to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city                    50000 non-null  object 
 1   trips_in_first_30_days  50000 non-null  int64  
 2   signup_date             50000 non-null  object 
 3   avg_rating_of_driver    41878 non-null  float64
 4   avg_surge               50000 non-null  float64
 5   last_trip_date          50000 non-null  object 
 6   phone                   49604 non-null  object 
 7   surge_pct               50000 non-null  float64
 8   ultimate_black_user     50000 non-null  bool   
 9   weekday_pct             50000 non-null  float64
 10  avg_dist                50000 non-null  float64
 11  avg_rating_by_driver    49799 non-null  float64
dtypes: bool(1), float64(6), int64(1), object(4)
memory usage: 4.2+ MB


In [5]:
# Summary statistics for the numeric columns (count, mean, std, quartiles).
df.describe()

Unnamed: 0,trips_in_first_30_days,avg_rating_of_driver,avg_surge,surge_pct,weekday_pct,avg_dist,avg_rating_by_driver
count,50000.0,41878.0,50000.0,50000.0,50000.0,50000.0,49799.0
mean,2.2782,4.601559,1.074764,8.849536,60.926084,5.796827,4.778158
std,3.792684,0.617338,0.222336,19.958811,37.081503,5.707357,0.446652
min,0.0,1.0,1.0,0.0,0.0,0.0,1.0
25%,0.0,4.3,1.0,0.0,33.3,2.42,4.7
50%,1.0,4.9,1.0,0.0,66.7,3.88,5.0
75%,3.0,5.0,1.05,8.6,100.0,6.94,5.0
max,125.0,5.0,8.0,100.0,100.0,160.96,5.0


# Data Cleaning

## Missing values: 
### avg_rating columns: fill missing values with the column mean

In [6]:
# Column means of the two rating columns, rounded to one decimal; these are
# the values used to fill the missing ratings.
mean_avg_rating_of_driver, mean_avg_rating_by_driver = (
    np.round(df[rating_column].mean(), 1)
    for rating_column in ('avg_rating_of_driver', 'avg_rating_by_driver')
)

In [7]:
# Fill missing ratings with the rounded column means.
# The filled Series is assigned back to the column instead of calling
# fillna(..., inplace=True) on `df[col]`: that form mutates an intermediate
# object, and under pandas Copy-on-Write it leaves `df` unchanged.
df['avg_rating_of_driver'] = df['avg_rating_of_driver'].fillna(mean_avg_rating_of_driver)
df['avg_rating_by_driver'] = df['avg_rating_by_driver'].fillna(mean_avg_rating_by_driver)

### phone: drop rows with a missing value

In [8]:
# Drop the rows whose phone is missing; every other column is kept as is.
# Note: the original row labels are preserved, so the index is no longer contiguous.
df = df.dropna(subset=['phone'])

## Data types casting

In [9]:
# Number of distinct cities -- checked before casting the column to a categorical dtype.
df['city'].nunique()

3

In [10]:
# Number of distinct phone types -- checked before casting the column to a categorical dtype.
df['phone'].nunique()

2

In [11]:
# Convert both date columns from strings to pandas datetimes.
for date_column in ('signup_date', 'last_trip_date'):
    df[date_column] = pd.to_datetime(df[date_column])

In [12]:
# Store the two low-cardinality text columns as pandas categoricals.
df = df.astype(dict.fromkeys(('city', 'phone'), 'category'))

In [13]:
# Re-check dtypes and non-null counts after cleaning and casting.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49604 entries, 0 to 49999
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   city                    49604 non-null  category      
 1   trips_in_first_30_days  49604 non-null  int64         
 2   signup_date             49604 non-null  datetime64[ns]
 3   avg_rating_of_driver    49604 non-null  float64       
 4   avg_surge               49604 non-null  float64       
 5   last_trip_date          49604 non-null  datetime64[ns]
 6   phone                   49604 non-null  category      
 7   surge_pct               49604 non-null  float64       
 8   ultimate_black_user     49604 non-null  bool          
 9   weekday_pct             49604 non-null  float64       
 10  avg_dist                49604 non-null  float64       
 11  avg_rating_by_driver    49604 non-null  float64       
dtypes: bool(1), category(2), datetime64[ns](2), fl

### -- Data Cleaned -- 

In [14]:
# Preview the cleaned data.
df.head()

Unnamed: 0,city,trips_in_first_30_days,signup_date,avg_rating_of_driver,avg_surge,last_trip_date,phone,surge_pct,ultimate_black_user,weekday_pct,avg_dist,avg_rating_by_driver
0,King's Landing,4,2014-01-25,4.7,1.1,2014-06-17,iPhone,15.4,True,46.2,3.67,5.0
1,Astapor,0,2014-01-29,5.0,1.0,2014-05-05,Android,0.0,False,50.0,8.26,5.0
2,Astapor,3,2014-01-06,4.3,1.0,2014-01-07,iPhone,0.0,False,100.0,0.77,5.0
3,King's Landing,9,2014-01-10,4.6,1.14,2014-06-29,iPhone,20.0,True,80.0,2.36,4.9
4,Winterfell,14,2014-01-27,4.4,1.19,2014-03-15,Android,11.8,False,82.4,3.13,4.9


# USER RETENTION

In [15]:
def getMonth(x):
    """Return the calendar month number of the timestamp ``x``."""
    return x.month

# Month of each user's last trip, in row order. The vectorised ``.dt.month``
# accessor replaces the earlier ``apply`` call, which was executed only for
# its side effect of appending to a global list and discarded its own result.
months = df['last_trip_date'].dt.month.tolist()

# Number of users whose last trip fell in each month.
pd.Series(months).value_counts()

6    18126
1    10017
5     7548
4     4568
3     4532
2     4268
7      545
dtype: int64

In [16]:
def checkIfUserRetained(last_trip_date, cutoff=pd.Timestamp('2014-06-01')):
    """Return True when the user's last trip is on or after ``cutoff``.

    Parameters
    ----------
    last_trip_date : pandas.Timestamp
        Date of the user's most recent trip.
    cutoff : pandas.Timestamp, optional
        First date that counts as "recently active". The default, 1 June 2014,
        gives exactly the same answer as the previous ``month > 5`` rule for
        every date in 2014, but unlike that rule it also takes the year into
        account (e.g. December 2013 is no longer treated as retained).
        NOTE(review): assumes the data snapshot ends around 2014-07-01 --
        confirm against ``df['last_trip_date'].max()``.

    Returns
    -------
    bool
        True if the user is considered retained, False otherwise
        (a missing date, ``NaT``, yields False).
    """
    return bool(last_trip_date >= cutoff)

In [17]:
# Flag each user as retained or not from their last trip date, then show
# how many users fall in each class.
df['user_retained'] = df['last_trip_date'].map(checkIfUserRetained)
df['user_retained'].value_counts()

False    30933
True     18671
Name: user_retained, dtype: int64

In [18]:
# Share of users flagged as retained, as a whole-number percentage.
# NOTE(review): int() truncates rather than rounds, so the fractional part
# of the percentage is dropped.
usersRetained = int(int(df['user_retained'].sum()) / len(df) * 100)
print('Percentage of Total users retained: {}%'.format(usersRetained))

Percentage of Total users retained: 37%


## Build a predictive model to determine what factors influence user retention

In [19]:
# Confirm the columns and dtypes available for modelling, including the new target column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49604 entries, 0 to 49999
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   city                    49604 non-null  category      
 1   trips_in_first_30_days  49604 non-null  int64         
 2   signup_date             49604 non-null  datetime64[ns]
 3   avg_rating_of_driver    49604 non-null  float64       
 4   avg_surge               49604 non-null  float64       
 5   last_trip_date          49604 non-null  datetime64[ns]
 6   phone                   49604 non-null  category      
 7   surge_pct               49604 non-null  float64       
 8   ultimate_black_user     49604 non-null  bool          
 9   weekday_pct             49604 non-null  float64       
 10  avg_dist                49604 non-null  float64       
 11  avg_rating_by_driver    49604 non-null  float64       
 12  user_retained           49604 non-null  bool  

In [20]:
# Preview the data with the user_retained target attached.
df.head()

Unnamed: 0,city,trips_in_first_30_days,signup_date,avg_rating_of_driver,avg_surge,last_trip_date,phone,surge_pct,ultimate_black_user,weekday_pct,avg_dist,avg_rating_by_driver,user_retained
0,King's Landing,4,2014-01-25,4.7,1.1,2014-06-17,iPhone,15.4,True,46.2,3.67,5.0,True
1,Astapor,0,2014-01-29,5.0,1.0,2014-05-05,Android,0.0,False,50.0,8.26,5.0,False
2,Astapor,3,2014-01-06,4.3,1.0,2014-01-07,iPhone,0.0,False,100.0,0.77,5.0,False
3,King's Landing,9,2014-01-10,4.6,1.14,2014-06-29,iPhone,20.0,True,80.0,2.36,4.9,True
4,Winterfell,14,2014-01-27,4.4,1.19,2014-03-15,Android,11.8,False,82.4,3.13,4.9,False


In [21]:
# Encode the two categorical columns as integer codes for the classifier.
# Each column gets its own encoder so that both label -> code mappings stay
# available afterwards (e.g. city_encoder.classes_); refitting a single
# encoder on the second column would overwrite the city mapping.
city_encoder = LabelEncoder()
phone_encoder = LabelEncoder()

df['city'] = city_encoder.fit_transform(df['city'])
df['phone'] = phone_encoder.fit_transform(df['phone'])

# Kept for backward compatibility: as before, `le` ends up fitted on `phone`.
le = phone_encoder

In [22]:
# Mark the city and phone columns as categoricals again.
df = df.astype(dict.fromkeys(('city', 'phone'), 'category'))

In [23]:
# Features: every column except the target and the two raw date columns.
X = df.drop(['user_retained', 'signup_date', 'last_trip_date'], axis=1)
# Target: kept as a one-column DataFrame (note the double brackets).
y = df.loc[:, ['user_retained']]

In [24]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
# Shapes of the four resulting pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((34722, 10), (14882, 10), (34722, 1), (14882, 1))

In [25]:
# Random forest on the retention target.
# random_state is fixed so the reported accuracy and the feature importances
# are reproducible across runs (the forest is otherwise randomised).
clf = RandomForestClassifier(n_estimators=500, max_depth=10, bootstrap=True, random_state=42)
# np.ravel flattens the target to 1-D: scikit-learn expects y of shape
# (n_samples,) and warns when handed a single-column frame.
clf.fit(X_train, np.ravel(y_train))
y_pred = clf.predict(X_test)
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))

ACCURACY OF THE MODEL:  0.7849751377503024


## Feature Importance

In [26]:
# One row per feature with the importance the fitted forest assigned to it,
# displayed from most to least important.
importance = pd.DataFrame(
    {'importance': clf.feature_importances_},
    index=X.columns,
)
importance.sort_values('importance', ascending=False)

Unnamed: 0,importance
avg_rating_by_driver,0.193163
surge_pct,0.142844
weekday_pct,0.133513
city,0.122669
avg_surge,0.089828
phone,0.085716
avg_dist,0.067341
ultimate_black_user,0.067224
trips_in_first_30_days,0.06502
avg_rating_of_driver,0.032682


#### Interestingly, the most important factor for retaining the user is 'avg_rating_by_driver'

### The next most important factors are 'surge_pct' and 'weekday_pct'

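If Ultimate can reduce the number of trips with a surge multiplier and encourage more riders to take rides on weekdays, it should be able to improve its user retention rate. Note that feature importances only show that these factors matter to the model, not the direction of their effect, so this recommendation should be confirmed by comparing these features between retained and non-retained users.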