In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.tseries.offsets import DateOffset
import statsmodels.discrete.discrete_model as sm

%matplotlib inline

# Loading Data

Load the churn training data into a pandas DataFrame and generate the target variable.

In [None]:
# Read the churn training data; the path is relative to the working directory.
df = pd.read_csv("data/churn_train.csv")

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Parse the last-trip column into datetimes so it supports date arithmetic and comparisons.
last_trip_parsed = pd.to_datetime(df['last_trip_date'])
df['last_trip_date'] = last_trip_parsed

In [None]:
# Use the latest trip date in the data as the reference "current" date.
# Series.max() skips NaT; the Python builtin max() compares element by element and can
# return NaT when a missing date is encountered first (every comparison with NaT is False).
current_date = df['last_trip_date'].max()

In [None]:
# Cutoff date: one calendar month before the reference date.
one_month = DateOffset(months=1)
one_month_back = current_date - one_month

In [None]:
# Target variable: True when the user's last trip is strictly earlier than the cutoff date.
last_trip_before_cutoff = df['last_trip_date'] < one_month_back
df["label"] = last_trip_before_cutoff

# EDA

Generate initial plots of the data.

In [None]:
# Preview the frame again, now with parsed trip dates and the label column.
df.head()

In [None]:
# Bar chart of the label distribution (class balance of the target).
# Pass the column by keyword: recent seaborn releases no longer accept the plotted
# variable as the first positional argument. The trailing ";" hides the Axes repr.
sns.countplot(x="label", data=df);

In [None]:
# Counts of True vs. False labels.
df["label"].value_counts()

In [None]:
# Categorical columns whose level frequencies are inspected during EDA.
categories = [
    "city",
    "phone",
]

In [None]:
# Print the frequency of every level for each categorical column.
# print() with a single argument behaves the same on Python 2 and Python 3; the bare
# `print x` statement is a SyntaxError on Python 3.
for col in categories:
    print(df[col].value_counts())

In [None]:
# Scatter plot with a fitted regression line of surge_pct against trips_in_first_30_days,
# coloured by label. x, y and data are passed by keyword because recent seaborn releases
# reject them as positional arguments. The trailing ";" hides the FacetGrid repr.
sns.lmplot(x="trips_in_first_30_days", y="surge_pct", data=df, hue="label");

Handle null values of the average rating of driver (a dummy indicator variable for these nulls is sketched below but currently commented out). Look at the spread of these ratings.

In [None]:
# Missing-value count per column, before any values are filled in.
df.isnull().sum()

In [None]:
# Boolean masks over avg_rating_of_driver: `null` flags missing ratings and
# `not_null` is its element-wise complement.
null = df.avg_rating_of_driver.isnull()
not_null = ~null

In [None]:
# A missing-value indicator column was sketched here but is currently disabled:
#df['avg_rating_of_driver_imputed'] = df.avg_rating_of_driver.isnull()

# Mode and median of the observed (non-missing) avg_rating_of_driver values.
observed_of_driver = df.loc[not_null, 'avg_rating_of_driver']
mode = observed_of_driver.mode()
median = observed_of_driver.median()

In [None]:
# Fill the missing avg_rating_of_driver values with 1.0, using the `null` mask built above.
# Note: 1.0 is below the ">= 4" cutoff used by rating() further down, so these rows end up
# in the "low" bucket.
df.loc[null,'avg_rating_of_driver'] = 1.0

In [None]:
# Boolean masks over avg_rating_by_driver: missing values and their complement.
null_by_driver = df.avg_rating_by_driver.isnull()
not_null_by_driver = ~null_by_driver

# A missing-value indicator column was sketched here but is currently disabled:
#df['avg_rating_by_driver_imputed'] = df.avg_rating_by_driver.isnull()

# Mode and median of the observed avg_rating_by_driver values. They must be selected with
# this column's own mask; `not_null` belongs to avg_rating_of_driver and picks the wrong rows.
observed_by_driver = df.loc[not_null_by_driver, 'avg_rating_by_driver']
mode1 = observed_by_driver.mode()
median1 = observed_by_driver.median()

# Fill the missing values with 1.0 (below the ">= 4" cutoff, so rating() buckets them as "low").
df.loc[null_by_driver,'avg_rating_by_driver'] = 1.0


In [None]:
# Boolean masks over the phone column: missing values and their complement.
null_phone = df.phone.isnull()
not_null_phone = ~null_phone

# A missing-value indicator column was sketched here but is currently disabled:
#df['avg_rating_phone_imputed'] = df.phone.isnull()

# Users with no recorded phone get their own category.
df.loc[null_phone, 'phone'] = 'other'

In [None]:
# Re-check the per-column missing-value counts after filling the rating and phone columns.
df.isnull().sum()

Given the spread of the data, it is clear that any rating below 4 out of 5 is low. Update both average-rating variables (of driver and by driver) to a simple high/low indicator.

In [None]:
def rating(i):
    """Bucket a numeric rating: "high" when it is at least 4, otherwise "low".

    Values for which ``i >= 4`` is false (including NaN) fall into "low".
    """
    return "high" if i >= 4 else "low"
            

In [None]:
# Replace both average-rating columns with their "high"/"low" bucket.
# rating() compares each value against 4, so the columns are expected to still be numeric here.
for rating_col in ("avg_rating_of_driver", "avg_rating_by_driver"):
    df[rating_col] = df[rating_col].apply(rating)

Engineer a 'time_as_user' feature: the number of days between a user's signup date and the one-month-back cutoff date.

In [None]:
# Parse the signup column into datetimes so it can be subtracted from the cutoff date.
df['signup_date'] = pd.to_datetime(df['signup_date'])

In [None]:
# Whole days between each user's signup date and the one-month-back cutoff.
signup_to_cutoff = one_month_back - df['signup_date']
df['time_as_user'] = signup_to_cutoff.dt.days

In [None]:
def boolean_int(i):
    """Convert a flag to 0/1: values that compare equal to False give 0, everything else 1."""
    return 0 if i == False else 1

Engineer an indicator variable for whether or not a given user requested a luxury car

In [None]:
# Recode the luxury_car_user flag as 0/1 integers via boolean_int().
luxury_as_int = df["luxury_car_user"].apply(boolean_int)
df["luxury_car_user"] = luxury_as_int

In [None]:
# Work on an independent (deep) copy so the dummy columns added below do not touch df.
df_1 = df.copy(deep=True)

Create dummy variables for each of the categorical variables above (city, phone, and the two high/low rating indicators).

In [None]:
# Columns to one-hot encode; this rebinds the shorter `categories` list used during EDA.
categories = [
    "city",
    "phone",
    "avg_rating_of_driver",
    "avg_rating_by_driver",
]

In [None]:
# One-hot encode every column listed in `categories` (first level dropped as the baseline)
# and append the indicator columns to df_1; the source columns are kept.
# The prefix already ends in "_" and get_dummies adds its own "_" separator, so the new
# columns are named "<column>__<level>" with a double underscore.
encoded_frames = [
    pd.get_dummies(df_1[col], drop_first=True, prefix=str(col) + "_")
    for col in categories
]
df_1 = pd.concat([df_1] + encoded_frames, axis=1)
    

In [None]:
# Preview the frame with the dummy columns appended.
df_1.head()

In [None]:
# Snapshot the encoded frame under its final name; a deep copy keeps it independent of df_1.
result_df = df_1.copy(deep=True)

In [None]:
# List the column names of the final frame.
result_df.columns