Rachapudi
March 31 2020

Below is an implementation of a logistic regression model to predict the likelihood of the outcome variable being equal to 1. 

Estimated AUC: 0.826

In [97]:
import pandas as pd 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score



In [77]:
# read in data and view first 10 rows
df = pd.read_csv('data/train.csv')
df.head(10)

Unnamed: 0,age,cost_of_ad,device_type,gender,in_initial_launch_location,income,n_drivers,n_vehicles,prior_ins_tenure,outcome
0,56,0.005737,iPhone,M,0,62717,2,1,4,0
1,50,0.004733,desktop,F,0,64328,2,3,2,0
2,54,0.004129,laptop,M,0,83439,1,3,7,0
3,16,0.005117,Android,F,0,30110,2,3,0,0
4,37,0.003635,desktop,M,0,76565,2,1,5,0
5,18,0.004757,other,F,1,41700,1,3,1,0
6,20,0.00458,Android,F,0,26619,1,3,0,0
7,45,0.005152,Android,F,1,62496,2,1,10,0
8,59,0.004969,laptop,F,0,89294,1,3,5,0
9,56,0.003961,desktop,M,0,76673,2,3,1,0


In [78]:
# get size of data
df.shape

(10000, 10)

In [79]:
# get column names
list(df.columns)

['age',
 'cost_of_ad',
 'device_type',
 'gender',
 'in_initial_launch_location',
 'income',
 'n_drivers',
 'n_vehicles',
 'prior_ins_tenure',
 'outcome']

In [80]:
# get types of variables
df.dtypes

age                             int64
cost_of_ad                    float64
device_type                    object
gender                         object
in_initial_launch_location      int64
income                          int64
n_drivers                       int64
n_vehicles                      int64
prior_ins_tenure                int64
outcome                         int64
dtype: object

In [81]:
# unpack categorical variable 'device_type'
df['device_type'].unique()

array(['iPhone', 'desktop', 'laptop', 'Android', 'other'], dtype=object)

In [82]:
# unpack categorical variable 'gender'
df['gender'].unique()

array(['M', 'F', nan], dtype=object)

In [83]:
# create dummy variables from categorical 
subset = df.drop(columns = ['gender', 'device_type'])
genders = pd.get_dummies(df['gender'])
devices = pd.get_dummies(df['device_type'], prefix="device")

In [84]:
# add dummy-coded columns to data
df2 = subset.join(genders.loc[:,:])
data = df2.join(devices.loc[:,:])

In [85]:
data.head(10)

Unnamed: 0,age,cost_of_ad,in_initial_launch_location,income,n_drivers,n_vehicles,prior_ins_tenure,outcome,F,M,device_Android,device_desktop,device_iPhone,device_laptop,device_other
0,56,0.005737,0,62717,2,1,4,0,0,1,0,0,1,0,0
1,50,0.004733,0,64328,2,3,2,0,1,0,0,1,0,0,0
2,54,0.004129,0,83439,1,3,7,0,0,1,0,0,0,1,0
3,16,0.005117,0,30110,2,3,0,0,1,0,1,0,0,0,0
4,37,0.003635,0,76565,2,1,5,0,0,1,0,1,0,0,0
5,18,0.004757,1,41700,1,3,1,0,1,0,0,0,0,0,1
6,20,0.00458,0,26619,1,3,0,0,1,0,1,0,0,0,0
7,45,0.005152,1,62496,2,1,10,0,1,0,1,0,0,0,0
8,59,0.004969,0,89294,1,3,5,0,1,0,0,0,0,1,0
9,56,0.003961,0,76673,2,3,1,0,0,1,0,1,0,0,0


In [86]:
data['outcome'].value_counts()

0    9018
1     982
Name: outcome, dtype: int64

In [87]:
# create target and features 
x_temp = data
x_temp = x_temp.drop('outcome', axis=1)
x_temp.head(10)
y = data['outcome']

In [88]:
# Normalize data
scaler = StandardScaler().fit(x_temp)
X = scaler.transform(x_temp)

  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until


In [89]:
# Split data into training set and testing set 
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [90]:
X_train.size

105000

In [91]:
# Fit logistic regression model with training data
model = LogisticRegression(solver='lbfgs')
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [92]:
model.coef_

array([[-0.18596123,  0.7104512 ,  0.4726943 ,  0.16699067,  0.41406644,
        -0.84505246, -0.13717804, -0.24898451,  0.73372451,  0.41058407,
         0.43046507, -0.40585868, -0.1710202 , -0.27195306]])

In [93]:
# Standardize X test data and use to predict outcomes
scaled_test = scaler.transform(X_test)
y_pred = model.predict(scaled_test)

In [94]:
# Create confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)

[[1159 1098]
 [ 168   75]]


In [95]:
# Output accuracy on test data
model.score(X_test, y_test)

0.9108

In [96]:
# Area under curve statistic
roc_auc_score(y_test, model.predict_proba(X_test)[:,1])

0.826378290859165

In [65]:
test_df = pd.read_csv('data/test.csv')
test_df.head(10)

Unnamed: 0,age,cost_of_ad,device_type,gender,in_initial_launch_location,income,n_drivers,n_vehicles,prior_ins_tenure
0,34,0.005134,Android,F,1,40376,1,3,7
1,53,0.005223,desktop,F,1,84511,1,1,11
2,46,0.004939,laptop,F,0,79322,1,1,4
3,36,0.004924,Android,F,0,63295,1,2,0
4,28,0.005146,other,F,1,36170,1,3,3
5,51,0.006242,iPhone,F,0,60520,1,1,14
6,20,0.003534,desktop,M,0,59324,1,1,0
7,35,0.004568,Android,F,0,37002,2,3,5
8,32,0.004713,Android,F,1,45207,1,2,7
9,33,0.006178,iPhone,F,0,72587,1,2,6


In [23]:
# create dummy variables from categorical 
subset = test_df.drop(columns = ['gender', 'device_type'])
genders = pd.get_dummies(test_df['gender'])
devices = pd.get_dummies(test_df['device_type'], prefix="device")

In [24]:
# add dummy-coded columns to data
df2 = subset.join(genders.loc[:,:])
test_data = df2.join(devices.loc[:,:])

In [25]:
predicted = model.predict(test_data)
predicted[:5, ]

array([1, 1, 1, 1, 1])