In [25]:
import sqlite3
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd

In [31]:
# open the connection
db = sqlite3.connect('data/mydb')
cur = db.cursor()

In [14]:
# select all the rows in the members dataset where the id is in the training dataset
# this will be my preliminary training set
cur.execute('''SELECT city, bd, gender, registered_via, registration_init_time, is_churn 
            FROM training 
            INNER JOIN members ON training.msno=members.msno''')
training = np.array(cur.fetchall())

In [20]:
X = training[:, :-1]
y = training[:, -1]

In [21]:
y.shape

(699492,)

In [22]:
X.shape

(699492, 5)

In [24]:
X


array([['13', '20', 'male', '3', '20131223'],
       ['13', '0', '', '9', '20140307'],
       ['1', '0', '', '3', '20140402'],
       ..., 
       ['13', '28', 'male', '7', '20110831'],
       ['1', '0', '', '7', '20110903'],
       ['1', '0', '', '7', '20110905']],
      dtype='<U21')

# Pipeline using cleaned data

In [27]:
store = pd.HDFStore('store.h5')

In [29]:
members = store['members']

In [36]:
training = pd.read_csv('data/churn_comp_refresh/train_v2.csv')

In [45]:
# select only the intersection between training and members tables
members_training = members.merge(training, on='msno', how='inner')

In [46]:
members_training.head()

Unnamed: 0,msno,age,age_is_null,gender_is_null,days_since_start_of_membership,city_1,city_3,city_4,city_5,city_6,...,city_21,city_22,gender_female,gender_male,registered_via_3,registered_via_4,registered_via_7,registered_via_9,registered_via_13,is_churn
0,+tJonkh+O1CA796Fm5X60UMOtB6POHAwPjbTRVl/EuU=,-1,0,1,2056,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,I0yFvqMoNkM8ZNHb617e1RBzIS/YRKemHO7Wj13EtA0=,63,0,0,2052,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
2,OoDwiKZM+ZGr9P3fRivavgOtglTEaNfWJO4KaJcTTts=,-1,0,1,2052,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
3,4De1jAxNRABoyRBDZ82U0yEmzYkqeOugRGVNIf92Xb8=,28,0,0,2050,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0
4,Z6WIOK9vXy+e2XDBiioNAxuZ0ScXSU/Ebq4tUwqVSrE=,38,0,0,2041,0,0,0,0,0,...,0,1,1,0,0,0,0,1,0,0


In [68]:
X = np.array(members_training.iloc[:, 1:-1])
X_columns = members_training.iloc[:, 1:-1].columns

In [59]:
y = np.array(members_training.iloc[:, -1])

In [84]:
# what is the base rate for churn?
print('The null error rate is {:.2f}%'.format(y.mean()*100))

The null error rate is 1.87%


In [88]:
sum(y)/len(y)

0.018726447193105854

with such a low base rate, we could predict accurately 98% the time by always predicting 'does not churn'

In [91]:
print('base rate in the training set: {:.2f}%'.format(sum(training['is_churn'])/len(training)*100))


base rate in the training set: 8.99%


when we excluded all the training rows that aren't in members, we massively decreased the 
base rate in the training set

In [66]:
# instantiate a logistic regression model, and fit with X and y
model = LogisticRegression()
model = model.fit(X, y)

# check the accuracy on the training set
model.score(X, y)

0.9812735528068941

In [73]:
# examine the coefficients
pd.DataFrame(list(zip(X_columns, np.transpose(model.coef_))))

Unnamed: 0,0,1
0,age,[-0.0175070902362]
1,age_is_null,[0.0]
2,gender_is_null,[-1.07870299079]
3,days_since_start_of_membership,[-8.51685166857e-05]
4,city_1,[0.00163271292506]
5,city_3,[-0.178555545608]
6,city_4,[-0.132966572812]
7,city_5,[-0.159712304749]
8,city_6,[-0.0977003244307]
9,city_7,[0.0885354030088]


### Model evaluation

In [74]:
# evaluate the model by splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
model2 = LogisticRegression()
model2.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [76]:
# predict class labels for the test set
predicted = model2.predict(X_test)
print(predicted)

[0 0 0 ..., 0 0 0]


In [77]:
# as I suspected, the model is just learning to predict 'not churn' every time.
any(predicted)

False