In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, train_test_split, cross_validate, learning_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc, classification_report

%matplotlib inline

In [1]:
!pip install pandas numpy seaborn sklearn matplotlib



In [None]:
df = pd.read_csv('data/churn.csv', index_col=0)
df.head()

In [None]:
labels = df.churn
features = df.drop('churn', axis=1)
features.head(20)

In [None]:
drop_columns = ['host_response_time', 
                'calendar_updated',
                'days_since_last_review']

In [None]:
features.drop(drop_columns, axis=1).info()

In [None]:
pd.get_dummies(features.drop(drop_columns, axis=1))

In [None]:
feature_matrix = pd.get_dummies(features.drop(drop_columns, axis=1)).values
feature_matrix

In [None]:
# Hold out 30% of the rows for testing. A fixed random_state makes the split
# (and every score computed from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix, labels, test_size=0.3, random_state=42
)

In [None]:
# Number of rows in the training and test partitions, one per line
for split in (X_train, X_test):
    print(len(split))

In [None]:
# Create the estimator object: logistic regression using the liblinear solver
model = LogisticRegression(solver='liblinear')
model

In [None]:
# Inspect the estimator's parameters
model.get_params()

In [None]:
# Fit the model to the training split
model.fit(X_train, y_train)

In [None]:
# Coefficients learned by the fitted model
model.coef_

In [None]:
# make predictions
predictions = model.predict(X_test)
predictions

In [None]:
model.predict_proba(X_test)

In [None]:
# evaluate model
accuracy = (predictions == y_test).sum() / len(y_test)
accuracy

In [None]:
model.score(X_test, y_test)

In [None]:
df.churn.value_counts()

## Class Imbalance

In [None]:
# Balance the classes by drawing the same number of rows from each one.
# The per-class size is derived from the data (size of the smaller class) instead of
# being hard-coded, and both draws are seeded so the subsample is reproducible.
# NOTE(review): the previous hard-coded size was 32575, presumably the smaller class
# count when the notebook was written; confirm that this is the intended size.
is_churned = df.churn == True
not_churned = df.churn == False
n_per_class = int(min(is_churned.sum(), not_churned.sum()))
down_sample = df[not_churned].sample(n_per_class, random_state=42)
sub_sample = pd.concat([down_sample, df[is_churned].sample(n_per_class, random_state=42)])

In [None]:
sub_sample.churn.value_counts()

In [None]:
labels = sub_sample.churn
drop_columns = ['host_response_time', 
                'calendar_updated',
                'days_since_last_review', 'churn']
features = sub_sample.drop(drop_columns, axis=1)
feature_matrix = pd.get_dummies(features).values

In [None]:
# Re-split using the balanced feature matrix (30% held out). A fixed random_state
# keeps the split, and all downstream model comparisons, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix, labels, test_size=0.3, random_state=42
)

In [None]:
# create model (estimator) object
clf = LogisticRegression(penalty='l1', solver='liblinear')

# fit model to training data
clf.fit(X_train, y_train)

# make predictions
predictions = clf.predict(X_test)

# evaluate model
accuracy = (predictions == y_test).sum() / len(y_test)
print(accuracy)

In [None]:
clf.score(X_test, y_test)

In [None]:
# Compare the reported iteration count with the cap before claiming convergence;
# reaching max_iter means the solver stopped because of the limit.
n_iter = clf.n_iter_[0]
if n_iter < clf.max_iter:
    print(f"Training took {n_iter} iterations, since this is less than {clf.max_iter} it has converged")
else:
    print(f"Training took {n_iter} iterations and reached the max_iter limit of {clf.max_iter}; it may not have converged")

## k-fold Cross Validation

In [None]:
# 5-fold cross-validation of the current classifier on the training split
scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=5)

In [None]:
# IPython introspection: a trailing `??` displays the object's source (development aid)
cross_val_score??

In [None]:
scores

In [None]:
scores.mean()

In [None]:
scores.var()

In [None]:
np.array([0.2, .2, .9, .8, .9, .5, .8, .7, .7, .7]).mean()

In [None]:
np.array([0.2, .2, .9, .8, .9, .5, .8, .7, .7, .7]).var()

In [None]:
clf.score(X_test, y_test)

In [None]:
# k-nearest-neighbours classifier with default hyper-parameters.
# fit() returns the estimator itself, so `clf` and `model` name the same object.
model = KNeighborsClassifier()
model.fit(X_train, y_train)
clf = model
clf.score(X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
clf = model.fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
clf.feature_importances_

In [None]:
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as a colour-mapped image on the current figure.

    Parameters
    ----------
    cm : 2-D array-like
        Matrix to display; callers in this notebook pass the output of
        ``confusion_matrix(y_test, y_pred)``.
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colormap used to shade the cells.

    The function draws with pyplot on the current figure and returns nothing.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Adjust the layout only after the axis labels exist, so they are taken into account
    plt.tight_layout()

In [None]:
# Random forest; random_state is fixed so the confusion matrix is reproducible
model = RandomForestClassifier(random_state=42)
clf = model.fit(X_train, y_train)
# Show the test accuracy; a bare expression in the middle of a cell is never displayed
print(f'Test accuracy: {clf.score(X_test, y_test):.3f}')

y_pred = clf.predict(X_test)
y_score = clf.predict_proba(X_test)

# Compute confusion matrix (true labels first, predictions second)
cm = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
print('Confusion matrix, without normalization')
print(cm)
plt.figure(figsize=(5, 5))
plot_confusion_matrix(cm, title='Confusion matrix (random forest)')

In [None]:
# k-nearest-neighbours classifier with default hyper-parameters
model = KNeighborsClassifier()
clf = model.fit(X_train, y_train)
# Show the test accuracy; a bare expression in the middle of a cell is never displayed
print(f'Test accuracy: {clf.score(X_test, y_test):.3f}')

# y_pred and y_score are inspected in the cells that follow
y_pred = clf.predict(X_test)
y_score = clf.predict_proba(X_test)

# Compute confusion matrix (true labels first, predictions second)
cm = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
print('Confusion matrix, without normalization')
print(cm)
plt.figure(figsize=(5, 5))
plot_confusion_matrix(cm, title='Confusion matrix (k-nearest neighbors)')

In [None]:
y_pred

In [None]:
y_score

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
# k-nearest-neighbours classifier with default hyper-parameters
model = KNeighborsClassifier()
clf = model.fit(X_train, y_train)
# Show the test accuracy; a bare expression in the middle of a cell is never displayed
print(f'Test accuracy: {clf.score(X_test, y_test):.3f}')

y_pred = clf.predict(X_test)
y_score = clf.predict_proba(X_test)

# ROC is computed from the probability column at index 1 (the second class)
fpr, tpr, thresholds = roc_curve(y_test, y_score[:, 1])

plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# The title names the model that is actually fitted in this cell
plt.title('ROC plot for k-Nearest Neighbors for Airbnb Churn')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Random forest; random_state is fixed so the ROC curve is reproducible
model = RandomForestClassifier(random_state=42)
clf = model.fit(X_train, y_train)
# Show the test accuracy; a bare expression in the middle of a cell is never displayed
print(f'Test accuracy: {clf.score(X_test, y_test):.3f}')

y_pred = clf.predict(X_test)
y_score = clf.predict_proba(X_test)

# ROC is computed from the probability column at index 1 (the second class)
fpr, tpr, thresholds = roc_curve(y_test, y_score[:, 1])

plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# The title names the model that is actually fitted in this cell
plt.title('ROC plot for Random Forest for Airbnb Churn')
plt.legend(loc="lower right")
plt.show()

In [None]:
def plot_roc(y_test, y_score, title='ROC plot for Logistic Regression for Airbnb Churn'):
    """Plot a ROC curve with its area under the curve in the legend.

    Parameters
    ----------
    y_test : array-like
        True labels, passed to ``roc_curve`` as the first argument.
    y_score : array-like
        Either a 2-D array of per-class scores (column 1 is used, as in the
        ``predict_proba`` output elsewhere in this notebook) or a 1-D array
        of scores for the positive class.
    title : str, optional
        Plot title. The default keeps the original hard-coded text; pass the
        name of the model actually being evaluated.

    The function creates a new 10x10 figure, shows it and returns nothing.
    """
    scores = np.asarray(y_score)
    # Accept both a predict_proba matrix and already-extracted positive-class scores
    positive_scores = scores[:, 1] if scores.ndim == 2 else scores
    fpr, tpr, thresholds = roc_curve(y_test, positive_scores)

    plt.figure(figsize=(10,10))
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % auc(fpr, tpr))
    plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc="lower right")
    plt.show()

In [None]:
scores = cross_validate(clf, X_train, y_train, return_train_score=True, cv=5)

In [None]:
scores