In [13]:
from __future__ import division
import pandas as pd
import numpy as np

# Load the raw churn table; the later cells derive everything from `churn_df`.
churn_df = pd.read_csv("churn.csv")
col_names = list(churn_df.columns)

# Print the full list of column names under its heading.
print("Column_names:", col_names, sep="\n")

# Preview only the six leading and six trailing columns so the sample
# table stays narrow enough to read.
to_show = col_names[:6] + col_names[-6:]
print("\nSample_data:")
churn_df.loc[:, to_show].head(3)

Column_names:
['State', 'Account Length', 'Area_Code', 'Phone', 'Int_Plan', 'VMail_Plan', 'VMail Message', 'Day Mins', 'Day Calls', 'Day Charge', 'Eve Mins', 'Eve Calls', 'Eve Charge', 'Night Mins', 'Night Calls', 'Night Charge', 'Intl Mins', 'Intl Calls', 'Intl Charge', 'CustServ Calls', 'Churn']

Sample_data:


Unnamed: 0,State,Account Length,Area_Code,Phone,Int_Plan,VMail_Plan,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn
0,KS,128,415,382-4657,no,yes,11.01,10.0,3,2.7,1,False.
1,OH,107,415,371-7191,no,yes,11.45,13.7,3,3.7,1,False.
2,NJ,137,415,358-1921,no,no,7.32,12.2,5,3.29,0,False.


In [21]:
# Binary target: 1 where the raw "Churn" label is exactly the string 'True.',
# 0 for every other value.
churn_result = churn_df["Churn"]
y = np.where(churn_result == 'True.',1,0)

# Remove the target and the columns that are not used as model inputs.
to_drop = ['State','Area_Code','Phone','Churn']
churn_feat_space = churn_df.drop(to_drop,axis=1)

# Turn the 'yes'/'no' plan flags into booleans (True only for exactly 'yes');
# they become 1.0/0.0 in the float cast below.
yes_no_cols = ["Int_Plan","VMail_Plan"]
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols] == 'yes'

features = churn_feat_space.columns
# Backward-compatible alias for the original (misspelled) variable name, in
# case cells outside this view still refer to it.
feaures = features

# `DataFrame.as_matrix()` and the `np.float` alias have both been removed from
# current pandas / NumPy; `.values` and `np.float64` are the equivalents that
# work on old and new releases alike.
X = churn_feat_space.values.astype(np.float64)

# NOTE(review): the scaler is fitted on all rows before cross-validation, so
# the held-out folds influence the scaling statistics. Left unchanged to keep
# the reported numbers comparable; consider scaling inside each fold.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

print("Feature space holds %d observations and %d features"% X.shape)
print("Unique target labels:",np.unique(y))
print(X[0])
# Number of observations labelled 0.
print(len(y[y == 0]))

  app.launch_new_instance()


Feature space holds 3333 observations and 17 features
Unique target labels: [0 1]
[ 0.67648946 -0.32758048  1.6170861   1.23488274  1.56676695  0.47664315
  1.56703625 -0.07060962 -0.05594035 -0.07042665  0.86674322 -0.46549436
  0.86602851 -0.08500823 -0.60119509 -0.0856905  -0.42793202]
2850


In [22]:
# `sklearn.cross_validation` was removed from scikit-learn; `model_selection`
# provides the maintained KFold implementation.
from sklearn.model_selection import KFold

def run_cv(X,y,clf_class,**kwargs):
    """Return out-of-fold class predictions from shuffled 5-fold cross-validation.

    Parameters
    ----------
    X : array-like
        Feature matrix; must support indexing with integer index arrays
        (``X[train_index]``).
    y : array-like
        Target vector; must provide ``.copy()`` and integer-array indexing.
    clf_class : callable
        Classifier factory. A fresh instance is built for every fold as
        ``clf_class(**kwargs)`` and must expose ``fit`` and ``predict``.
    **kwargs
        Forwarded unchanged to ``clf_class``.

    Returns
    -------
    Same type as ``y``
        A copy of ``y`` in which each entry has been overwritten by the
        prediction of the model trained on the folds that did not contain it.

    Notes
    -----
    The shuffle is not seeded, so the fold assignment (and therefore the
    result) differs between calls.
    """
    kf = KFold(n_splits=5,shuffle=True)
    # Start from a copy so the result has y's dtype and length; the test
    # folds partition the data, so every position is overwritten exactly once.
    y_pred = y.copy()

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # New estimator per fold so nothing learned leaks between folds.
        clf = clf_class(**kwargs)
        clf.fit(X_train,y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred



In [23]:
from sklearn.svm import SVC  # support vector machine
from sklearn.ensemble import RandomForestClassifier as RF  # random forest
from sklearn.neighbors import KNeighborsClassifier as KNN  # k-nearest neighbors

def accuracy(y_true,y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Label sequences of identical shape. Plain Python sequences are
        accepted; they are converted to NumPy arrays so the comparison is
        element-wise rather than a single whole-object comparison.

    Returns
    -------
    numpy.float64
        Mean of the element-wise equality mask.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape (prevents a silent
        broadcast that would produce a meaningless score).
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got %s and %s"
            % (y_true.shape, y_pred.shape)
        )
    return np.mean(y_true == y_pred)

# Score each classifier (default hyper-parameters) by cross-validated
# accuracy, printing a heading line followed by the score to three decimals.
for title, clf_class in (
    ("Support vector machines:", SVC),
    ("Random forest:", RF),
    ("K-nearest-neighbors:", KNN),
):
    print(title)
    print("%.3f" % accuracy(y, run_cv(X,y,clf_class)))

  from numpy.core.umath_tests import inner1d


Support vector machines:
0.920
Random forest:
0.944
K-nearest-neighbors:
0.893


In [24]:
# `sklearn.cross_validation` was removed from scikit-learn; `model_selection`
# provides the maintained KFold implementation.
from sklearn.model_selection import KFold
def run_prob_cv(X,y,clf_class,**kwargs):
    """Return out-of-fold class probabilities from shuffled 5-fold cross-validation.

    Parameters
    ----------
    X : array-like
        Feature matrix; must support indexing with integer index arrays.
    y : array-like
        Target vector; must support ``len`` and integer-array indexing.
    clf_class : callable
        Classifier factory. A fresh instance is built for every fold as
        ``clf_class(**kwargs)`` and must expose ``fit`` and ``predict_proba``.
    **kwargs
        Forwarded unchanged to ``clf_class``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(y), 2)``; each row holds the
        ``predict_proba`` output of the model trained on the folds that did
        not contain that row.

    Notes
    -----
    The output is allocated with exactly two columns, i.e. the function
    assumes ``predict_proba`` returns two columns per sample (binary target).
    The shuffle is not seeded, so results differ between calls.
    """
    kf = KFold(n_splits=5,shuffle=True)
    y_prob = np.zeros((len(y),2))

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # New estimator per fold so nothing learned leaks between folds.
        clf = clf_class(**kwargs)
        clf.fit(X_train,y_train)
        y_prob[test_index] = clf.predict_proba(X_test)
    return y_prob


In [25]:
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Out-of-fold class probabilities from a 10-tree random forest.
pred_prob = run_prob_cv(X,y,RF,n_estimators=10)

# Second probability column is used as the predicted churn probability;
# `is_churn` marks the rows whose target is 1.
pred_churn = pred_prob[:,1]
is_churn = y == 1

# How many observations received each distinct predicted probability.
# (`pd.value_counts` is deprecated; the Series method is the supported form.)
prob_counts = pd.Series(pred_churn).value_counts()

# Observed churn rate among the observations sharing each predicted value.
# Built once, on the same index as `prob_counts`, instead of re-wrapping the
# accumulator in a Series on every loop iteration.
true_prob = pd.Series(
    [np.mean(is_churn[pred_churn == prob]) for prob in prob_counts.index],
    index=prob_counts.index,
)

counts = pd.concat([prob_counts,true_prob],axis=1).reset_index()
counts.columns = ["pred_prob","count","true_prob"]
counts


Unnamed: 0,pred_prob,count,true_prob
0,0.0,1778,0.024184
1,0.1,709,0.038082
2,0.2,246,0.04878
3,0.3,125,0.096
4,0.8,83,0.951807
5,0.9,79,0.987342
6,1.0,70,0.985714
7,0.6,65,0.815385
8,0.7,65,0.846154
9,0.5,58,0.568966
