In [4]:
import pandas as pd
import numpy as np

# Location of the churn dataset. Written as a raw string so the Windows
# backslashes are taken literally; in a normal string, sequences such as
# "\W" or "\C" are invalid escapes that newer Python versions warn about.
CHURN_CSV_PATH = r"E:\Workspace\jupyter_notebook\notebook_idata_lesson01\Churn\churn.csv"

churn_df = pd.read_csv(CHURN_CSV_PATH)

# Preview the first five rows
churn_df.head()


Unnamed: 0,State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,...,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False.
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False.
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False.
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False.
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False.


In [8]:
# Target labels
churn_result = churn_df["Churn?"]
# Encode the label string "True." as 1 and every other value as 0
y = np.where(churn_result == "True.", 1, 0)

# Drop the columns that are not used as features (including the label itself)
to_drop = ["State", "Area Code", "Phone", "Churn?"]
churn_feat_space = churn_df.drop(to_drop, axis=1)

# Convert the yes/no columns to booleans
yes_no_cols = ["Int'l Plan", "VMail Plan"]
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols] == "yes"

features = churn_feat_space.columns

# DataFrame.as_matrix() was removed in pandas 1.0 and the np.float alias was
# removed in NumPy 1.24; `.values` plus the builtin `float` gives the same
# float64 matrix on both old and current versions.
X = churn_feat_space.values.astype(float)
print(X)

# Standardize the data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(X)

print(X)


[[128.     0.     1.   ...   3.     2.7    1.  ]
 [107.     0.     1.   ...   3.     3.7    1.  ]
 [137.     0.     0.   ...   5.     3.29   0.  ]
 ...
 [ 28.     0.     0.   ...   6.     3.81   2.  ]
 [184.     1.     0.   ...  10.     1.35   2.  ]
 [ 74.     0.     1.   ...   4.     3.7    0.  ]]


[[ 0.67648946 -0.32758048  1.6170861  ... -0.60119509 -0.0856905
  -0.42793202]
 [ 0.14906505 -0.32758048  1.6170861  ... -0.60119509  1.2411686
  -0.42793202]
 [ 0.9025285  -0.32758048 -0.61839626 ...  0.21153386  0.69715637
  -1.1882185 ]
 ...
 [-1.83505538 -0.32758048 -0.61839626 ...  0.61789834  1.3871231
   0.33235445]
 [ 2.08295458  3.05268496 -0.61839626 ...  2.24335625 -1.87695028
   0.33235445]
 [-0.67974475 -0.32758048  1.6170861  ... -0.19483061  1.2411686
  -1.1882185 ]]


In [9]:
try:
    from sklearn.cross_validation import KFold
except ImportError:
    # sklearn.cross_validation was removed in scikit-learn 0.20; KFold now
    # lives in sklearn.model_selection. NOTE: the two KFold classes have
    # different constructor and iteration APIs.
    from sklearn.model_selection import KFold

def run_cv(X, y, clf_class, **kwargs):
    """Return out-of-fold class predictions from shuffled 5-fold cross-validation.

    Parameters
    ----------
    X : indexable array
        Feature matrix; rows are selected with integer index arrays.
    y : numpy.ndarray
        Labels, one per row of ``X``. Must support ``.copy()`` and
        integer-array indexing.
    clf_class : callable
        Classifier class (not an instance); a fresh instance is built for
        every fold. Instances must provide ``fit`` and ``predict``.
    **kwargs
        Forwarded unchanged to ``clf_class``.

    Returns
    -------
    numpy.ndarray
        Array shaped like ``y`` in which every entry is the prediction made
        by the model that did not see that row during training.
    """
    # Imported locally so this function does not depend on which KFold the
    # notebook imported at module level. The old
    # `sklearn.cross_validation.KFold(n, n_folds=...)` API was removed in
    # scikit-learn 0.20; `model_selection` is available from 0.18 onwards.
    from sklearn.model_selection import KFold

    # Construct a k-fold splitter (unseeded shuffle: folds differ per call)
    kf = KFold(n_splits=5, shuffle=True)
    # Start from a copy of y so the result has the same shape and dtype;
    # every position is overwritten below because the test folds cover
    # each row exactly once.
    y_pred = y.copy()

    # Iterate through folds
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # Initialize a classifier with keyword arguments
        clf = clf_class(**kwargs)
        clf.fit(X_train, y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred




In [11]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN

def accuracy(y_true, y_pred):
    """Return the fraction of positions at which ``y_pred`` equals ``y_true``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Label sequences to compare element-wise. Plain lists and tuples are
        accepted as well as NumPy arrays.

    Returns
    -------
    numpy.float64
        Share of matching entries, between 0.0 and 1.0.
    """
    # Coerce to arrays first: `list == list` yields a single bool instead of
    # an element-wise mask, which would silently report 0.0 or 1.0.
    matches = np.asarray(y_true) == np.asarray(y_pred)
    # NumPy treats True/False as 1/0, so the mean of the mask is the hit rate
    return np.mean(matches)

# Print the cross-validated accuracy of each model, one per line, in the
# order: SVM, random forest, k-nearest neighbours
for _clf_class in (SVC, RF, KNN):
    print(accuracy(y, run_cv(X, y, _clf_class)))


0.9186918691869187


0.9471947194719472


0.894989498949895


In [12]:
def run_prob_cv(X, y, clf_class, **kwargs):
    """Return out-of-fold class probabilities from shuffled 5-fold cross-validation.

    Parameters
    ----------
    X : indexable array
        Feature matrix; rows are selected with integer index arrays.
    y : numpy.ndarray
        Labels, one per row of ``X``.
    clf_class : callable
        Classifier class (not an instance); a fresh instance is built for
        every fold. Instances must provide ``fit`` and ``predict_proba``.
    **kwargs
        Forwarded unchanged to ``clf_class``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(y), 2)`` holding, for each row, the output of
        ``predict_proba`` from the model that did not train on that row.
        The two-column result buffer means this is written for two-class
        problems.
    """
    # Imported locally so this function does not depend on which KFold the
    # notebook imported at module level. The old
    # `sklearn.cross_validation.KFold(n, n_folds=...)` API was removed in
    # scikit-learn 0.20; `model_selection` is available from 0.18 onwards.
    from sklearn.model_selection import KFold

    # Unseeded shuffle: the fold assignment differs on every call
    kf = KFold(n_splits=5, shuffle=True)
    y_prob = np.zeros((len(y), 2))
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        clf = clf_class(**kwargs)
        clf.fit(X_train, y_train)
        # Predict probabilities, not classes
        y_prob[test_index] = clf.predict_proba(X_test)
    return y_prob


In [13]:
import warnings

# NOTE: this silences every warning for the rest of the kernel session,
# including deprecation warnings from pandas/NumPy/scikit-learn.
warnings.filterwarnings('ignore')

# Use 10 estimators so predictions are all multiples of 0.1
pred_prob = run_prob_cv(X, y, RF, n_estimators=10)
# Column 1 holds the probability assigned to the positive (churn) class
pred_churn = pred_prob[:, 1]
is_churn = y == 1

# Number of times a predicted probability is assigned to an observation
# (Series.value_counts replaces the deprecated top-level pd.value_counts)
counts = pd.Series(pred_churn).value_counts()

# Observed churn rate among the observations that received each predicted
# probability. Built once, after all values are known, and aligned to
# counts.index so the two series line up row by row when concatenated.
true_prob = pd.Series(
    [np.mean(is_churn[pred_churn == prob]) for prob in counts.index],
    index=counts.index,
)

# Combine both series into one table keyed by predicted probability
counts = pd.concat([counts, true_prob], axis=1).reset_index()
counts.columns = ['pred_prob', 'count', 'true_prob']
counts


Unnamed: 0,pred_prob,count,true_prob
0,0.0,1749,0.028588
1,0.1,708,0.025424
2,0.2,257,0.066148
3,0.3,132,0.121212
4,0.4,85,0.341176
5,0.5,78,0.653846
6,0.9,77,0.974026
7,1.0,64,1.0
8,0.6,63,0.809524
9,0.7,61,0.885246
