## This example is based on the Microsoft Machine Learning example [Churn Prediction](https://docs.microsoft.com/en-us/azure/machine-learning/desktop-workbench/scenario-churn-prediction)
This is a binary classification problem: predict the `churn` label for each customer record. Download the data [here](https://github.com/Azure/MachineLearningSamples-ChurnPrediction/tree/master/data).

In [12]:
import pandas as pd
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

## Read and clean raw data

In [13]:
# Load the raw training sample (path is relative to the working directory).
df = pd.read_csv('../data/CATelcoCustomerChurnTrainingSample.csv')

# Remove rows that are exact duplicates across every column.
df = df.drop_duplicates()

# Discard the 'year' and 'month' columns. They are named through the
# ``columns=`` keyword because passing the axis positionally
# (``df.drop('year', 1)``) was deprecated in pandas 1.3 and raises a
# TypeError from pandas 2.0 onwards.
df = df.drop(columns=['year', 'month'])

# Replace every missing value with 0 so that no NaN remains in the frame.
df = df.fillna(0)

# Preview the first rows of the cleaned frame.
df.head(20)

Unnamed: 0,age,annualincome,calldroprate,callfailurerate,callingnum,customerid,customersuspended,education,gender,homeowner,...,penaltytoswitch,state,totalminsusedinlastmonth,unpaidbalance,usesinternetservice,usesvoiceservice,percentagecalloutsidenetwork,totalcallduration,avgcallduration,churn
0,12,168147,0.06,0.0,4251078442,1,Yes,Bachelor or equivalent,Male,Yes,...,371,WA,15,19,No,No,0.82,5971,663,0
1,12,168147,0.06,0.0,4251078442,1,Yes,Bachelor or equivalent,Male,Yes,...,371,WA,15,19,No,No,0.82,3981,995,0
2,42,29047,0.05,0.01,4251043419,2,Yes,Bachelor or equivalent,Female,Yes,...,43,WI,212,34,No,Yes,0.27,7379,737,0
3,42,29047,0.05,0.01,4251043419,2,Yes,Bachelor or equivalent,Female,Yes,...,43,WI,212,34,No,Yes,0.27,1729,432,0
4,58,27076,0.07,0.02,4251055773,3,Yes,Master or equivalent,Female,Yes,...,403,KS,216,144,No,No,0.48,3122,624,0
5,58,27076,0.07,0.02,4251055773,3,Yes,Master or equivalent,Female,Yes,...,403,KS,216,144,No,No,0.48,2769,553,0
6,20,137977,0.05,0.03,4251042488,4,Yes,PhD or equivalent,Male,No,...,76,KY,412,159,Yes,No,0.94,834,834,0
7,20,137977,0.05,0.03,4251042488,4,Yes,PhD or equivalent,Male,No,...,76,KY,412,159,Yes,No,0.94,5868,838,0
8,36,136006,0.07,0.0,4251073177,5,Yes,High School or below,Male,Yes,...,436,ND,416,19,No,No,0.15,1886,628,0
9,36,136006,0.07,0.0,4251073177,5,Yes,High School or below,Male,Yes,...,436,ND,416,19,No,No,0.15,2602,867,0


## Convert text features to numeric indices

In [14]:
# Names of all text-like columns (object or categorical dtype).
columns_to_encode = df.select_dtypes(include=['category', 'object']).columns.tolist()

# Overwrite each of those columns with integer codes. A fresh encoder is
# fitted per column, so the codes of one column are independent of the others.
for column_to_encode in columns_to_encode:
    le = LabelEncoder()
    le.fit(df[column_to_encode])
    df[column_to_encode] = le.transform(df[column_to_encode])

# Preview the encoded frame.
df.head(20)

Unnamed: 0,age,annualincome,calldroprate,callfailurerate,callingnum,customerid,customersuspended,education,gender,homeowner,...,penaltytoswitch,state,totalminsusedinlastmonth,unpaidbalance,usesinternetservice,usesvoiceservice,percentagecalloutsidenetwork,totalcallduration,avgcallduration,churn
0,12,168147,0.06,0.0,4251078442,1,1,0,1,1,...,371,46,15,19,0,0,0.82,5971,663,0
1,12,168147,0.06,0.0,4251078442,1,1,0,1,1,...,371,46,15,19,0,0,0.82,3981,995,0
2,42,29047,0.05,0.01,4251043419,2,1,0,0,1,...,43,47,212,34,0,1,0.27,7379,737,0
3,42,29047,0.05,0.01,4251043419,2,1,0,0,1,...,43,47,212,34,0,1,0.27,1729,432,0
4,58,27076,0.07,0.02,4251055773,3,1,2,0,1,...,403,15,216,144,0,0,0.48,3122,624,0
5,58,27076,0.07,0.02,4251055773,3,1,2,0,1,...,403,15,216,144,0,0,0.48,2769,553,0
6,20,137977,0.05,0.03,4251042488,4,1,3,1,0,...,76,16,412,159,1,0,0.94,834,834,0
7,20,137977,0.05,0.03,4251042488,4,1,3,1,0,...,76,16,412,159,1,0,0.94,5868,838,0
8,36,136006,0.07,0.0,4251073177,5,1,1,1,1,...,436,27,416,19,0,0,0.15,1886,628,0
9,36,136006,0.07,0.0,4251073177,5,1,1,1,1,...,436,27,416,19,0,0,0.15,2602,867,0


## Split data into training and test data sets

In [15]:
# Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible between runs.
train, test = train_test_split(df, test_size=0.3, random_state=42)

# Separate the 'churn' label from the feature columns of each partition.
# ``columns=`` is used instead of a positional axis (``drop('churn', 1)``),
# which was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0.
target = train['churn'].values
train = train.drop(columns='churn')

expected = test['churn'].values
test = test.drop(columns='churn')

# Preview the training features.
train.head(20)

Unnamed: 0,age,annualincome,calldroprate,callfailurerate,callingnum,customerid,customersuspended,education,gender,homeowner,...,occupation,penaltytoswitch,state,totalminsusedinlastmonth,unpaidbalance,usesinternetservice,usesvoiceservice,percentagecalloutsidenetwork,totalcallduration,avgcallduration
17211,75,152283,0.03,0.0,4251048333,8010,1,0,0,1,...,2,471,24,162,184,0,0,0.12,1664,832
18597,65,168961,0.02,0.01,4251088692,8652,1,1,1,1,...,1,259,36,142,144,0,0,0.03,7712,771
15968,50,138023,0.05,0.02,4251020038,7430,1,0,1,1,...,2,472,40,294,14,0,0,0.37,3864,644
7285,77,239904,0.06,0.02,4251048206,3382,1,0,1,1,...,0,429,43,24,228,0,0,0.91,4372,624
8214,59,133945,0.03,0.02,4251097907,3815,1,1,1,1,...,0,227,29,95,229,0,0,0.28,52,52
3813,46,71623,0.07,0.03,4251087087,1771,1,0,1,0,...,0,3,40,284,70,0,0,0.58,2690,538
796,57,127008,0.07,0.02,4251083915,371,1,1,1,1,...,1,210,28,168,65,0,0,0.55,6064,758
10733,19,119907,0.01,0.03,4251031078,4993,1,2,0,1,...,1,426,35,368,142,1,1,0.51,1664,416
18075,35,241912,0.03,0.0,4251086980,8411,1,1,1,0,...,2,429,24,16,6,0,0,0.87,2063,515
8423,14,221484,0.0,0.03,4251093150,3913,1,1,1,1,...,1,322,43,410,103,0,0,0.97,4105,586


## Train a decision tree model

In [16]:
# Decision tree with default hyper-parameters; the seed is fixed so that
# repeated runs build the same tree.
model = DecisionTreeClassifier(random_state=42)

# Fit on the training features and their 'churn' labels. The call is left as
# the final expression so the notebook echoes the fitted estimator.
model.fit(X=train, y=target)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')

## Evaluate the model

In [17]:
# Hard class predictions, kept available under the original name.
predicted = model.predict(test)

# ROC AUC measures how well the model *ranks* positives above negatives, so it
# should be computed from a continuous score rather than thresholded labels
# (hard labels reduce the ROC curve to a single operating point).
# Column 1 of predict_proba is the probability of ``model.classes_[1]``, the
# greater label -- assumed here to be churn == 1.
predicted_proba = model.predict_proba(test)[:, 1]

print(roc_auc_score(expected, predicted_proba))

0.828096765593
