In [37]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import numpy as np

In [2]:
# Load the telecom churn dataset from the project-relative csv/ folder.
data = pd.read_csv(filepath_or_buffer="csv/telecom_churn.csv")

In [3]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [5]:
# Remove the 'State' and 'Voice mail plan' columns before modelling.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution is a no-op rather than a
# KeyError on the already-missing columns.
data = data.drop(columns=['State', 'Voice mail plan'], errors='ignore')

In [6]:
# Encode the 'International plan' flag as 1 ('Yes') / 0 ('No').
# The dtype guard makes the cell safe to re-run: pushing an already-encoded
# 0/1 column through the same mapping would silently turn every value into NaN.
if not pd.api.types.is_numeric_dtype(data['International plan']):
    data['International plan'] = data['International plan'].map({'Yes': 1, 'No': 0})

In [8]:
# Inspect the dtypes and non-null counts of the columns currently in `data`.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Account length          3333 non-null   int64  
 1   Area code               3333 non-null   int64  
 2   International plan      3333 non-null   int64  
 3   Number vmail messages   3333 non-null   int64  
 4   Total day minutes       3333 non-null   float64
 5   Total day calls         3333 non-null   int64  
 6   Total day charge        3333 non-null   float64
 7   Total eve minutes       3333 non-null   float64
 8   Total eve calls         3333 non-null   int64  
 9   Total eve charge        3333 non-null   float64
 10  Total night minutes     3333 non-null   float64
 11  Total night calls       3333 non-null   int64  
 12  Total night charge      3333 non-null   float64
 13  Total intl minutes      3333 non-null   float64
 14  Total intl calls        3333 non-null   

In [9]:
# Target vector: the boolean 'Churn' column cast to integers (True -> 1, False -> 0).
y = data['Churn'].astype(int)

In [10]:
# Feature matrix: every column of `data` except the target.
X = data.drop(columns='Churn')

In [11]:
# Sanity check: features and target must have the same number of rows.
(X.shape, y.shape)

((3333, 17), (3333,))

In [19]:
from sklearn.model_selection import train_test_split, cross_val_score

In [15]:
# Hold out 30% of the rows for the final evaluation; the seed is fixed so the
# split is reproducible.
# NOTE(review): the split is not stratified on y — consider stratify=y if the
# class balance in the holdout part matters; confirm intent.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=17,
)

In [16]:
# Confirm the sizes of the training and holdout parts.
(X_train.shape, X_holdout.shape)

((2333, 17), (1000, 17))

In [17]:
# Baseline decision tree: every hyperparameter is left at its default except
# random_state, which is fixed for reproducibility.
first_tree = DecisionTreeClassifier(random_state=17)

In [20]:
# Per-fold scores of the baseline tree under 5-fold cross-validation on the training part.
cross_val_score(estimator=first_tree, X=X_train, y=y_train, cv=5)

array([0.9143469 , 0.91220557, 0.92077088, 0.90772532, 0.91416309])

In [22]:
# Average of the five fold scores for the baseline tree.
np.mean(cross_val_score(first_tree, X_train, y_train, cv=5))

0.9138423504976518

In [23]:
from sklearn.neighbors import KNeighborsClassifier

In [28]:
# Baseline k-nearest-neighbours classifier with default settings.
# NOTE(review): no feature scaling is visible before this point; kNN is
# distance-based, so standardising the features may be worth trying — confirm intent.
first_knn = KNeighborsClassifier()

In [29]:
# Per-fold scores of the baseline kNN under 5-fold cross-validation on the training part.
cross_val_score(estimator=first_knn, X=X_train, y=y_train, cv=5)

array([0.8608137 , 0.85653105, 0.875803  , 0.86266094, 0.87982833])

In [30]:
# Average of the five fold scores for the baseline kNN.
np.mean(cross_val_score(first_knn, X_train, y_train, cv=5))

0.8671274043984523

## Настраиваем max_depth и max_features для дерева

In [31]:
from sklearn.model_selection import GridSearchCV

In [43]:
# Hyperparameter grid for the decision tree.
# max_depth: integer depths 1..10.
# max_features: fractions of the feature set tried at each split. The last
# entry must be the float 1.0 — scikit-learn reads an *integer* 1 as
# "consider exactly one feature per split", not as "use all features".
tree_params = {
    'max_depth': np.arange(1, 11),
    'max_features': [0.5, 0.7, 1.0],
}

In [48]:
# Grid search over tree_params for the baseline tree, scored with 5-fold
# cross-validation and run on two worker processes.
tree_grid = GridSearchCV(
    estimator=first_tree,
    param_grid=tree_params,
    cv=5,
    n_jobs=2,
)

In [50]:
%%time
tree_grid.fit(X_train, y_train)

CPU times: user 76.8 ms, sys: 10.6 ms, total: 87.4 ms
Wall time: 438 ms


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=17), n_jobs=2,
             param_grid={'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
                         'max_features': [0.5, 0.7, 1]})

In [51]:
# Tree configured with the best parameter combination found by the search.
tree_grid.best_estimator_

DecisionTreeClassifier(max_depth=6, max_features=0.7, random_state=17)

In [52]:
# Winning parameter combination as a plain dict.
tree_grid.best_params_

{'max_depth': 6, 'max_features': 0.7}

In [53]:
# Mean cross-validated score achieved by the winning parameter combination.
tree_grid.best_score_

0.9391366681677404

In [76]:
# Coarse kNN grid: very small neighbourhoods (1-4) plus large ones (50-90, step 10).
knn_params = {'n_neighbors': [*range(1, 5), *range(50, 100, 10)]}

In [70]:
# Grid search over knn_params for the baseline kNN, scored with 5-fold
# cross-validation and run on two worker processes.
knn_grid = GridSearchCV(
    estimator=first_knn,
    param_grid=knn_params,
    cv=5,
    n_jobs=2,
)

In [71]:
%%time
knn_grid.fit(X_train, y_train)

CPU times: user 32.6 ms, sys: 6.53 ms, total: 39.1 ms
Wall time: 555 ms


GridSearchCV(cv=5, estimator=KNeighborsClassifier(), n_jobs=2,
             param_grid={'n_neighbors': [1, 2, 3, 4, 50, 60, 70, 80, 90]})

In [74]:
# Best estimator, its parameters and its mean cross-validated score for the coarse kNN grid.
(
    knn_grid.best_estimator_,
    knn_grid.best_params_,
    knn_grid.best_score_,
)

(KNeighborsClassifier(n_neighbors=4), {'n_neighbors': 4}, 0.8658416887998456)

In [81]:
# Second kNN grid covering the mid-range neighbourhood sizes 5..25 in steps of 5.
knn_params1 = {'n_neighbors': [5, 10, 15, 20, 25]}

In [82]:
# Grid search over knn_params1 for the baseline kNN, scored with 5-fold
# cross-validation and run on two worker processes.
knn_grid1 = GridSearchCV(
    estimator=first_knn,
    param_grid=knn_params1,
    cv=5,
    n_jobs=2,
)

In [83]:
%%time
knn_grid1.fit(X_train, y_train)

CPU times: user 31.3 ms, sys: 6.6 ms, total: 37.9 ms
Wall time: 337 ms


GridSearchCV(cv=5, estimator=KNeighborsClassifier(), n_jobs=2,
             param_grid={'n_neighbors': [5, 10, 15, 20, 25]})

In [84]:
# Best estimator, its parameters and its mean cross-validated score for the second kNN grid.
(
    knn_grid1.best_estimator_,
    knn_grid1.best_params_,
    knn_grid1.best_score_,
)

(KNeighborsClassifier(n_neighbors=10), {'n_neighbors': 10}, 0.8701289391697531)

In [86]:
# Predict holdout labels with the tuned tree held by the grid search.
tree_holdout_predict = tree_grid.predict(X=X_holdout)

In [87]:
# Score of the tuned tree on the holdout part.
tree_grid.score(X=X_holdout, y=y_holdout)

0.936

In [88]:
from sklearn.metrics import accuracy_score

In [90]:
# Cross-check: accuracy computed directly from the stored holdout predictions.
accuracy_score(y_true=y_holdout, y_pred=tree_holdout_predict)

0.936

In [91]:
# Share of the negative class (Churn == 0) in the full target vector, i.e. the
# accuracy a constant "no churn" prediction would reach on the whole dataset.
# NOTE(review): computed on all of y, not on y_holdout — confirm this is the
# intended baseline for the holdout accuracy above.
1 - y.mean()

0.8550855085508551

In [92]:
from sklearn.tree import export_graphviz

In [93]:
# Export the tuned tree as a Graphviz description into the file 'telecom_3'.
# feature_names is passed as a plain list: some scikit-learn releases validate
# this argument strictly and reject a pandas Index, while a list is always accepted.
export_graphviz(
    tree_grid.best_estimator_,
    out_file="telecom_3",
    feature_names=list(X.columns),
    filled=True,
)

In [95]:
# List the working directory to confirm that the 'telecom_3' file was written.
!ls

Topic3.1.ipynb            screen1.png               topic3_decision_tree2.png
Videolecture.ipynb        telecom_3                 topic3_decision_tree3.png
[34mcsv[m[m                       topic3_decision_tree1.png


In [104]:
!open tree.png

In [97]:
export_graphviz?

In [103]:
# Render the Graphviz description 'telecom_3' to tree.png
# (needs the Graphviz `dot` executable to be available on PATH).
!dot -Tpng telecom_3 -o tree.png

<img src="tree.png">