In [20]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier

In [3]:
data = pd.read_csv('../../data/telecom_churn.csv')

In [5]:
data.head()

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [6]:
data.drop(['State', 'Voice mail plan'], axis=1, inplace=True)
data.head()

Unnamed: 0,Account length,Area code,International plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,128,415,No,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,107,415,No,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,137,415,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,84,408,Yes,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,75,415,Yes,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [8]:
data['International plan'] = data['International plan'].map({'Yes': 1, 'No': 0})
data.head()

Unnamed: 0,Account length,Area code,International plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,128,415,0,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,107,415,0,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,137,415,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,84,408,1,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,75,415,1,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 18 columns):
Account length            3333 non-null int64
Area code                 3333 non-null int64
International plan        3333 non-null int64
Number vmail messages     3333 non-null int64
Total day minutes         3333 non-null float64
Total day calls           3333 non-null int64
Total day charge          3333 non-null float64
Total eve minutes         3333 non-null float64
Total eve calls           3333 non-null int64
Total eve charge          3333 non-null float64
Total night minutes       3333 non-null float64
Total night calls         3333 non-null int64
Total night charge        3333 non-null float64
Total intl minutes        3333 non-null float64
Total intl calls          3333 non-null int64
Total intl charge         3333 non-null float64
Customer service calls    3333 non-null int64
Churn                     3333 non-null bool
dtypes: bool(1), float64(8), int64(9)
memory usage

In [10]:
y = data['Churn'].astype('int')

In [11]:
X = data.drop('Churn', axis=1)

In [12]:
X.shape, y.shape

((3333, 17), (3333,))

In [18]:
from sklearn.model_selection import train_test_split, cross_val_score

In [14]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=17)

In [15]:
X_train.shape, X_valid.shape

((2333, 17), (1000, 17))

In [16]:
y_train.shape, y_valid.shape

((2333,), (1000,))

In [17]:
first_tree = DecisionTreeClassifier(random_state=17)

In [19]:
cross_val_score(first_tree, X_train, y_train, cv=5)

array([0.9143469 , 0.91220557, 0.92291221, 0.90772532, 0.91416309])

In [21]:
np.mean(cross_val_score(first_tree, X_train, y_train, cv=5))

0.9142706160222772

In [22]:
from sklearn.neighbors import KNeighborsClassifier

In [23]:
first_knn = KNeighborsClassifier()

In [25]:
np.mean(cross_val_score(first_knn, X_train, y_train, cv=5))

0.8671274043984523

## Настраиваем `max_depth` для дерева

In [26]:
from sklearn.model_selection import GridSearchCV

In [27]:
param_grid = {
    'max_depth': np.arange(1, 11),
    'max_features': [0.5, 0.7, 1]
}

tree_grid = GridSearchCV(first_tree, param_grid, cv=5, n_jobs=-1)

In [29]:
%%time
tree_grid.fit(X_train, y_train)

CPU times: user 736 ms, sys: 86.5 ms, total: 822 ms
Wall time: 1.43 s


GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=17,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]), 'max_features': [0.5, 0.7, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [30]:
tree_grid.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,
            max_features=0.7, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=17,
            splitter='best')

In [31]:
tree_grid.best_score_

0.9391341620231461

In [32]:
tree_grid.best_params_

{'max_depth': 6, 'max_features': 0.7}

## Настраиваем `knn`

In [44]:
param_grid = {
    'n_neighbors': [1, 2, 3, 4],
    'n_neighbors': np.arange(5, 30, 5),
    'n_neighbors': np.arange(50, 100, 10)
}

knn_grid = GridSearchCV(first_knn, param_grid, cv=5, n_jobs=-1)

In [45]:
%%time
knn_grid.fit(X_train, y_train)

CPU times: user 550 ms, sys: 83.8 ms, total: 634 ms
Wall time: 2.28 s


GridSearchCV(cv=5, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_neighbors': array([50, 60, 70, 80, 90])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [46]:
knn_grid.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=50, p=2,
           weights='uniform')

In [47]:
knn_grid.best_score_

0.8624089155593656

In [48]:
knn_grid.best_params_

{'n_neighbors': 50}

In [49]:
tree_pred = tree_grid.best_estimator_.predict(X_valid)

In [50]:
from sklearn.metrics import accuracy_score

In [51]:
accuracy_score(y_valid, tree_pred)

0.936

In [52]:
1 - np.mean(y)

0.8550855085508551

In [53]:
from sklearn.tree import export_graphviz

In [54]:
export_graphviz(tree_grid.best_estimator_, out_file='telecom_tree.dot', feature_names=X.columns, filled=True)

In [55]:
!ls -l *.dot

-rw-r--r-- 1 ivanmagda staff 8071 Feb 27 11:48 telecom_tree.dot


In [58]:
!dot -Tpng telecom_tree.dot -o telecom_tree.png



<img src='telecom_tree.png'>