In [16]:
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
np.random.seed(42)

In [17]:
df = pd.read_csv("s3://jb-data-set/companies.csv")
df.head(5)

Unnamed: 0,Company,Industry,Revenue,Employees,Churn
0,comp_0,IT,13780,3445,1
1,comp_1,IT,99770,49885,1
2,comp_2,service,32710,10903,0
3,comp_3,consulting,35160,8790,1
4,comp_4,finance,76550,19138,0


In [18]:
df.dtypes

Company      object
Industry     object
Revenue       int64
Employees     int64
Churn         int64
dtype: object

In [19]:
industry_type = CategoricalDtype(categories=[ "gov", "health", "retail", "travel","energy","IT","finance","consulting","service" ])

In [20]:
df['Industry'] = df['Industry'].astype(industry_type)
df['Churn'] = df['Churn'].astype('bool')

In [21]:
df=df.drop(['Company'],axis=1)

In [22]:
df = pd.get_dummies(df,columns=['Industry'])

In [23]:
df.head(10)

Unnamed: 0,Revenue,Employees,Churn,Industry_gov,Industry_health,Industry_retail,Industry_travel,Industry_energy,Industry_IT,Industry_finance,Industry_consulting,Industry_service
0,13780,3445,True,0,0,0,0,0,1,0,0,0
1,99770,49885,True,0,0,0,0,0,1,0,0,0
2,32710,10903,False,0,0,0,0,0,0,0,0,1
3,35160,8790,True,0,0,0,0,0,0,0,1,0
4,76550,19138,False,0,0,0,0,0,0,1,0,0
5,92220,30740,False,0,0,0,0,0,0,1,0,0
6,92810,23202,True,0,0,0,0,0,0,0,1,0
7,88990,44495,False,1,0,0,0,0,0,0,0,0
8,37080,9270,False,0,0,0,0,0,0,1,0,0
9,5330,2665,False,0,0,0,0,1,0,0,0,0


In [24]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('Churn', 1), df['Churn'], test_size = .2, random_state=10)

  """Entry point for launching an IPython kernel.


In [25]:
model = RandomForestClassifier(max_depth=5)

In [26]:
X_train.head(5)


Unnamed: 0,Revenue,Employees,Industry_gov,Industry_health,Industry_retail,Industry_travel,Industry_energy,Industry_IT,Industry_finance,Industry_consulting,Industry_service
31152,40520,10130,0,0,0,0,0,0,1,0,0
33499,93460,46730,0,0,1,0,0,0,0,0,0
18594,40130,13377,0,0,0,0,1,0,0,0,0
97132,80630,20158,0,0,0,0,0,0,0,1,0
56015,1360,453,0,0,0,0,0,0,0,0,1


In [27]:
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [28]:
y_predict = model.predict(X_test)

In [29]:
conf_matrix = confusion_matrix(y_test, y_predict)
conf_matrix

array([[13338,     0],
       [    0,  6662]])

In [30]:
tn, fp, fn, tp = conf_matrix.ravel()

In [31]:
sensitivity = tp / (tp + fn)
print('Sensitivity : ', sensitivity )

specificity = tn / (tn + fp)
print('Specificity : ', specificity)

Sensitivity :  1.0
Specificity :  1.0


In [34]:
prediction=model.predict([[92810,23200,0,0,0,0,0,0,0,1,0]])
prediction

array([ True])

In [35]:
prediction=model.predict([[92810,23200,0,0,0,0,1,0,0,0,0]])
prediction

array([False])