In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix 
from sklearn.metrics import confusion_matrix, recall_score 
from sklearn.metrics import precision_score, f1_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

import acquire
import prepare

import warnings
warnings.filterwarnings("ignore")

## Acquire and Prep

In [12]:
# Acquire Telco Data
df = acquire.get_telco_data()

Reading from csv file...


In [13]:
# Use prep_telco to clean and make ready foe exploration
df = prepare.prep_telco(df)

In [14]:
df.head()

Unnamed: 0,senior_citizen,tenure,monthly_charges,total_charges,is_male,has_partner,has_dependents,phone_service,no_phone_service,has_multiple_lines,...,streaming_movies,paperless_billing,churn,one_year_contract,two_year_contract,has_fiber_optic,no_internet,credit_card_auto,electronic_check_nonauto,mailed_check_nonauto
0,0,9,65.6,593.3,0,1,1,1,0,0,...,0,1,0,1,0,0,0,0,0,1
1,0,9,59.9,542.4,1,0,0,1,0,1,...,1,0,0,0,0,0,0,0,0,1
2,0,4,73.9,280.85,1,0,0,1,0,0,...,0,1,1,0,0,1,0,0,1,0
3,1,13,98.0,1237.85,1,1,0,1,0,0,...,1,1,1,0,0,1,0,0,1,0
4,1,3,83.9,267.4,0,1,0,1,0,0,...,0,1,1,0,0,1,0,0,0,1


In [15]:
df.churn.mean()

0.2653698707936959

In [16]:
df.shape

(7043, 31)

In [17]:
df.describe()

Unnamed: 0,senior_citizen,tenure,monthly_charges,total_charges,is_male,has_partner,has_dependents,phone_service,no_phone_service,has_multiple_lines,...,streaming_movies,paperless_billing,churn,one_year_contract,two_year_contract,has_fiber_optic,no_internet,credit_card_auto,electronic_check_nonauto,mailed_check_nonauto
count,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,...,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692,2279.734304,0.504756,0.483033,0.299588,0.903166,0.096834,0.421837,...,0.387903,0.592219,0.26537,0.209144,0.240664,0.439585,0.216669,0.216101,0.335794,0.22888
std,0.368612,24.559481,30.090047,2266.79447,0.500013,0.499748,0.45811,0.295752,0.295752,0.493888,...,0.487307,0.491457,0.441561,0.406726,0.427517,0.496372,0.412004,0.411613,0.472301,0.420141
min,0.0,0.0,18.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,9.0,35.5,398.55,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,29.0,70.35,1394.55,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,55.0,89.85,3786.6,1.0,1.0,1.0,1.0,0.0,1.0,...,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
max,1.0,72.0,118.75,8684.8,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
train_validate_test_split(df, df.churn, seed=123)

NameError: name 'train_validate_test_split' is not defined

In [None]:
def train_validate_test_split(df, target, seed=123):
    '''
    This function takes in a dataframe, the name of the target variable
    (for stratification purposes), and an integer for a setting a seed
    and splits the data into train, validate and test. 
    Test is 20% of the original dataset, validate is .30*.80= 24% of the 
    original dataset, and train is .70*.80= 56% of the original dataset. 
    The function returns, in this order, train, validate and test dataframes. 
    '''
    train_validate, test = train_test_split(df, test_size=0.2, 
                                            random_state=seed, 
                                            stratify=df[target])
    train, validate = train_test_split(train_validate, test_size=0.3, 
                                       random_state=seed,
                                       stratify=train_validate[target])
    return train, validate, test

In [None]:
# Stratify with categorical target variables
train, validate, test = train_validate_test_split(df, target='churn_Yes')
train.shape, validate.shape, test.shape

## Exploration

In [None]:
df.describe().T

In [None]:
plt.title('Churn')
train.churn_Yes.hist()
plt.show()

print('Percent of Churned Customers')
round(train.churn_Yes.mean(),3)

#### Tenure's Effect on Customer Churn

In [None]:
plt.title("Tenure's Effect on Churn")
sns.barplot(x="tenure", 
            y="churn_Yes", 
            data=train,
            )
churn_rate = train.churn_Yes.mean()
plt.axhline(churn_rate, label="Population churn rate")
plt.legend()
plt.ylabel('Percent Churned')
plt.xlabel('Tenure')
plt.figure(12,5)
plt.show()

In [None]:
plt.hist(train.monthly_charges, bins=20, color='dodgerblue')

plt.title('Monthly Charges')
plt.xlabel('Dollars')
plt.ylabel('Number of Customers')
plt.show()



In [None]:
plt.title("Tech Supports Effect")
sns.barplot(x="tech_support_Yes", y="churn_Yes", data=train)
population_survival_rate = train.churn_Yes.mean()
plt.axhline(population_survival_rate, label="Population churn rate")
plt.legend()
plt.show()

tech_support_churned = 
print('Percent of Tech Support Churned')
round(train.tech_support_Yes.churn_Yes.mean(),3)

In [None]:
train.corr()['churn_Yes'].sort_values()

In [None]:
sns.countplot(x='contract_type', hue = 'churn_Yes')