In [1]:
# standard modules
import seaborn as sns
import pandas as pd
import numpy as np
import os
#import math

# Modules for Displaying Figures
import matplotlib.pyplot as plt
import scipy.stats as stats


# Data Science Modules 
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# My modules
import src.acquire as ac
import src.prepare as pp
import src.helper as helper
import src.evaluate as evaluate

# Turn off the red warnings
# NOTE(review): with no category or module argument, this filter ignores every
# warning raised from here on, including library deprecation notices.
import warnings
warnings.filterwarnings("ignore")

The following datasets are available:
telco


In [2]:
# Acquire the Telco data, then unpack the nine objects returned by
# pp.model_telco_data for use in the analysis below.
telco = ac.get_telco_data()
train, validate, test, x_train, y_train, x_validate, y_validate, x_test, y_test = pp.model_telco_data(telco)
# Baseline value for the 'churn' column; it is subtracted from the knn10
# result in the income estimate further down.
base = evaluate.baseline(telco, 'churn')

# Run the models up front, before the discussion begins.
# NOTE(review): every model call here is handed the test split (x_test, y_test);
# x_validate / y_validate are unpacked above but not used in this cell —
# confirm that is intentional.
decision_tree = evaluate.decision_tree_model(x_train, y_train, x_test, y_test)
knn7 = evaluate.knn7_model(x_train, y_train, x_test, y_test)
knn10 = evaluate.knn10_model(x_train, y_train, x_test, y_test)
models_compared = [base, decision_tree, knn7, knn10]
# Second, separate call to the acquire helper; later cells read the churn,
# tenure and monthly_charges columns from `df`.
# NOTE(review): presumably kept apart from `telco` so it is unaffected by
# whatever pp.model_telco_data does to its argument — confirm.
df = ac.get_telco_data()

In [3]:
# Boolean mask: True for rows whose churn value is 'No'.
no_churn = df['churn'] == 'No'

In [4]:
# Relative frequency of each churn value in `telco`.
churn_shares = telco['churn'].value_counts(normalize=True)
# NOTE(review): `1` is an integer key. If the churn labels are strings this is
# a positional lookup, which newer pandas deprecates — confirm the label type
# and switch to an explicit .loc / .iloc accordingly.
churn_shares[1]

0.2653698707936959

In [5]:
# Mean tenure over the rows whose churn value is 'No'.
# (tenure is presumably measured in months, going by the variable name.)
no_churn = df['churn'] == 'No'
avg_no_churn_months = df.loc[no_churn, 'tenure'].mean()
avg_no_churn_months

37.56996521066873

In [6]:
# Mean tenure over the rows whose churn value is 'Yes'.
# (tenure is presumably measured in months, going by the variable name.)
yes_churn = df['churn'] == 'Yes'
avg_yes_churn_months = df.loc[yes_churn, 'tenure'].mean()
avg_yes_churn_months

17.979133226324237

In [9]:
# Inline income estimate built from values computed in earlier cells.
churned_customers = df[df['churn'] == 'Yes']
model_gain = knn10 - base

# Number of 'Yes' rows scaled by the difference between knn10 and the baseline.
potential_non_churn = len(churned_customers) * model_gain
# Gap in mean tenure between the 'No' and 'Yes' groups.
extra_months = avg_no_churn_months - avg_yes_churn_months
avg_charges = df['monthly_charges'].mean()

# NOTE(review): 0.95 is an unexplained scaling factor — document what it
# represents or move it to a named constant.
potential_non_churn * extra_months * avg_charges * 0.95

114276.90882214584

In [8]:
# Project helper for the income estimate; its displayed result matches the
# value produced by the inline calculation in the preceding cell.
evaluate.potential_income()

114276.90882214584

In [16]:
# Summary of the telco frame: column names, non-null counts and dtypes.
telco.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   gender                 7043 non-null   object 
 1   senior_citizen         7043 non-null   int64  
 2   partner                7043 non-null   object 
 3   dependents             7043 non-null   object 
 4   tenure                 7043 non-null   int64  
 5   phone_service          7043 non-null   object 
 6   multiple_lines         7043 non-null   object 
 7   online_security        7043 non-null   object 
 8   online_backup          7043 non-null   object 
 9   device_protection      7043 non-null   object 
 10  tech_support           7043 non-null   object 
 11  streaming_tv           7043 non-null   object 
 12  streaming_movies       7043 non-null   object 
 13  paperless_billing      7043 non-null   object 
 14  monthly_charges        7043 non-null   float64
 15  tota

In [6]:
# Positional split of the training frame: the first column on its own, and
# every remaining column as the feature set.
# NOTE(review): assumes the target sits in column 0 of `train` — confirm
# against pp.model_telco_data.
independent = train.iloc[:, 1:]
target = train.iloc[:, 0]

In [25]:
# Score every column of `independent` against `target` with the chi2 scorer.
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(independent, target)

# One row per feature (column name alongside its score), keeping only the
# ten highest-scoring rows; the original positional index is preserved.
feature_scores = pd.DataFrame({
    'customer_features': independent.columns,
    'score': fit.scores_,
}).nlargest(10, 'score')

In [30]:
# Show just the feature names from the top-scoring rows kept in feature_scores.
feature_scores['customer_features']

1                                tenure
25        payment_type_Electronic check
21               contract_type_Two year
22    internet_service_type_Fiber optic
Name: customer_features, dtype: object