In [1]:
#libraries
import acquire
import explore
import prepare
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

#sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the Telco dataset through the project's acquire module.
df = acquire.get_telco_data()
# Preview one row to confirm the load and see the column layout.
df.head(1)

Unnamed: 0,payment_type_id,payment_type,internet_service_type_id,internet_service_type,contract_type_id,contract_type,customer_id,gender,senior_citizen,partner,...,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn
0,1,Electronic check,1,DSL,1,Month-to-month,0015-UOCOJ,Female,1,No,...,Yes,No,No,No,No,No,Yes,48.2,340.35,No


In [3]:
# Run the project's prepare.clean_data step. The preview below shows 0/1
# encoded columns and a `churned` target; the exact transformations live in
# the prepare module.
# NOTE(review): `df` is rebound here, so this cell is presumably not safe to
# re-run without re-running the acquire cell first -- confirm.
df = prepare.clean_data(df)
df.head(1)

Unnamed: 0,senior_citizen,tenure,monthly_charges,total_charges,bank_transfer,credit_card,e_check,check,dsl,fiber,...,two_year_contract,is_male,partner,multiple_lines,device_protection,tech_support,streaming_tv,streaming_movies,paperless,churned
0,1,7,48.2,340.35,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [6]:
# Split into train / validate / test with the prepare module's helper, passing
# 'churned' as the target column name (presumably used for stratification, as
# in the in-notebook copy of this function -- confirm in the prepare module).
train, validate, test = prepare.train_validate_test_split(df, 'churned')

In [8]:
# Report (rows, columns) for each of the three splits on a single line.
print(*(split.shape for split in (train, validate, test)))

(3943, 34) (1691, 34) (1409, 34)


In [None]:
#split data
def train_validate_test_split(df, target, seed=123):
    '''
    Carve a dataframe into train, validate and test subsets, stratifying
    both cuts on the target column.

    df: the dataframe to split.
    target: name of the column whose values are used for stratification.
    seed: integer passed as random_state to both splits (default 123).

    The first cut holds out 20% of the rows as test. The second cut takes
    30% of the remaining 80% as validate (24% of the original), leaving
    70% of that 80% as train (56% of the original).

    Returns train, validate and test dataframes, in that order.
    '''
    # First cut: 20% of all rows become the test set.
    remainder, test = train_test_split(
        df,
        test_size=0.2,
        random_state=seed,
        stratify=df[target],
    )
    # Second cut: 30% of what is left becomes validate, the rest is train.
    train, validate = train_test_split(
        remainder,
        test_size=0.3,
        random_state=seed,
        stratify=remainder[target],
    )
    return train, validate, test

In [None]:
# The helper returns three dataframes (train, validate, test). Unpack them
# instead of assigning the whole tuple to `df`, which would overwrite the
# prepared DataFrame and break every later cell that uses `df`.
train, validate, test = prepare.train_validate_test_split(df, 'churned')

In [None]:
#split into train, validate, test
# The function needs the DataFrame and the name of the target column;
# calling it with no arguments raises a TypeError.
train, validate, test = train_validate_test_split(df, 'churned')
train.head(2)

In [None]:
# Show the (rows, columns) shape of train, validate and test side by side.
print(*(part.shape for part in (train, validate, test)))

In [None]:
# For each split, build X (every column except the target) and y (a Series
# holding only the "churned" target).
X_train, y_train = train.drop(columns=["churned"]), train["churned"]

X_validate, y_validate = validate.drop(columns=["churned"]), validate["churned"]

X_test, y_test = test.drop(columns=["churned"]), test["churned"]

In [None]:
# Summarise the working DataFrame: column names, dtypes and non-null counts.
df.info()

### This will be in the prepare module
    _Check for and remove duplicates by customer_id. - None
    
    _Remove redundant columns: payment_type_id, internet_service_type_id, contract_type_id, & customer_id. - Completed
        -Used the following to drop redundant columns
        -df = df.drop(['payment_type_id', 'internet_service_type_id', 'contract_type_id', 'customer_id'], axis = 1)
        -df.head(2)
    
    _Encode payment_type(Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic)), internet_service_type(DSL, Fiber, None), contract_type(Month-to-month, One year, Two year), gender(male/female), partner(yes/no), multiple_lines(yes/no), device_protection(yes/no), tech_support(yes/no), streaming_tv(yes/no), streaming_movies(yes/no), paperless_billing, & churn (yes/no).
    - gender is male(1) or not male(0)
    - yes(1) no(0)
    - payment, contract and service types (True = 1) (False = 0)
    
    _Check for nulls and fill (Fill in total_charges' blanks with 0). - Completed
        -#find empty values. Tried isna, isnull, notna, notnull but they did not show any matches.
            -df.eq(' ').sum()
        -Used .replace to replace empty values with a 0 in total charges.
            -df = df.replace({'total_charges': ' '}, 0)
            -df.head()

    _Convert total_charges to float64. - Completed
        -#convert 'total_charges' to float and validate change.
        - df['total_charges'] = df['total_charges'].astype(float)
        - df.dtypes
    
    _Replace 'No phone service' & 'No internet service' with 'No'. - Completed
        -df = df.replace(to_replace = 'No internet service', value = 'No')
        -df = df.replace(to_replace = 'No phone service', value = 'No')

In [None]:
# List the distinct values present in the contract_type column.
df['contract_type'].unique()

In [None]:
#check to see if there are any duplicates by 'customer_id'
# Key the check on customer_id so it actually tests customer uniqueness.
# A result containing only False means no customer appears more than once.
df.duplicated(subset='customer_id').unique()

In [None]:
# Find empty values. isna, isnull, notna and notnull did not show any matches,
# so count cells equal to a single-space string (' ') in each column instead.
df.eq(' ').sum()

In [None]:
# Replace the single-space placeholder in total_charges with 0. Passing a
# {column: value} dict together with a replacement value limits the match to
# the named column.
df = df.replace({'total_charges': ' '}, 0)
df.head()

In [None]:
# Verify that the empty values in total_charges have been replaced: re-count
# single-space cells in every column.
df.eq(' ').sum()

In [None]:
# Drop the redundant id columns (the three *_type_id codes and customer_id).
df = df.drop(
    columns=[
        'payment_type_id',
        'internet_service_type_id',
        'contract_type_id',
        'customer_id',
    ]
)
df.head(2)

In [None]:
# Dimensions (rows, columns) of the DataFrame at this point.
df.shape

In [None]:
# Cast total_charges to a floating-point column, then list every column's
# dtype to confirm the change took effect.
df['total_charges'] = df['total_charges'].astype('float64')
df.dtypes

In [None]:
# Collapse 'No internet service' into plain 'No' wherever it appears.
df = df.replace(to_replace = 'No internet service', value = 'No')
# Preview a few rows rather than rendering the entire DataFrame.
df.head()

In [None]:
# Collapse 'No phone service' into plain 'No' wherever it appears.
df = df.replace(to_replace = 'No phone service', value = 'No')
# Preview a few rows rather than rendering the entire DataFrame.
df.head()

# Encode

In [None]:
#get_dummies creates a separate df of indicator columns for the columns listed below. Cleaning for the decision tree.
# drop_first takes a single bool; True keeps one indicator per column and drops the first level.
# The multi-level type columns are left out of this call: they are fully encoded in
# dummy_df_types below, and encoding them in both frames would produce duplicated
# column names once both dummy frames are concatenated onto df.
dummy_df = pd.get_dummies(df[['dependents','phone_service','online_security','online_backup','gender','partner','multiple_lines','device_protection','tech_support','streaming_tv','streaming_movies','paperless_billing','churn']], dummy_na=False, drop_first=True)
#set 'drop_first' to 'False' so every level of the type columns below gets its own indicator column.
dummy_df_types = pd.get_dummies(df[['payment_type','internet_service_type','contract_type',]], dummy_na=False, drop_first=False)

In [None]:
# Swap the original categorical columns for their encoded versions: remove
# every column that was passed to get_dummies, then append both dummy frames
# column-wise.
df = pd.concat(
    [
        df.drop(columns=['dependents','phone_service','online_security','online_backup','payment_type','internet_service_type','contract_type','gender','partner','multiple_lines','device_protection','tech_support','streaming_tv','streaming_movies','paperless_billing','churn']),
        dummy_df,
        dummy_df_types,
    ],
    axis=1,
)
df.head()

# Split your data into train, validate, and test samples.