In [1]:
#libraries
import acquire
import explore
import prepare
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

#sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Planning

### Project description and Goals

### Working through the pipeline

### Data dictionary

m2m = month-to-month

### Starting hypothesis

# Data Acquisition

### SQL code used by acquire.py to acquire the data

            SELECT *
            FROM customers
            RIGHT JOIN contract_types USING(contract_type_id)
            RIGHT JOIN internet_service_types USING(internet_service_type_id)
            RIGHT JOIN payment_types USING(payment_type_id);

### Notes:

    - Data acquired from the acquire.py module
    - Data summarization
    - Plot distributions

In [2]:
# Acquire the raw Telco data through the project-local acquire module
# (presumably this runs the SQL query documented above — confirm in acquire.py).
df = acquire.get_telco_data()
# Preview the first two rows as a quick sanity check of the load.
df.head(2)

Unnamed: 0,payment_type_id,payment_type,internet_service_type_id,internet_service_type,contract_type_id,contract_type,customer_id,gender,senior_citizen,partner,...,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn
0,1,Electronic check,1,DSL,1,Month-to-month,0015-UOCOJ,Female,1,No,...,Yes,No,No,No,No,No,Yes,48.2,340.35,No
1,1,Electronic check,1,DSL,1,Month-to-month,0023-HGHWL,Male,1,No,...,No,No,No,No,No,No,Yes,25.1,25.1,Yes


# Data preparation

### Data prepared using the prepare.py module. The prepare module provides functions that do the following:
    - Splits data for train, validate, test
    - Handles missing values
    - Handles erroneous and outlier data
    - Encodes necessary variables
    - Creates necessary features

### Data prep notes:
    - Explore missing values and document takeaways/action plans
    - Explore data types and adapt types or data values as needed 
    - Create necessary new features
    - Prep imported from prepare.py

In [3]:
# Clean the raw data with the project-local prepare module.
# NOTE(review): this rebinds `df` to the cleaned frame, so the raw frame is no
# longer reachable and re-running this cell alone would pass already-cleaned
# data to clean_data — re-run the acquire cell first if this must be repeated.
df = prepare.clean_data(df)
# Preview one row of the cleaned frame.
df.head(1)

Unnamed: 0,senior_citizen,tenure,monthly_charges,total_charges,bank_transfer,credit_card,e_check,check,dsl,fiber,...,two_year_contract,is_male,partner,multiple_lines,device_protection,tech_support,streaming_tv,streaming_movies,paperless,churned
0,1,7,48.2,340.35,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [4]:
# Split the data into train, validate, and test sets.
# 'churned' is passed as the second argument — presumably the target column to
# stratify on; NOTE(review): confirm in prepare.py, and confirm the split uses
# a fixed random seed so the shapes and results below are reproducible.
train, validate, test = prepare.train_validate_test_split(df, 'churned')

In [6]:
# Sanity-check the split: show (rows, columns) for train, validate, and test
# on a single line, separated by spaces.
print(*(subset.shape for subset in (train, validate, test)))

(3943, 34) (1691, 34) (1409, 34)


In [7]:
# Separate each split (train, validate, test) into a feature frame X_* and a
# target series y_*: y_* holds only the "churned" column, X_* holds every
# other column.
X_train, y_train = train.drop(columns=["churned"]), train["churned"]
X_validate, y_validate = validate.drop(columns=["churned"]), validate["churned"]
X_test, y_test = test.drop(columns=["churned"]), test["churned"]

# Exploration & Analysis

### Analysis notes:
    - Answer and document the initial hypotheses using a minimum of two statistical tests.
    - Visualizations
    - Conclusion and takeaway summary

### Exploration:
    - If a group is identified by tenure, is there a cohort or cohorts who have a higher rate of churn than other cohorts?
    - Are there features that indicate a higher propensity to churn?
    - Is there a price threshold for specific services where the likelihood of churn increases once price for those services goes past that point? If so, what is that point for what service(s)?
    - If we looked at churn rate for month-to-month customers after the 12th month and that of 1-year contract customers after the 12th month, are those rates comparable?
    - Controlling for services (phone_id, internet_service_type_id, online_security_backup, device_protection, tech_support, and contract_type_id), is the mean monthly_charges of those who have churned significantly different from that of those who have not churned? (Use a t-test to answer this.)
    - How much of monthly_charges can be explained by internet_service_type?
    - How much of monthly_charges can be explained by internet_service_type + phone_service_type (0, 1, or multiple lines)?

# Modeling and Evaluation

### Notes:
    - Document baseline accuracy using three different models.
    - Train (fit, transform, evaluate) multiple models, varying the algorithm and/or hyperparameters 
    - Compare evaluation metrics across all the models
    - (Optional) Remove variables that provide limited to no additional information
    - Apply the best model to the test data.
    - Final test model of out-of-sample data (the testing dataset)
    - Summarize, interpret, and document model performance

# Delivery

    - Summary of findings
    - Analysis
    - Conclusion including takeaways and recommendations