![HolyChurninityLogo](Trinity-overlay-overlay-overlay.jpg)

![HolyChurninityLogo2](ProfLOGO.png)

# Swan Teleco _Churn Prediction and Risk Management_

### Summary

The aims of this project are find key predictors in _Swan Teleco's_ customer churn to implement business decisions for the Customer Retention Marketing team.

Our objective aims are to:
- Find customers most at risk to churning
- Identify key data points that indicate a customers risk of churning, deducing factors to incentivise
- Quantify the churn risk for all remaining customers

In [2]:
import pandas as pd
import numpy as np

import matplotlib as plt
import seaborn as sns

In [13]:
raw_data = pd.read_excel('/Users/mylesjauncey/Documents/GitHub/The-Holy-Churnity-Father-Son-and-Departed-User-/1 - Project Data.xlsx')

FileNotFoundError: [Errno 2] No such file or directory: '/Users/mylesjauncey/Documents/GitHub/The-Holy-Churnity-Father-Son-and-Departed-User-/1 - Project Data.xlsx'

In [6]:
raw_data.head()

Unnamed: 0,CustomerID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,...,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Reason
0,3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,Male,...,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,Competitor made better offer
1,9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.30742,Female,...,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,Moved
2,9305-CDSKC,1,United States,California,Los Angeles,90006,"34.048013, -118.293953",34.048013,-118.293953,Female,...,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes,1,Moved
3,7892-POOKP,1,United States,California,Los Angeles,90010,"34.062125, -118.315709",34.062125,-118.315709,Female,...,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes,1,Moved
4,0280-XJGEX,1,United States,California,Los Angeles,90015,"34.039224, -118.266293",34.039224,-118.266293,Male,...,Yes,Yes,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,Yes,1,Competitor had better devices


In [9]:
def TotalChargesEst(df):
    df['total_charges'] = pd.to_numeric(df['total_charges'].replace(' ', np.nan), errors='coerce')

    changed_indices = df[df['total_charges'].isna()].index

    calculated_total = df['monthly_charges']*df['tenure_months']
    df['total_charges'] = df['total_charges'].fillna(calculated_total)

    return df, changed_indices

In [10]:
def BoolMapping(df, cols):
    for col in cols:
        if col in ['senior_citizen', 'partner', 'dependents', 'phone_service', 'paper_billing']:
            df[col] = df[col].map({'Yes': 1, 'No': 0})
        if col == 'gender':
            df['is_male'] = df[col].map({'Male': 1, 'Female': 0})
            df.drop(columns = 'gender', inplace=True)
            cols = ['is_male' if c == 'gender' else c for c in cols]
    return df, cols

In [11]:
def Cleaning(rawdf, catcol, boolcol, stringcol, floatcol,
             dropcol):
    '''
    This function aims to clean all data as apart of the data pipeline, to prepare for modelling.
    Key points this function targets:
    -Normalising Column names
    -Fill any missing Data
    -Change types of columns
    -Drop any columns
    -Return a numeric and bool dataframe
    '''
    df = rawdf.copy()

    #Change Column Names
    df.columns = df.columns.str.strip().str.replace(' ',  '_').str.lower()

    #FillMissingData Section
    df, est_indicies = TotalChargesEst(df)
    df['churn_reason'].fillna('No Reason', inplace=True)

    #ChangeType Section

    df[catcol] = df[catcol].astype('category')
    #Bools
    df, boolcol_altered = BoolMapping(df, cols = boolcol)
    df[boolcol_altered] = df[boolcol_altered].astype('bool')
    df[stringcol] = df[stringcol].astype('string')
    df[floatcol] = df[floatcol].astype('float')

    #DropColumns
    df.drop(columns=dropcol, inplace=True)

    dfnumeric = df.copy()

    dfnumeric[boolcol_altered] = df[boolcol_altered].astype('int')

    return df, dfnumeric

In [12]:
CatCol = ['city', 'multiple_lines', 'internet_service', 'online_security', 'online_backup', 'device_protection', 'tech_support', 'streaming_tv', 'contract', 'paperless_billing', 'payment_method', 'churn_reason']
BoolCol = ['gender', 'senior_citizen', 'partner', 'dependents', 'phone_service', 'churn_value']
StringCol = ['customerid']
FloatCol = ['total_charges']
DropCol = ['country', 'count', 'churn_label', 'lat_long', 'state']

cleandf, cleannumericdf = Cleaning(raw_data, catcol=CatCol, boolcol=BoolCol, stringcol=StringCol, floatcol=FloatCol, dropcol=DropCol)

NameError: name 'raw_data' is not defined

In [3]:
def bucketise_feature(df :pd.DataFrame, feature :str, bins :list) -> pd.DataFrame:
    df = df.copy()
    labels = [i for i in range(1, len(bins))]
    df[f'{feature}_bucketed'] = pd.cut(df[feature], bins, labels=labels)
    return df

In [14]:
def feature_eng(df):
    df = df.copy()
    # Bucketising
    df = bucketise_feature(df, 'total_charges', [x for x in range(0, 9000, 500)])
    df = bucketise_feature(df, 'monthly_charges', [x for x in range(0, 130, 20)])

    # Make id the  index
    df.set_index('customerid', inplace=True)

    # Only keep second and thrid numbers in zipcode
    df = df.zip_code.apply(lambda x: int(str(x)[1:3]))
    