In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Notebook-wide settings (tune here)
alpha = 0.05  # NOTE(review): presumably a significance level; not used in the cells visible here — confirm
pd.set_option('display.max_columns', None)  # None removes the column display limit
pd.set_option('display.max_rows', 100)  # show at most 100 rows when a frame is displayed

In [2]:
#Importing the data from GitHub
# NOTE(review): pandas' default NA parsing may treat the literal text "None" as
# missing. If any raw column uses "None" as a real answer (e.g. "no internet
# service"), those rows will look like missing data further down — verify
# against the raw CSV before imputing.
CHURN_DATA_URL = "https://raw.githubusercontent.com/johnson80245/python/refs/heads/main/churn_raw_data.csv"

churn_raw = pd.read_csv(CHURN_DATA_URL)

# All cleaning happens on a copy so the downloaded frame stays as loaded.
churn = churn_raw.copy()

In [3]:
#Renaming the columns to correct survey response
# Raw CamelCase headers become snake_case, and the item1..item8 survey
# questions get descriptive names.
# NOTE(review): the "TimeZone" entry yields "time_zone", yet the later cat_cols
# list refers to "timezone" — confirm the raw header's casing and which name is
# intended.
column_renames = {
    "CaseOrder": "case_order",
    "TimeZone": "time_zone",
    "InternetService": "internet_service",
    "OnlineSecurity": "online_security",
    "OnlineBackup": "online_backup",
    "DeviceProtection": "device_protections",
    "TechSupport": "tech_support",
    "StreamingTV": "streaming_tv",
    "StreamingMovies": "streaming_movies",
    "PaperlessBilling": "paperless_billing",
    "PaymentMethod": "payment_method",
    "MonthlyCharge": "monthly_charge",
    # Survey responses
    "item1": "timely_response",
    "item2": "timely_fixes",
    "item3": "timely_replacements",
    "item4": "reliability",
    "item5": "options",
    "item6": "respectful_responses",
    "item7": "courteous_exchange",
    "item8": "evidence_of_active_listening",
}

churn = churn.rename(columns=column_renames)

In [4]:
#Cleaning column names:
def to_clean(val):
    """Return a normalised column label.

    Surrounding whitespace is stripped, the text is lower-cased, and every
    single space character is turned into an underscore.
    """
    pieces = val.strip().lower().split(" ")
    return "_".join(pieces)

# Passing the function makes rename call to_clean on each column label.
churn = churn.rename(columns = to_clean)

In [5]:
#Changing mapping of true/false  & yes/no
# Encode the textual flags as 1/0 so they can be aggregated numerically later.

bool_map = {'False': 0, 'True': 1, 'Yes':1,'No':0}
churn_map = {'Yes':1,'No':0}
bool_cols = ['techie', 'port_modem', 'tablet', 'phone', 'multiple',
           'online_security', 'online_backup', 'device_protections',
           'tech_support', 'streaming_tv', 'streaming_movies', 'paperless_billing']

# A single replace over the whole column subset is enough. The previous
# per-column loop never used its loop variable, so it just repeated this exact
# whole-subset replace once per column (and once more afterwards).
# NOTE(review): the saved cell output echoes these two statements, which
# suggests pandas emitted a warning here (likely about silent downcasting in
# replace) — confirm, and consider an explicit numeric cast if so.
churn[bool_cols] = churn[bool_cols].replace(bool_map)
churn['churn'] = churn['churn'].replace(churn_map)



  churn[bool_cols] = churn[bool_cols].replace(bool_map)
  churn['churn'] = churn['churn'].replace(churn_map)


In [6]:
# How to handle outliers:

def fix_outliers_iqr(df, cols, method='cap'):
    """Treat IQR outliers in the given columns and return the resulting frame.

    For each column the fences are Q1 - 1.5*IQR and Q3 + 1.5*IQR, computed
    from the frame as it stands when that column is reached (so with
    method='remove', later columns see the already-filtered rows).

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is never modified; the function works on a copy.
    cols : iterable of str
        Names of the numeric columns to process.
    method : {'cap', 'remove', 'median', 'flag'}, default 'cap'
        'remove' - keep only rows inside the fences (rows whose value is NaN
                   fail the range test and are dropped as well).
        'cap'    - clip values to the lower/upper fence.
        'median' - replace values outside the fences with the column median.
        'flag'   - add a 0/1 column named '<col>_outlier'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the chosen treatment applied.

    Raises
    ------
    ValueError
        If `method` is not one of the supported names (previously an unknown
        method silently returned the data unchanged).
    """
    valid_methods = ('remove', 'cap', 'median', 'flag')
    if method not in valid_methods:
        raise ValueError(
            f"method must be one of {valid_methods}, got {method!r}"
        )

    # Work on a copy: every method now leaves the caller's frame untouched,
    # instead of only 'remove' doing so.
    df = df.copy()

    for col in cols:
        #Setting IQR
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        # NaN compares False on both sides, so missing values are never outliers.
        is_outlier = (df[col] < lower) | (df[col] > upper)

        #Remove outliers
        if method == 'remove':
            df = df[(df[col] >= lower) & (df[col] <= upper)]
        #Cap at lower and upper IQR fences
        elif method == 'cap':
            df[col] = df[col].clip(lower=lower, upper=upper)
        #Use the median to replace those outside the upper/lower
        elif method == 'median':
            df[col] = df[col].mask(is_outlier, df[col].median())
        #Flag outliers for a later look
        else:
            df[f'{col}_outlier'] = is_outlier.astype(int)

    return df



# Columns whose out-of-fence values are capped by fix_outliers_iqr.
outlier_col = [
    'income',
    'tenure',
    'monthly_charge',
    'bandwidth_gb_year',
]

churn = fix_outliers_iqr(df=churn, cols=outlier_col, method='cap')


In [7]:
#Discovering missing values:
# NOTE(review): this expression is not the last line of the cell, so a notebook
# will not display its result — move it to its own cell (or print it) to see
# the per-column null counts.
churn.isnull().sum()
#Missing values found in:
    # - children, age, income, techie, internet service,
    #   phone, tech support, tenure, bandwidth

#Ideas for how to handle each column:
#Income - look at state, age, segments
#Children - look at age-group segments: if older than x assume no kids, if in range y-z use the segment
    # Could also look at children by state
#Internet service - if they stream TV or movies then true, else false
#Techie - look at age and whether they have internet
#Internet service - look at the most common value, or look by state/population
#Phone - look at frequency and apply across the missing rows
#Tech support - look at internet; look at frequency and apply across the missing rows
#Tenure - look at services, bandwidth, etc.
#Bandwidth - look at service

#Replacement for income

#Order of work:
#Handle outliers
#Handle missing age values first.
#Then create segments on age
#Then convert to categorical


#Age missing values:
# Filled with the overall median age
churn['age'] = churn.age.fillna(churn.age.median())


#Create age group segments to apply to churn
def segment_age_group(age):
    """Return the age-bracket label for a single age.

    Parameters
    ----------
    age : int or float
        Age in years; may be NaN/None.

    Returns
    -------
    str
        'Unknown' for missing ages, 'Under 18' below 18, '18-29' for ages
        from 18 up to (but not including) 30, otherwise the decade bracket
        such as '30-39' or '80-89'.
    """
    if pd.isna(age):
        return 'Unknown'
    if age < 18:
        return 'Under 18'
    # Compare against 30 rather than `<= 29`: a fractional age such as 29.5
    # (possible after median imputation) would otherwise fall through to the
    # decade logic and get a stray '20-29' label.
    if age < 30:
        return '18-29'
    start = int(age // 10) * 10
    return f"{start}-{start + 9}"

# Label every customer with an age bracket.
churn['age_group'] = churn['age'].map(segment_age_group)

# Subset holding the grouping key plus every column that still has gaps;
# the per-group fill values below are computed from this selection.
missing_values = churn[['age_group', 'children', 'income', 'techie',
                        'internet_service', 'phone', 'tech_support',
                        'tenure', 'bandwidth_gb_year']]


#Replacing missing values with a per-age-group statistic
def _first_mode(values):
    """Return the first reported mode of `values`, or NaN if there is none.

    Guards the all-null group case, where `mode()` is empty and `.iloc[0]`
    would raise IndexError.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# Column -> statistic used to fill its gaps within each age group.
# Continuous/count columns keep the median. The 0/1 flags (techie, phone,
# tech_support) use the most frequent value instead: the median of a binary
# column is 0.5 when a group is evenly split, which is not a valid flag.
group_fill_rules = {
    'income': 'median',
    'children': 'median',
    'techie': _first_mode,
    'internet_service': _first_mode,
    'phone': _first_mode,
    'tech_support': _first_mode,
    'tenure': 'median',
    'bandwidth_gb_year': 'median',
}

for fill_col, fill_rule in group_fill_rules.items():
    # One value per age group, looked up for each row via its age_group label.
    group_values = missing_values.groupby('age_group')[fill_col].agg(fill_rule)
    churn[fill_col] = churn[fill_col].fillna(churn['age_group'].map(group_values))


In [8]:
def univar(df,val):
    """Show a univariate summary for column `val` of `df`.

    Draws a histogram, then a horizontal notched box plot (outliers as red
    circles, the mean as a blue diamond), and prints the rounded `describe()`
    table followed by the first mode.
    """
    print("The Information Below is For:", val)

    series = df[val]

    # Distribution shape
    plt.hist(series)
    plt.show()

    # Box plot with highlighted fliers and mean marker
    flier_style = {'marker': 'o', 'markerfacecolor': 'red', 'markersize': 6}
    mean_style = {'marker': 'D', 'markerfacecolor': 'blue', 'markersize': 6}
    box_options = dict(
        kind='box',
        vert=False,
        flierprops=flier_style,
        showmeans=True,
        meanprops=mean_style,
        notch=True,
    )
    series.plot(**box_options)
    plt.show()

    # Numeric summary
    summary = series.describe().round(3)
    print(summary)

    print("-------------------------------")
    first_mode = series.mode()[0]
    print("The mode is:", first_mode)

    print("____________________________________________")

In [9]:
#Cast as categorical data

cat_cols = [
    # identifiers and location
    'case_order', 'customer_id', 'interaction', 'city', 'state', 'county',
    'area', 'timezone',
    # demographics
    'job', 'education', 'employment', 'marital', 'gender',
    # account and service flags
    'churn', 'techie', 'contract', 'port_modem', 'tablet',
    'internet_service', 'phone', 'multiple', 'online_security',
    'online_backup', 'device_protections', 'tech_support', 'streaming_tv',
    'streaming_movies', 'paperless_billing', 'payment_method',
    # survey responses
    'timely_response', 'timely_fixes', 'timely_replacements', 'reliability',
    'options', 'respectful_responses', 'courteous_exchange',
    'evidence_of_active_listening',
]

# Convert all listed columns in a single astype call.
churn = churn.astype({name: 'category' for name in cat_cols})

In [11]:
#Describe all transformed
# Transposed so each described column is a row and the statistics are columns.
churn.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
zip,10000.0,49153.3196,27532.196108,601.0,26292.5,48869.5,71866.5,99929.0
lat,10000.0,38.757567,5.437389,17.96612,35.341828,39.3958,42.106908,70.64066
lng,10000.0,-90.782536,15.156142,-171.68815,-97.082812,-87.9188,-80.088745,-65.66785
population,10000.0,9756.5624,14432.698671,0.0,738.0,2910.5,13168.0,111850.0
children,10000.0,1.9065,1.910006,0.0,1.0,1.0,3.0,10.0
age,10000.0,53.2075,18.003457,18.0,41.0,53.0,65.0,89.0
income,10000.0,37668.877636,22500.198921,740.66,23660.79,32658.645,45504.1925,104752.70375
outage_sec_perweek,10000.0,11.452955,7.025921,-1.348571,8.054362,10.202896,12.487644,47.04928
email,10000.0,12.016,3.025898,1.0,10.0,12.0,14.0,23.0
contacts,10000.0,0.9942,0.988466,0.0,0.0,1.0,2.0,7.0


In [None]:
#Fields going to use:
# Narrow the frame to the columns kept for the analysis, in this order.
churn = churn.loc[:, [
    'customer_id', 'state', 'area', 'children', 'age',
    'employment', 'income', 'marital', 'gender', 'churn',
    'outage_sec_perweek', 'yearly_equip_failure', 'contract', 'internet_service',
    'tech_support', 'streaming_tv', 'streaming_movies', 'tenure',
    'monthly_charge', 'bandwidth_gb_year',
    'timely_response', 'timely_fixes', 'timely_replacements', 'reliability',
    'options', 'respectful_responses', 'courteous_exchange',
    'evidence_of_active_listening',
    'age_group',
]]


In [41]:
# Mean monthly charge for each state, split by churn flag.
# NOTE(review): the name `area_failure` does not match what is computed here
# (monthly_charge by state/churn); kept unchanged in case other cells use it.
# `state` and `churn` were cast to category earlier, and the saved output echoes
# this line as a warning. Passing `observed` explicitly avoids relying on the
# deprecated default; False keeps the result identical to the current output.
area_failure = churn.groupby(['state','churn'], observed=False)[['monthly_charge']].agg('mean')
area_failure.head(10)

  area_failure = churn.groupby(['state','churn'])[['monthly_charge']].agg('mean')


Unnamed: 0_level_0,Unnamed: 1_level_0,monthly_charge
state,churn,Unnamed: 2_level_1
AK,0,166.663482
AK,1,200.023618
AL,0,167.768572
AL,1,188.217939
AR,0,170.277396
AR,1,201.926013
AZ,0,159.562344
AZ,1,206.874542
CA,0,162.38294
CA,1,195.092532
