In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

In [None]:
# Load the provided training and testing CSV files into separate DataFrames.
training_dataset = pd.read_csv('./dataset/customer_churn_dataset-training-master.csv')
testing_dataset = pd.read_csv('./dataset/customer_churn_dataset-testing-master.csv')
# Stack both files into a single frame. ignore_index=True renumbers the rows
# from 0 so index labels coming from the two files cannot collide.
df = pd.concat([training_dataset, testing_dataset], ignore_index=True)

# Preview the first 50 rows of the combined data.
df.head(50)

In [None]:
# Remove the row labelled 199295 from the combined frame.
# NOTE(review): presumably this is a known bad (e.g. empty) record — confirm and
# record here why this specific row is excluded.
# errors='ignore' keeps the cell re-runnable: without it, executing the cell a
# second time raises KeyError because the label has already been removed.
df.drop(index=199295, inplace=True, errors='ignore')

In [None]:
# (rows, columns) of the combined frame at this point in the notebook.
df.shape

In [None]:
# Quick look at the first rows of the combined frame.
df.head()

In [None]:
# Grid of 15-bin histograms, one per numeric column, on a 20x15 inch figure.
df.hist(bins=15, figsize=(20,15))

In [None]:
# Churn against age. lineplot aggregates rows that share the same age and
# draws the aggregated estimate together with its confidence band.
sns.lineplot(data=df, x='Age', y='Churn')

In [None]:
# Perform one-hot encoding of the categorical columns.
# pd.get_dummies(df, columns=[...]) replaces every listed column with indicator
# columns named "<column>_<value>", appended after the remaining columns. This
# is the same result as calling get_dummies / join / drop once per column.
categorical_columns = ['Gender', 'Subscription Type', 'Contract Length']

# Encode only the columns that are still present, so re-running this cell after
# the originals have already been replaced does nothing instead of raising
# KeyError.
columns_to_encode = [col for col in categorical_columns if col in df.columns]
if columns_to_encode:
    df = pd.get_dummies(df, columns=columns_to_encode)

In [None]:
# Inspect the frame after one-hot encoding (new indicator columns at the end).
df.head()

In [None]:
def churn_rate_by_col(col: str, data=None):
    """Plot the number of churned customers for each distinct value of ``col``.

    Despite the function name, the bars show raw counts of rows with
    ``Churn == 1`` per value of ``col``; the counts are not divided by the
    group size, so this is not a rate.

    Args:
        col: Name of the column to group the churned rows by.
        data: DataFrame containing a ``Churn`` column and the ``col`` column.
            Defaults to the notebook-level ``df`` when omitted.

    Raises:
        KeyError: If ``col`` or ``Churn`` is not a column of the frame.
    """
    if data is None:
        data = df

    churn = data[data['Churn'] == 1]

    length = data[col].nunique()

    churn_count_by_col = churn.groupby(col).size().reset_index(name='Churn Count')

    # The figure is one inch wide per distinct value, clamped to a sane range:
    # a column without values would otherwise request a zero-width figure, and
    # a high-cardinality column would request a figure far too wide to render.
    min_width, max_width = 4, 60
    width = min(max(length, min_width), max_width)

    # Create a bar plot
    plt.figure(figsize=(width, 20))
    sns.barplot(x=col, y='Churn Count', data=churn_count_by_col)

    # Add titles and labels
    plt.title(f'Number of Churned Customers by {col}')
    plt.xlabel(col)
    plt.ylabel('Number of Churns')

    # Show the plot
    plt.show()



# Plot churned-customer counts per age. A call for "Total Spend" was tried here
# earlier and is currently disabled.
churn_rate_by_col("Age")

In [None]:
# Re-display the first rows as a reference for the column names used below.
df.head()

In [None]:
# Continuous features plotted against age, one figure per feature.

age = df['Age']
features = ["Tenure", "Usage Frequency", "Support Calls", "Payment Delays", "Total Spend", "Last Interaction"]

# Report requested names that are not columns of df instead of silently
# skipping them: a swallowed KeyError hides typos in the list above and leaves
# an empty figure behind for every skipped name.
# NOTE(review): "Payment Delays" may not match the actual column name — check
# the message printed by this cell and correct the list if needed.
missing_features = [col for col in features if col not in df.columns]
if missing_features:
    print(f"Skipping features not found in df: {missing_features}")

for col in features:
    if col in missing_features:
        continue
    fig, ax = plt.subplots()  # Create a new figure
    sns.lineplot(x=age, y=df[col], ax=ax)
    ax.set_title(f'{col} vs Age')
    plt.show()  # Display the plot
    plt.close(fig)  # Release the figure so repeated runs do not accumulate memory

In [None]:
# Draw one box plot per float64 column to make outliers visible.
numerical_columns = df.select_dtypes(include='float64').columns

for column in numerical_columns:
    # Explicit figure/axes pair; the box plot is drawn onto this axes object.
    fig, ax = plt.subplots(figsize=(8, 2))
    sns.boxplot(x=df[column], ax=ax)
    ax.set_title(f'Boxplot of {column}')
    ax.set_xlabel(column)
    plt.show()

In [None]:
# Create a QQ plot for each float64 feature to check it against a normal
# distribution.
numerical_columns = df.select_dtypes(include='float64').columns
# Exclude the label with a boolean mask. Unlike list.remove(), this does not
# raise when 'Churn' is not among the selected columns (e.g. it is not float64).
numerical_columns = numerical_columns[numerical_columns != 'Churn']
# Kept so the notebook-level name still holds the same list as before.
numerical_columns_list = list(numerical_columns)

for feature in numerical_columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    # probplot draws the sample quantiles against the theoretical normal
    # quantiles, plus a fitted reference line, onto the given axes.
    stats.probplot(df[feature], dist="norm", plot=ax)
    # Override the default title and axis labels set by probplot.
    ax.set_title(f'QQ Plot of {feature}')
    ax.set_xlabel('Theoretical Quantiles')
    ax.set_ylabel('Sample Quantiles')
    ax.grid(True)
    plt.show()
    plt.close(fig)


In the QQ plots, the points for the numerical columns do not follow the straight reference line; instead they trace a curve resembling a cube-root function. This indicates that these columns are not normally distributed. **Open question:** do they need to be transformed?


In [None]:
# Scatter every non-label column against the Churn label, one figure each.
features = df.columns[df.columns != 'Churn']

for feature in features:
    fig, ax = plt.subplots(figsize=(6, 2))
    sns.scatterplot(data=df, x=feature, y='Churn', ax=ax)
    ax.set_title(f'Scatter Plot of {feature} vs Churn')
    ax.set_xlabel(feature)
    ax.set_ylabel('Churn')
    plt.show()


In [None]:
# Split the concatenated data into 80% training / 20% testing.
from sklearn.model_selection import train_test_split

# Features are every column except the label; the label is the Churn column.
X = df.drop(columns='Churn')
y = df['Churn']

# A fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)

In [None]:
# Histograms of churn and no churn in the newly split training dataset.
y_train_named = y_train.rename('Churn')
training_data = pd.concat([X_train, y_train_named], axis=1)

features = training_data.drop(columns=['Churn']).columns

# Filter the rows by label once instead of re-filtering the whole frame for
# every feature inside the loop.
training_no_churn = training_data[training_data['Churn'] == 0]
training_churn = training_data[training_data['Churn'] == 1]

# Options shared by both histograms. Each class is drawn by its own histplot
# call with stat="density", so each class is normalised on its own.
hist_options = dict(kde=False, stat="density", bins=30, alpha=0.5)

# Figures are created only inside the loop; a figure opened before the loop
# would never receive a plot and would be rendered as an empty figure.
for feature in features:
    fig, ax = plt.subplots(figsize=(6, 3))

    sns.histplot(training_no_churn[feature], color='blue', label='No Churn', ax=ax, **hist_options)
    sns.histplot(training_churn[feature], color='red', label='Churn', ax=ax, **hist_options)

    ax.set_title(f'Distribution of {feature} for Churn vs No Churn')
    ax.set_xlabel(feature)
    ax.set_ylabel('Density')
    ax.legend()

    plt.show()
    plt.close(fig)  # Release the figure; one is created per feature

In [None]:
# Histograms of churn and no churn in the newly split testing dataset.
y_test_named = y_test.rename('Churn')
testing_data = pd.concat([X_test, y_test_named], axis=1)

features = testing_data.drop(columns=['Churn']).columns

# Filter the rows by label once instead of re-filtering the whole frame for
# every feature inside the loop.
testing_no_churn = testing_data[testing_data['Churn'] == 0]
testing_churn = testing_data[testing_data['Churn'] == 1]

# Options shared by both histograms. Each class is drawn by its own histplot
# call with stat="density", so each class is normalised on its own.
hist_options = dict(kde=False, stat="density", bins=30, alpha=0.5)

# Figures are created only inside the loop; a figure opened before the loop
# would never receive a plot and would be rendered as an empty figure.
for feature in features:
    fig, ax = plt.subplots(figsize=(6, 3))

    sns.histplot(testing_no_churn[feature], color='blue', label='No Churn', ax=ax, **hist_options)
    sns.histplot(testing_churn[feature], color='red', label='Churn', ax=ax, **hist_options)

    ax.set_title(f'Distribution of {feature} for Churn vs No Churn')
    ax.set_xlabel(feature)
    ax.set_ylabel('Density')
    ax.legend()

    plt.show()
    plt.close(fig)  # Release the figure; one is created per feature

In [None]:
# Redo the 80/20 split, this time stratified on the label so both parts keep
# the same Churn proportions. This overwrites X_train, X_test, y_train and
# y_test produced by any earlier split.
X = df.drop(columns='Churn')
y = df['Churn']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1234,
)

In [None]:
# Check class imbalance: share of each Churn value in the training labels
# (normalize=True returns proportions instead of raw counts).
y_train.value_counts(normalize=True)



In [None]:
# Histograms of churn and no churn in the newly split training dataset.
y_train_named = y_train.rename('Churn')
training_data = pd.concat([X_train, y_train_named], axis=1)

features = training_data.drop(columns=['Churn']).columns

# Filter the rows by label once instead of re-filtering the whole frame for
# every feature inside the loop.
training_no_churn = training_data[training_data['Churn'] == 0]
training_churn = training_data[training_data['Churn'] == 1]

# Options shared by both histograms. Each class is drawn by its own histplot
# call with stat="density", so each class is normalised on its own.
hist_options = dict(kde=False, stat="density", bins=30, alpha=0.5)

# Figures are created only inside the loop; a figure opened before the loop
# would never receive a plot and would be rendered as an empty figure.
for feature in features:
    fig, ax = plt.subplots(figsize=(6, 3))

    sns.histplot(training_no_churn[feature], color='blue', label='No Churn', ax=ax, **hist_options)
    sns.histplot(training_churn[feature], color='red', label='Churn', ax=ax, **hist_options)

    ax.set_title(f'Distribution of {feature} for Churn vs No Churn')
    ax.set_xlabel(feature)
    ax.set_ylabel('Density')
    ax.legend()

    plt.show()
    plt.close(fig)  # Release the figure; one is created per feature
    

In [None]:
# Histograms of churn and no churn in the newly split testing dataset.
y_test_named = y_test.rename('Churn')
testing_data = pd.concat([X_test, y_test_named], axis=1)

features = testing_data.drop(columns=['Churn']).columns

# Filter the rows by label once instead of re-filtering the whole frame for
# every feature inside the loop.
testing_no_churn = testing_data[testing_data['Churn'] == 0]
testing_churn = testing_data[testing_data['Churn'] == 1]

# Options shared by both histograms. Each class is drawn by its own histplot
# call with stat="density", so each class is normalised on its own.
hist_options = dict(kde=False, stat="density", bins=30, alpha=0.5)

# Figures are created only inside the loop; a figure opened before the loop
# would never receive a plot and would be rendered as an empty figure.
for feature in features:
    fig, ax = plt.subplots(figsize=(6, 3))

    sns.histplot(testing_no_churn[feature], color='blue', label='No Churn', ax=ax, **hist_options)
    sns.histplot(testing_churn[feature], color='red', label='Churn', ax=ax, **hist_options)

    ax.set_title(f'Distribution of {feature} for Churn vs No Churn')
    ax.set_xlabel(feature)
    ax.set_ylabel('Density')
    ax.legend()

    plt.show()
    plt.close(fig)  # Release the figure; one is created per feature