Note: The following install commands may not be needed if these packages are already installed in your environment:

! pip install pandas

! pip install numpy

! pip install seaborn

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

In [None]:
# Read the training CSV into the working DataFrame
csv_path = './dataset/customer_churn_dataset-training-master.csv'
df = pd.read_csv(csv_path)

Gender, Subscription Type, and Contract Length have the `object` dtype, so they will need to be converted to numerical values.

In [None]:
# Count the missing values in each column
nan_count = df.isna().sum(axis=0)
nan_count

In [None]:
# Count the missing values in each row
row_nan_count = df.isna().sum(axis=1)
row_nan_count

Looking through the dataset, we found that row 199295 has missing values in every column. Therefore, we can remove this row.

In [None]:
# Display the row at integer position 199295 (.iloc selects by position, not by index label)
df.iloc[199295]

In [None]:
#Drop row 199295
# Rebind df instead of dropping in place, and ignore a missing label, so that
# re-running this cell is a no-op rather than a KeyError.
df = df.drop(index=199295, errors='ignore')

In [None]:
#Ensure that row 199295 was removed
# .iloc is positional: once a row has been dropped, df.iloc[199295] simply
# returns whichever row now sits at that position, so it cannot confirm the
# removal. Check the index *label* instead.
assert 199295 not in df.index, 'Row 199295 should have been dropped'

# Show the neighbouring labels; 199295 should be absent from this slice.
df.loc[199294:199296]

In [None]:
# Recount the missing values in each column
nan_count = df.isna().sum(axis=0)
nan_count

In [None]:
# Show the distinct values of each object-typed column
for object_column in ('Gender', 'Subscription Type', 'Contract Length'):
    print(df[object_column].unique())


In [None]:
# One-hot encode the three categorical columns.
# Build the indicator frames first, then swap them in for the source columns
# in a single concat (indicator columns are appended in the same order).
df_gender = pd.get_dummies(df['Gender'], prefix='Gender')
df_subscription_type = pd.get_dummies(df['Subscription Type'], prefix='Subscription Type')
df_contract_length = pd.get_dummies(df['Contract Length'], prefix='Contract Length')

df = pd.concat(
    [
        df.drop(columns=['Gender', 'Subscription Type', 'Contract Length']),
        df_gender,
        df_subscription_type,
        df_contract_length,
    ],
    axis=1,
)

In [None]:

# Histogram of every column, 15 bins each, on one large figure
df.hist(figsize=(20, 15), bins=15)

In [None]:
# Line plot of Churn against Age
sns.lineplot(data=df, x='Age', y='Churn')

In [None]:
# List the column labels currently in df
df.columns

In [None]:
# Recheck the data type of each column in df
df.dtypes

In [None]:
# Correlation of every column with the label, strongest positive first
corr_matrix = df.corr().round(decimals=5)
corrs = corr_matrix.loc[:, 'Churn']
corrs_sorted = corrs.sort_values(ascending=False)
corrs_sorted

In [None]:
# Summary statistics for df
df.describe()

In [None]:
#Visualize the top two correlated features
SAMPLE_SIZE = 30000  # plot a sample of at most this many rows instead of the full frame
SAMPLE_SEED = 42     # fixed seed so the same rows are drawn on every run

# corrs_sorted is sorted in descending order, so its first entry is 'Churn'
# itself (correlation 1.0). Remove the label before taking the first two
# entries; the previous positional slice [2:4] skipped the most correlated
# feature.
top_two_corr = list(corrs_sorted.drop('Churn').index[:2])

# Cap n at the number of rows so sampling cannot fail on a smaller frame.
df_sample = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)
df_corrs_sample = df_sample[top_two_corr + ['Churn']].copy()
sns.pairplot(data=df_corrs_sample, kind='kde', corner=True)
#ASK TA about this

In [None]:
# Keep only the rows with Age <= 25 and plot a histogram of their Churn values
filter_df = df.loc[df['Age'].le(25)]
filter_df.hist(column='Churn')

Note (for the pairplot above): we had to plot a sample because the whole dataset was too large and produced no output.

In [None]:
# Draw one boxplot per float64 column to help spot outliers
numerical_columns = df.select_dtypes(include=['float64']).columns

for column in numerical_columns:
    fig, ax = plt.subplots(figsize=(8, 2))
    sns.boxplot(x=df[column], ax=ax)
    ax.set_title(f'Boxplot of {column}')
    ax.set_xlabel(column)
    plt.show()

In [None]:
# Scatter each non-label column against the label
features = df.columns[~df.columns.isin(['Churn'])]

for feature in features:
    fig, ax = plt.subplots(figsize=(6, 2))
    sns.scatterplot(x=df[feature], y=df['Churn'], ax=ax)
    ax.set_title(f'Scatter Plot of {feature} vs Churn')
    ax.set_xlabel(feature)
    ax.set_ylabel('Churn')
    plt.show()


In [None]:
# QQ plot of each float64 column (label excluded) against a normal distribution
numerical_columns_list = df.select_dtypes(include=['float64']).columns.tolist()
numerical_columns_list.remove('Churn')
numerical_columns = pd.Index(numerical_columns_list)

for feature in numerical_columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    stats.probplot(df[feature], dist="norm", plot=ax)
    # Replace the default probplot title and axis labels
    ax.set_title(f'QQ Plot of {feature}')
    ax.set_xlabel('Theoretical Quantiles')
    ax.set_ylabel('Sample Quantiles')
    ax.grid(True)
    plt.show()


In the QQ plots, the points for the numerical columns follow a cube-root-shaped curve rather than a straight line. This indicates that the columns are not normally distributed and may need to be transformed (to be confirmed).