In [None]:
#Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats.mstats import winsorize
import seaborn as sns

In [None]:
# Part 1: Load the dataset and take a first look at it
df = pd.read_csv("customer_data.csv")  # Load the dataset

print(df.head(), "\n")  # Inspect the first few rows of the dataset
print(df.tail(), "\n")  # Inspect the last few rows of the dataset

# General info (missing values, number of rows, column dtypes, ...).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
df.info()
print()

# Summary statistics, rendered without column truncation
print(df.describe().to_string())

In [None]:
# Part 2: Handling missing data

# First, check for missing values
print("Missing Values Before Handling:\n")
print(df.isnull().sum())

# Drop exact duplicate rows BEFORE computing the fill values, so that repeated
# records do not weigh more than once in the mean/median used for imputation.
df = df.drop_duplicates()

# One fill value per column. Rationale carried over from the original notes:
#   Age, Tenure  -> mean   (noted as having no outliers)
#   Income       -> median (noted as having outliers)
#   SupportCalls -> median (this column is IQR-filtered for outliers in Part 3)
fill_values = {
    "Age": df["Age"].mean(),
    "Income": df["Income"].median(),
    "Tenure": df["Tenure"].mean(),
    "SupportCalls": df["SupportCalls"].median(),
}
df = df.fillna(fill_values)

# Imputation can, in principle, turn two previously different rows into equal
# ones; drop duplicates once more so the frame is still duplicate-free here.
df = df.drop_duplicates()

# Report what is left so the effect of the imputation is visible
print("\nMissing Values After Handling:\n")
print(df.isnull().sum())

In [None]:
# Part 3: Handling outliers

# A: Income (z-score filtering): keep rows whose Income lies within
#    mean +/- 2 standard deviations (bounds inclusive). For roughly normal
#    data this retains about 95% of the rows.
Income_mean = df["Income"].mean()
Income_std = df["Income"].std()

Income_low = Income_mean - 2 * Income_std
Income_high = Income_mean + 2 * Income_std
# .copy() makes the result an independent frame rather than a slice of the
# previous one, so later column assignments on `df` do not raise
# SettingWithCopyWarning.
df = df[df["Income"].between(Income_low, Income_high)].copy()


# B: SupportCalls (IQR filtering): keep rows inside the Tukey fences
#    [Q1 - 1.5*IQR, Q3 + 1.5*IQR] (bounds inclusive). The quartiles are
#    computed on the frame that has already been filtered on Income.
Q1 = df["SupportCalls"].quantile(0.25)
Q3 = df["SupportCalls"].quantile(0.75)
IQR = Q3 - Q1
SupportCalls_low = Q1 - 1.5 * IQR
SupportCalls_high = Q3 + 1.5 * IQR
df = df[df["SupportCalls"].between(SupportCalls_low, SupportCalls_high)].copy()

In [None]:
## Part 4: Normalization (Z-Score)
# Numerical attributes that get standardized
columns = ["Age", "Tenure", "Income", "SupportCalls"]

# Per-column statistics of the current (cleaned) frame
col_means = df[columns].mean()
col_stds = df[columns].std()

# z = (x - column mean) / column standard deviation, written back in place
df[columns] = df[columns].sub(col_means).div(col_stds)

In [None]:
## Part 5: Exploratory Data Analysis
# A: Univariate Analysis

# Histogram per numeric column: column name -> figure title
hist_titles = {
    "Income": "Income Distribution",
    "Tenure": "Tenure Distribution",
    "SupportCalls": "Support Calls Distribution",
    "Age": "Age Distribution",
}
for col_name, fig_title in hist_titles.items():
    plt.figure()  # each distribution goes on its own figure
    df[col_name].plot.hist(title=fig_title)

# Bar chart of category frequencies per categorical column
for col_name in ("Gender", "ProductType"):
    plt.figure()
    df[col_name].value_counts().plot.bar(color="orange", title=col_name)

# Render all six figures
plt.show()

In [None]:
## Part 5: Exploratory Data Analysis
# B: Bivariate Analysis

# Point color per churn class. This mapping was previously defined but never
# passed to the plot, so both classes were drawn in the same default color.
colors = {0: 'blue', 1: 'red'}

# Numeric features
num_features = ['Age', 'Income', 'Tenure', 'SupportCalls']

# One scatter figure per numeric feature vs ChurnStatus, colored by class.
# Assumes ChurnStatus is coded 0/1 (the radar-chart cell indexes the group
# means with .loc[0] / .loc[1]); rows with any other value are not drawn.
for feature in num_features:
    fig, ax = plt.subplots(figsize=(6, 4))

    for status, color in colors.items():
        group = df[df['ChurnStatus'] == status]
        ax.scatter(
            group[feature],
            group['ChurnStatus'],
            alpha=0.6,
            color=color,
            label=f'ChurnStatus = {status}',
        )

    ax.set_title(f'{feature} vs ChurnStatus')
    ax.set_xlabel(feature)
    ax.set_ylabel('ChurnStatus')
    ax.grid(True, linestyle='--', alpha=0.5)
    ax.legend()
    plt.show()

In [None]:
## Part 5: Exploratory Data Analysis
# C: Correlation Analysis

# Columns that enter the correlation matrix (four features plus the churn flag)
numeric_features = ['Age', 'Income', 'Tenure', 'SupportCalls', 'ChurnStatus']

# Pairwise correlations between the selected columns
corr_matrix = df.loc[:, numeric_features].corr()

# Text dump of the matrix
print("Correlation matrix:", corr_matrix, sep="\n")

# --- Heatmap drawn with plain Matplotlib ---
fig, ax = plt.subplots(figsize=(6, 5))
heat_img = ax.matshow(corr_matrix, cmap='coolwarm')

# Color scale legend for the heatmap image
plt.colorbar(heat_img)

# One tick per column/row, labelled with the feature name
tick_positions = range(len(numeric_features))
ax.set_xticks(tick_positions)
ax.set_xticklabels(numeric_features, rotation=45, ha='left')
ax.set_yticks(tick_positions)
ax.set_yticklabels(numeric_features)

plt.title("Correlation Matrix", pad=20)
plt.tight_layout()
plt.show()

In [None]:
## Part 6:
# A: Output Distribution

# Status message. The figure is only shown, never written to disk, so the
# message no longer claims that anything is being saved.
print("Generating visualizations...")

# Churn Status Distribution as a PIE CHART
plt.figure(figsize=(6, 6))

# Count churn values (0 = stayed, 1 = churned), ordered by class value
churn_counts = df['ChurnStatus'].value_counts().sort_index()

# Wedge label and color are looked up per class actually present in the data.
# A fixed two-element list would raise in plt.pie (or mislabel the wedges)
# whenever the counts do not consist of exactly the classes 0 and 1.
label_by_status = {0: 'Stayed (0)', 1: 'Churned (1)'}
color_by_status = {0: '#a6cee3', 1: '#fb9a99'}  # pastel colors
labels = [label_by_status.get(status, str(status)) for status in churn_counts.index]
wedge_colors = [color_by_status.get(status, '#cccccc') for status in churn_counts.index]

# Create the pie chart
plt.pie(
    churn_counts,
    labels=labels,
    autopct='%1.1f%%',         # show percentages
    startangle=90,             # rotate start angle
    colors=wedge_colors,
    wedgeprops={'edgecolor': 'white'}
)

# Title and formatting
plt.title('Customer Churn Status Distribution', fontsize=16)
plt.tight_layout()

plt.show()

In [None]:
## Part 6:
# B: Churn Rates


def _quartile_bins(values, labels):
    """Assign each value to one of ``len(labels)`` ordered quantile bins.

    Plain ``pd.qcut`` is tried first. It raises ``ValueError`` when heavy ties
    make two quantile edges coincide (typical for discrete columns such as
    call counts); in that case the values are ranked with ``method="first"``
    and the ranks are binned instead, which always yields distinct edges.
    Tied values that straddle a bin boundary are then split by row order.
    """
    try:
        return pd.qcut(values, q=len(labels), labels=labels)
    except ValueError:
        return pd.qcut(values.rank(method="first"), q=len(labels), labels=labels)


def _churn_rate_table(frame, row_bin, col_bin, labels):
    """Mean ChurnStatus per (row_bin, col_bin) cell as a labels x labels table."""
    return (frame
            .groupby([row_bin, col_bin], observed=True)["ChurnStatus"]
            .mean()
            .unstack(col_bin)
            .reindex(index=labels, columns=labels))  # enforce label order


# 0) Prep: clean copy; drop rows with inf/NaN in any required column
req = ["Income", "Tenure", "SupportCalls", "ChurnStatus"]
dfv = df.copy()
dfv = dfv.replace([np.inf, -np.inf], np.nan).dropna(subset=req)

# 1) Create ordered quantile bins for nicer, balanced groups
labels4 = ["Low", "Mid-Low", "Mid-High", "High"]
dfv["IncomeBin"] = _quartile_bins(dfv["Income"], labels4)
dfv["TenureBin"] = _quartile_bins(dfv["Tenure"], labels4)
dfv["SupportBin"] = _quartile_bins(dfv["SupportCalls"], labels4)

# 2) Build churn-rate tables (mean of ChurnStatus)
churn_IT = _churn_rate_table(dfv, "IncomeBin", "TenureBin", labels4)   # Income x Tenure
churn_SI = _churn_rate_table(dfv, "SupportBin", "IncomeBin", labels4)  # SupportCalls x Income

# 3) Heatmap plotting settings
cmap_custom = sns.color_palette("RdYlBu_r", as_cmap=True)  # blue=low churn, red=high churn
plots = [
    (churn_IT, "Churn Rate Heatmap: Income × Tenure", "Tenure Level", "Income Level"),
    (churn_SI, "Churn Rate Heatmap: SupportCalls × Income", "Income Level", "Support Calls Level"),
]

for tbl, title, xlabel, ylabel in plots:
    plt.figure(figsize=(8, 6))               # good readability
    ax = sns.heatmap(
        tbl,
        annot=True, fmt=".2f",
        cmap=cmap_custom,
        vmin=0, vmax=1,                      # consistent scale across both plots
        linewidths=0.6,
        square=True,                         # proportional cells
        cbar_kws={"label": "Churn Rate"}
    )

    # Flip Y so "High" is at the top
    ax.invert_yaxis()

    # Titles & labels
    plt.title(title, fontsize=14, fontweight="bold", pad=14)
    plt.xlabel(xlabel, fontsize=12)
    plt.ylabel(ylabel, fontsize=12)

    # Tick label formatting
    plt.xticks(rotation=30, ha="right")
    plt.yticks(rotation=0)

    plt.tight_layout()
    plt.show()

In [None]:
## Part 6:
# C: Radar Visualization

# Features shown on the radar (already z-scored in Part 4)
features = ['Income', 'Tenure', 'SupportCalls']

# Use the existing scaled data directly (don't z-score again)
df_scaled = df.copy()

# Min-max rescale to the 0-1 range, for plotting only
plot_df = df_scaled.copy()
feat_min = df_scaled[features].min()
feat_range = df_scaled[features].max() - feat_min
# A constant column has range 0; divide by 1 instead so it maps to 0.0 rather
# than turning into NaN through a 0/0 division.
feat_range = feat_range.replace(0, 1)
plot_df[features] = (df_scaled[features] - feat_min) / feat_range

# Mean feature level per churn group (0 = stayed, 1 = churned)
means = plot_df.groupby('ChurnStatus')[features].mean()
vals0, vals1 = means.loc[0].tolist(), means.loc[1].tolist()

# Radar geometry: one spoke per feature; repeat the first point to close the polygon
labels = features
angles = np.linspace(0, 2*np.pi, len(labels), endpoint=False).tolist()
angles += angles[:1]
v0, v1 = vals0 + vals0[:1], vals1 + vals1[:1]

# Radial axis: at least 0.6 as before, but widened in 0.1 steps when a group
# mean is larger, so the polygons are never clipped by a fixed limit.
peak = max(v0 + v1)
r_max = max(0.6, float(np.ceil(peak * 10) / 10))
r_ticks = np.arange(1, int(round(r_max * 10)) + 1) / 10

# Plotting
plt.close('all')
fig, ax = plt.subplots(figsize=(7,7), subplot_kw=dict(polar=True))

ax.plot(angles, v0, linewidth=2, label='Stayed (0)', color='blue')
ax.fill(angles, v0, alpha=0.25, color='blue')
ax.plot(angles, v1, linewidth=2, label='Churned (1)', color='red')
ax.fill(angles, v1, alpha=0.25, color='red')

ax.set_thetagrids(np.degrees(angles[:-1]), labels)
ax.set_ylim(0, r_max)
ax.set_yticks(r_ticks)
ax.set_yticklabels([f"{t:.1f}" for t in r_ticks])
ax.set_title('Radar Chart: Average Feature Levels by ChurnStatus')
ax.legend(loc='upper right')
plt.show()


In [None]:
## Part 6:
# D: Pair Plot showing churn pattern: low Income, low Tenure, high SupportCalls → Churn=1

# Columns drawn in the grid; ChurnStatus is only used for coloring
pair_features = ["Income", "Tenure", "SupportCalls", "ChurnStatus"]

# Plot settings collected in one place
pairplot_options = dict(
    hue="ChurnStatus",                  # color points by churn class
    diag_kind="kde",                    # density curves on the diagonal
    plot_kws=dict(alpha=0.6, s=40),     # transparency and marker size
    palette="coolwarm",                 # 0 = blue (Stayed), 1 = red (Churned)
)

sns.pairplot(df.loc[:, pair_features], **pairplot_options)

# Figure-level title, nudged above the top row of panels
plt.suptitle("Pairwise Relationships of Income, Tenure, and SupportCalls by ChurnStatus", y=1.02)
plt.show()
