In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Apply the seaborn "whitegrid" style to all subsequent plots.
# `sns.set` is only an alias of `set_theme` (the preferred interface), so the
# theme is set through `set_theme` directly; the arguments are unchanged.
sns.set_theme(style="whitegrid")

In [3]:
# Read the four CSV files from data/: the feature/target pair used for
# training and the feature/target pair used for testing.
train_paths = ("data/X_train_sm.csv", "data/y_train_sm.csv")
test_paths = ("data/X_test.csv", "data/y_test.csv")

# Files are read in the listed order: features first, then the target.
X_train, y_train = map(pd.read_csv, train_paths)
X_test, y_test = map(pd.read_csv, test_paths)

In [6]:
# Place the training features and the training target side by side
# (column-wise) so the EDA cells can work from a single frame.
# NOTE(review): concat matches rows on index labels — assumes both CSVs were
# written in the same row order; confirm.
train_frames = (X_train, y_train)
df_train = pd.concat(train_frames, axis="columns")

In [5]:
# Check the structure of the dataset: shape, per-column info, first rows
print("Training Dataset Shape:", df_train.shape)
print("\nTraining Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df_train.info()
print("\nTraining Dataset Head:")
print(df_train.head())

Training Dataset Shape: (8330, 46)

Training Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8330 entries, 0 to 8329
Data columns (total 46 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Customer ID                        8330 non-null   float64
 1   Gender                             8330 non-null   float64
 2   Age                                8330 non-null   float64
 3   Under 30                           8330 non-null   float64
 4   Senior Citizen                     8330 non-null   float64
 5   Married                            8330 non-null   float64
 6   Dependents                         8330 non-null   float64
 7   Number of Dependents               8330 non-null   float64
 8   Country                            8330 non-null   float64
 9   State                              8330 non-null   float64
 10  City                               8330 non-null   float64
 1

In [9]:
# Treat every column with at most 10 distinct (non-null) values as categorical.
# The counts are computed for all columns at once and then filtered, which
# keeps the original column order.
distinct_counts = df_train.nunique()
categorical_cols = distinct_counts[distinct_counts <= 10].index.tolist()

print("Categorical columns based on unique values:", categorical_cols, sep="\n")


Categorical columns based on unique values:
['Gender', 'Under 30', 'Senior Citizen', 'Married', 'Dependents', 'Number of Dependents', 'Country', 'State', 'Quarter', 'Referred a Friend', 'Phone Service', 'Multiple Lines', 'Internet Service', 'Online Security', 'Online Backup', 'Device Protection Plan', 'Premium Tech Support', 'Streaming TV', 'Streaming Movies', 'Streaming Music', 'Unlimited Data', 'Contract', 'Paperless Billing', 'Payment Method', 'Satisfaction Score', 'Customer Status', 'Churn']


In [11]:
# Print descriptive statistics for the low-cardinality columns, or a notice
# when the list of such columns is empty.
if categorical_cols:
    print("\nCategorical Features Summary:")
    categorical_summary = df_train[categorical_cols].describe()
    print(categorical_summary)
else:
    print("\nNo categorical columns to summarize — dataset is fully numeric after preprocessing.")



Categorical Features Summary:
             Gender      Under 30  Senior Citizen       Married    Dependents  \
count  8.330000e+03  8.330000e+03    8.330000e+03  8.330000e+03  8.330000e+03   
mean   4.947356e-17  8.529925e-19    4.734108e-17  1.398908e-16 -7.506334e-17   
std    1.000060e+00  1.000060e+00    1.000060e+00  1.000060e+00  1.000060e+00   
min   -8.514663e-01 -4.098479e-01   -3.855498e-01 -7.669284e-01 -4.315921e-01   
25%   -8.514663e-01 -4.098479e-01   -3.855498e-01 -7.669284e-01 -4.315921e-01   
50%   -8.514663e-01 -4.098479e-01   -3.855498e-01 -7.669284e-01 -4.315921e-01   
75%    1.174445e+00 -4.098479e-01   -3.855498e-01  1.303903e+00 -4.315921e-01   
max    1.174445e+00  2.439929e+00    2.593699e+00  1.303903e+00  2.317003e+00   

       Number of Dependents  Country   State  Quarter  Referred a Friend  ...  \
count          8.330000e+03   8330.0  8330.0   8330.0       8.330000e+03  ...   
mean           1.108890e-17      0.0     0.0      0.0      -9.553516e-17  ...

In [13]:
# Load data/telco.csv and list the dtype pandas inferred for each column.
df_original = pd.read_csv("data/telco.csv")
original_dtypes = df_original.dtypes
print("\nOriginal Dataset Data Types:", original_dtypes, sep="\n")



Original Dataset Data Types:
Customer ID                           object
Gender                                object
Age                                    int64
Under 30                              object
Senior Citizen                        object
Married                               object
Dependents                            object
Number of Dependents                   int64
Country                               object
State                                 object
City                                  object
Zip Code                               int64
Latitude                             float64
Longitude                            float64
Population                             int64
Quarter                               object
Referred a Friend                     object
Number of Referrals                    int64
Tenure in Months                       int64
Offer                                 object
Phone Service                         object
Avg Monthly Long Distance