In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [2]:
# Load the raw customer-churn dataset from CSV.
# The path is relative, so the file must sit in the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='online_retail_customer_churn.csv')
# Preview the first five rows
df.head(n=5)

Unnamed: 0,Customer_ID,Age,Gender,Annual_Income,Total_Spend,Years_as_Customer,Num_of_Purchases,Average_Transaction_Amount,Num_of_Returns,Num_of_Support_Contacts,Satisfaction_Score,Last_Purchase_Days_Ago,Email_Opt_In,Promotion_Response,Target_Churn
0,1,62,Other,45.15,5892.58,5,22,453.8,2,0,3,129,True,Responded,True
1,2,65,Male,79.51,9025.47,13,77,22.9,2,2,3,227,False,Responded,False
2,3,18,Male,29.19,618.83,13,71,50.53,5,2,2,283,False,Responded,True
3,4,21,Other,79.63,9110.3,3,33,411.83,5,3,5,226,True,Ignored,True
4,5,21,Other,77.66,5390.88,15,43,101.19,3,0,5,242,False,Unsubscribed,False


In [3]:
#Getting more info from the dataset
# df.info() prints the row count plus each column's non-null count and dtype,
# which shows which columns are numeric, boolean or text (object).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Customer_ID                 1000 non-null   int64  
 1   Age                         1000 non-null   int64  
 2   Gender                      1000 non-null   object 
 3   Annual_Income               1000 non-null   float64
 4   Total_Spend                 1000 non-null   float64
 5   Years_as_Customer           1000 non-null   int64  
 6   Num_of_Purchases            1000 non-null   int64  
 7   Average_Transaction_Amount  1000 non-null   float64
 8   Num_of_Returns              1000 non-null   int64  
 9   Num_of_Support_Contacts     1000 non-null   int64  
 10  Satisfaction_Score          1000 non-null   int64  
 11  Last_Purchase_Days_Ago      1000 non-null   int64  
 12  Email_Opt_In                1000 non-null   bool   
 13  Promotion_Response          1000 n

In [4]:
# Check for missing values: count the nulls in every column.
# Every count is zero for this dataset, so nothing has to be imputed or dropped.
df.isna().sum()

Customer_ID                   0
Age                           0
Gender                        0
Annual_Income                 0
Total_Spend                   0
Years_as_Customer             0
Num_of_Purchases              0
Average_Transaction_Amount    0
Num_of_Returns                0
Num_of_Support_Contacts       0
Satisfaction_Score            0
Last_Purchase_Days_Ago        0
Email_Opt_In                  0
Promotion_Response            0
Target_Churn                  0
dtype: int64

In [5]:
# Check for duplicate rows
# df.duplicated() marks each row that repeats an earlier row; summing the marks counts them.
num_duplicates = df.duplicated().sum()
print(f"\nNumber of duplicate rows found: {num_duplicates}")

if num_duplicates > 0:
    # Rebind `df` to the de-duplicated frame instead of mutating with inplace=True:
    # the original object is left untouched and the step reads as an explicit assignment.
    # The surviving rows keep their original index labels (the index is not reset).
    df = df.drop_duplicates()
    print(f"Duplicate rows removed. New shape: {df.shape}")
else:
    print("No duplicate rows found.")


Number of duplicate rows found: 0
No duplicate rows found.


In [6]:
#Data type conversion - cast the boolean columns (Email_Opt_In & Target_Churn) to integers (0 or 1)
bool_cols = ['Email_Opt_In', 'Target_Churn']
for col in bool_cols:
    df[col] = df[col].astype(int)  # True -> 1, False -> 0
print("\nConverted 'Email_Opt_In' and 'Target_Churn' to integer type.")

# NOTE: the heading printed below says "Data types", but what follows is the first rows
# of the two converted columns (their values), not their dtypes.
print("\nData types after boolean conversion:")
print(df[bool_cols].head())


Converted 'Email_Opt_In' and 'Target_Churn' to integer type.

Data types after boolean conversion:
   Email_Opt_In  Target_Churn
0             1             1
1             0             0
2             0             1
3             1             1
4             0             0


In [7]:
# Categorical Encoding
from sklearn.preprocessing import OneHotEncoder

# Text columns that are replaced by one indicator column per category
categorical_cols = ['Gender', 'Promotion_Response']

# sparse_output=False     -> dense array, so it can be wrapped in a DataFrame directly
# handle_unknown='ignore' -> no error at transform time if a category unseen during fit shows up
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit the encoder on the categorical columns and encode them in a single step
encoded_features = encoder.fit_transform(df[categorical_cols])
encoded_feature_names = encoder.get_feature_names_out(categorical_cols)

# Wrap the encoded array in a DataFrame; reusing df.index keeps the rows aligned for the concat
encoded_df = pd.DataFrame(
    data=encoded_features,
    index=df.index,
    columns=encoded_feature_names,
)

# Swap the original text columns for their indicator columns.
# NOTE: this rebinds `df`, so the cell cannot be run a second time without reloading the
# data first (the categorical columns no longer exist in `df` afterwards).
df = pd.concat([df.drop(columns=categorical_cols), encoded_df], axis=1)

print("\nDataset after One-Hot Encoding:")
print(df.head())
print(f"New shape after encoding: {df.shape}")


Dataset after One-Hot Encoding:
   Customer_ID  Age  Annual_Income  Total_Spend  Years_as_Customer  \
0            1   62          45.15      5892.58                  5   
1            2   65          79.51      9025.47                 13   
2            3   18          29.19       618.83                 13   
3            4   21          79.63      9110.30                  3   
4            5   21          77.66      5390.88                 15   

   Num_of_Purchases  Average_Transaction_Amount  Num_of_Returns  \
0                22                      453.80               2   
1                77                       22.90               2   
2                71                       50.53               5   
3                33                      411.83               5   
4                43                      101.19               3   

   Num_of_Support_Contacts  Satisfaction_Score  Last_Purchase_Days_Ago  \
0                        0                   3                     12

In [16]:
# Feature Scaling (Normalization/Standardization)
from sklearn.preprocessing import StandardScaler, MinMaxScaler


def _scaled_copy(frame, columns, scaler):
    """Return a copy of `frame` with `columns` replaced by `scaler.fit_transform(frame[columns])`.

    `frame` itself is not modified. `scaler` is fitted on `frame[columns]` as a side effect,
    so the caller can keep it for later use.
    """
    scaled = frame.copy()
    scaled[columns] = scaler.fit_transform(frame[columns])
    return scaled


# Columns to rescale. 'Customer_ID' is left out on the assumption that it is only an identifier.
numerical_cols_for_scaling = [
    'Age',
    'Annual_Income',
    'Total_Spend',
    'Years_as_Customer',
    'Num_of_Purchases',
    'Average_Transaction_Amount',
    'Num_of_Returns',
    'Num_of_Support_Contacts',
    'Satisfaction_Score',
    'Last_Purchase_Days_Ago',
]

# --- Option 1: Standardization (StandardScaler) ---
scaler_standard = StandardScaler()
df_standardized = _scaled_copy(df, numerical_cols_for_scaling, scaler_standard)

print("\nDataset after Standardization (first 5 rows of scaled numerical columns):")
print(df_standardized[numerical_cols_for_scaling].head())

# --- Option 2: Normalization (MinMaxScaler) ---
scaler_minmax = MinMaxScaler()
df_normalized = _scaled_copy(df, numerical_cols_for_scaling, scaler_minmax)

print("\nDataset after Normalization (first 5 rows of scaled numerical columns):")
print(df_normalized[numerical_cols_for_scaling].head())

# NOTE: `df` itself is left unscaled; only the two copies above hold scaled values.
# Any later cell that should work with scaled features must use df_standardized or
# df_normalized explicitly instead of `df`.



Dataset after Standardization (first 5 rows of scaled numerical columns):
        Age  Annual_Income  Total_Spend  Years_as_Customer  Num_of_Purchases  \
0  1.229628      -1.264973     0.283773          -0.854240         -0.962378   
1  1.426547      -0.614434     1.378924           0.591480          0.965463   
2 -1.658518      -1.567145    -1.559749           0.591480          0.755153   
3 -1.461599      -0.612162     1.408578          -1.215669         -0.576810   
4 -1.461599      -0.649460     0.108396           0.952910         -0.226294   

   Average_Transaction_Amount  Num_of_Returns  Num_of_Support_Contacts  \
0                    1.282050       -0.902114                -1.379444   
1                   -1.673359       -0.902114                 0.047075   
2                   -1.483853        0.134005                 0.047075   
3                    0.994191        0.134005                 0.760335   
4                   -1.136392       -0.556741                -1.379444   


In [18]:
#Reviewing the cleaned data set
# Note: `df` holds the encoded but unscaled values; the scaled versions live in the separate
# copies df_standardized and df_normalized created in the feature-scaling cell.
df.head()

Unnamed: 0,Customer_ID,Age,Annual_Income,Total_Spend,Years_as_Customer,Num_of_Purchases,Average_Transaction_Amount,Num_of_Returns,Num_of_Support_Contacts,Satisfaction_Score,Last_Purchase_Days_Ago,Email_Opt_In,Target_Churn,Gender_Female,Gender_Male,Gender_Other,Promotion_Response_Ignored,Promotion_Response_Responded,Promotion_Response_Unsubscribed
0,1,62,45.15,5892.58,5,22,453.8,2,0,3,129,1,1,0.0,0.0,1.0,0.0,1.0,0.0
1,2,65,79.51,9025.47,13,77,22.9,2,2,3,227,0,0,0.0,1.0,0.0,0.0,1.0,0.0
2,3,18,29.19,618.83,13,71,50.53,5,2,2,283,0,1,0.0,1.0,0.0,0.0,1.0,0.0
3,4,21,79.63,9110.3,3,33,411.83,5,3,5,226,1,1,0.0,0.0,1.0,1.0,0.0,0.0
4,5,21,77.66,5390.88,15,43,101.19,3,0,5,242,0,0,0.0,0.0,1.0,0.0,0.0,1.0


In [20]:
#Reviewing the additional columns that were added
# Re-run df.info() to confirm the one-hot indicator columns are present and to see the
# resulting column count and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 19 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Customer_ID                      1000 non-null   int64  
 1   Age                              1000 non-null   int64  
 2   Annual_Income                    1000 non-null   float64
 3   Total_Spend                      1000 non-null   float64
 4   Years_as_Customer                1000 non-null   int64  
 5   Num_of_Purchases                 1000 non-null   int64  
 6   Average_Transaction_Amount       1000 non-null   float64
 7   Num_of_Returns                   1000 non-null   int64  
 8   Num_of_Support_Contacts          1000 non-null   int64  
 9   Satisfaction_Score               1000 non-null   int64  
 10  Last_Purchase_Days_Ago           1000 non-null   int64  
 11  Email_Opt_In                     1000 non-null   int64  
 12  Target_Churn         

In [22]:
# Write the cleaned frame (`df`, i.e. encoded but not scaled) to a CSV file
# in the notebook's working directory.
output_file_path = 'online_retail_customer_churn_cleaned.csv'
df.to_csv(path_or_buf=output_file_path, index=False)  # index=False: do not write the row index as a column

print(f"\nCleaned data saved successfully to '{output_file_path}'")


Cleaned data saved successfully to 'online_retail_customer_churn_cleaned.csv'
