In [None]:
# Import libraries for data manipulation
import pandas as pd



# Removes the limit for the number of displayed columns
pd.set_option("display.max_columns", None)

# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 200)

# To encode categorical variables
from sklearn.preprocessing import LabelEncoder

# To scale numerical varaibles
from sklearn.preprocessing import StandardScaler

In [3]:
# Load Data
data = pd.read_csv('/workspaces/Predicting-Potential-Customers/ExtraaLearn.csv')

# Overview
data

Unnamed: 0,ID,age,current_occupation,first_interaction,profile_completed,website_visits,time_spent_on_website,page_views_per_visit,last_activity,print_media_type1,print_media_type2,digital_media,educational_channels,referral,status
0,EXT001,57,Unemployed,Website,High,7,1639,1.861,Website Activity,Yes,No,Yes,No,No,1
1,EXT002,56,Professional,Mobile App,Medium,2,83,0.320,Website Activity,No,No,No,Yes,No,0
2,EXT003,52,Professional,Website,Medium,3,330,0.074,Website Activity,No,No,Yes,No,No,0
3,EXT004,53,Unemployed,Website,High,4,464,2.057,Website Activity,No,No,No,No,No,1
4,EXT005,23,Student,Website,High,4,600,16.914,Email Activity,No,No,No,No,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4607,EXT4608,35,Unemployed,Mobile App,Medium,15,360,2.170,Phone Activity,No,No,No,Yes,No,0
4608,EXT4609,55,Professional,Mobile App,Medium,8,2327,5.393,Email Activity,No,No,No,No,No,0
4609,EXT4610,58,Professional,Website,High,2,212,2.692,Email Activity,No,No,No,No,No,1
4610,EXT4611,57,Professional,Mobile App,Medium,1,154,3.879,Website Activity,Yes,No,No,No,No,0


#### **Dropping Insignificant Variables**

In [5]:
print("Original columns:", data.columns)

Original columns: Index(['ID', 'age', 'current_occupation', 'first_interaction',
       'profile_completed', 'website_visits', 'time_spent_on_website',
       'page_views_per_visit', 'last_activity', 'print_media_type1',
       'print_media_type2', 'digital_media', 'educational_channels',
       'referral', 'status'],
      dtype='object')


In [6]:
# Drop 'ID' column, as it's typically not useful for modeling
if 'ID' in data.columns:
    data = data.drop('ID', axis=1)
    print("Dropped 'ID' column as it's not relevant for modeling.")

Dropped 'ID' column as it's not relevant for modeling.


In [7]:
# Based on the correlation matrix, we might consider dropping 'page_views_per_visit'
# as it had very low correlation with the target variable
data = data.drop('page_views_per_visit', axis=1)
print("Dropped 'page_views_per_visit' due to very low correlation with the target.")

Dropped 'page_views_per_visit' due to very low correlation with the target.


In [8]:
print("Remaining columns:", data.columns)

Remaining columns: Index(['age', 'current_occupation', 'first_interaction', 'profile_completed',
       'website_visits', 'time_spent_on_website', 'last_activity',
       'print_media_type1', 'print_media_type2', 'digital_media',
       'educational_channels', 'referral', 'status'],
      dtype='object')


In [9]:
# Convert categorical variables to dummy variables
data = pd.get_dummies(data, columns=['current_occupation', 'first_interaction', 'last_activity'], drop_first=True)
print("Converted categorical variables to dummy variables.")

Converted categorical variables to dummy variables.


In [10]:
# Create age groups
data['age_group'] = pd.cut(data['age'], bins=[0, 25, 35, 45, 55, 100], labels=['18-25', '26-35', '36-45', '46-55', '55+'])
data = pd.get_dummies(data, columns=['age_group'], drop_first=True)
print("Created age group dummy variables.")

Created age group dummy variables.


In [11]:
# For 'profile_completed', we'll create two binary columns for 'High' and 'Medium'
data['profile_completed_high'] = (data['profile_completed'] == 'High').astype(int)
data['profile_completed_medium'] = (data['profile_completed'] == 'Medium').astype(int)
data = data.drop('profile_completed', axis=1)
print("Created binary features for profile completion levels.")

Created binary features for profile completion levels.


In [14]:
binary_cols = ['print_media_type1', 'print_media_type2', 'digital_media', 'educational_channels', 'referral']

In [15]:
for var in binary_cols:
    data[var] = data[var].map({'Yes': 1, 'No': 0})
    print(f"Converted {var} to binary (1/0)")

Converted print_media_type1 to binary (1/0)
Converted print_media_type2 to binary (1/0)
Converted digital_media to binary (1/0)
Converted educational_channels to binary (1/0)
Converted referral to binary (1/0)


In [16]:
# See changes
data.head()

Unnamed: 0,age,website_visits,time_spent_on_website,print_media_type1,print_media_type2,digital_media,educational_channels,referral,status,current_occupation_Student,current_occupation_Unemployed,first_interaction_Website,last_activity_Phone Activity,last_activity_Website Activity,age_group_26-35,age_group_36-45,age_group_46-55,age_group_55+,profile_completed_high,profile_completed_medium
0,57,7,1639,1,0,1,0,0,1,False,True,True,False,True,False,False,False,True,1,0
1,56,2,83,0,0,0,1,0,0,False,False,False,False,True,False,False,False,True,0,1
2,52,3,330,0,0,1,0,0,0,False,False,True,False,True,False,False,True,False,0,1
3,53,4,464,0,0,0,0,0,1,False,True,True,False,True,False,False,True,False,1,0
4,23,4,600,0,0,0,0,0,0,True,False,True,False,False,False,False,False,False,1,0


In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4612 entries, 0 to 4611
Data columns (total 20 columns):
 #   Column                          Non-Null Count  Dtype
---  ------                          --------------  -----
 0   age                             4612 non-null   int64
 1   website_visits                  4612 non-null   int64
 2   time_spent_on_website           4612 non-null   int64
 3   print_media_type1               4612 non-null   int64
 4   print_media_type2               4612 non-null   int64
 5   digital_media                   4612 non-null   int64
 6   educational_channels            4612 non-null   int64
 7   referral                        4612 non-null   int64
 8   status                          4612 non-null   int64
 9   current_occupation_Student      4612 non-null   bool 
 10  current_occupation_Unemployed   4612 non-null   bool 
 11  first_interaction_Website       4612 non-null   bool 
 12  last_activity_Phone Activity    4612 non-null   bool 
 13  las

In [18]:
data.to_csv('preprocessed_leads.csv', index=False)