In [1370]:
import warnings
# Suppresses every warning category for the rest of the session, including
# pandas/seaborn deprecation warnings that later cells would otherwise show.
warnings.filterwarnings('ignore')

<h1>Step 1: Data Collection</h1>

* Load the Dataset

In [1371]:
import pandas as pd
import numpy as np

# Read the leads.csv file (path is relative to the notebook's working directory)
df = pd.read_csv('Files/leads.csv')


* Explore the Dataset

In [1372]:
# Display the first few rows of the dataset
print(df.head())

# Check the number of rows and columns in the dataset
print("Shape of the dataset:", df.shape)

# Review the column names
print("Column names:", df.columns)

# Get summary information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
df.info()

# Calculate descriptive statistics for numeric variables
print(df.describe())


                            Prospect ID  Lead Number              Lead Origin  \
0  7927b2df-8bba-4d29-b9a2-b6e0beafe620       660737                      API   
1  2a272436-5132-4136-86fa-dcc88c88f482       660728                      API   
2  8cc8c611-a219-4f35-ad23-fdfd2656bd8a       660727  Landing Page Submission   
3  0cc2df48-7cf4-4e39-9de9-19797f9b38cc       660719  Landing Page Submission   
4  3256f628-e534-4826-9d63-4a8b88782852       660681  Landing Page Submission   

      Lead Source Do Not Email Do Not Call  Converted  TotalVisits  \
0      Olark Chat           No          No          0          0.0   
1  Organic Search           No          No          0          5.0   
2  Direct Traffic           No          No          1          2.0   
3  Direct Traffic           No          No          0          1.0   
4          Google           No          No          1          2.0   

   Total Time Spent on Website  Page Views Per Visit  ...  \
0                            0 

* Identify the Target Variable

In [1373]:
# "Converted" is the target variable
# (an int64 column holding 0/1, per the head() and dtypes output of this notebook)
target_variable = 'Converted'

* Review Data Types

In [1374]:
# Check the data types of each variable
# (object dtype marks the text columns; int64/float64 are the numeric ones)
print(df.dtypes)

Prospect ID                                       object
Lead Number                                        int64
Lead Origin                                       object
Lead Source                                       object
Do Not Email                                      object
Do Not Call                                       object
Converted                                          int64
TotalVisits                                      float64
Total Time Spent on Website                        int64
Page Views Per Visit                             float64
Last Activity                                     object
Country                                           object
Specialization                                    object
How did you hear about X Education                object
What is your current occupation                   object
What matters most to you in choosing a course     object
Search                                            object
Magazine                       

* Assess Data Quality

In [1375]:
# Check for missing values (per-column count of NaN entries)
print(df.isnull().sum())

# Check for duplicates (rows identical to an earlier row across all columns)
print("Number of duplicates:", df.duplicated().sum())


Prospect ID                                         0
Lead Number                                         0
Lead Origin                                         0
Lead Source                                        36
Do Not Email                                        0
Do Not Call                                         0
Converted                                           0
TotalVisits                                       137
Total Time Spent on Website                         0
Page Views Per Visit                              137
Last Activity                                     103
Country                                          2461
Specialization                                   1438
How did you hear about X Education               2207
What is your current occupation                  2690
What matters most to you in choosing a course    2709
Search                                              0
Magazine                                            0
Newspaper Article           

<h1>Step 2 : Data Preprocessing</h1>


* Data Cleaning

identify columns with missing values

In [1376]:
# Per-column NaN counts: the starting point for the cleaning decisions below
print(df.isnull().sum())

Prospect ID                                         0
Lead Number                                         0
Lead Origin                                         0
Lead Source                                        36
Do Not Email                                        0
Do Not Call                                         0
Converted                                           0
TotalVisits                                       137
Total Time Spent on Website                         0
Page Views Per Visit                              137
Last Activity                                     103
Country                                          2461
Specialization                                   1438
How did you hear about X Education               2207
What is your current occupation                  2690
What matters most to you in choosing a course    2709
Search                                              0
Magazine                                            0
Newspaper Article           

Based on the missing value counts, we have several columns with missing values. To decide how to handle these missing values, it's important to understand the nature of the data and the context of each column.
Some common strategies for handling missing values include:
* Dropping columns with a high percentage of missing values.
* Dropping rows with missing values, especially if the number of missing values is relatively small compared to the total dataset size.
* Imputing missing values using methods such as mean, median, mode, or regression.

To drop columns with a high percentage of missing values, we can set a threshold and drop every column whose share of missing values exceeds it. The code below drops columns with more than 30% missing values.

In [1377]:
threshold = 0.3  # columns with more than 30% missing values are removed

# Fraction of missing entries per column (mean of the boolean NaN mask)
missing_percentage = df.isna().mean()

# Labels of the columns whose missing fraction exceeds the threshold
columns_to_drop = missing_percentage.loc[missing_percentage > threshold].index

# New frame without those columns; df itself is left untouched
df_dropped_columns = df.drop(columns=columns_to_drop)

# Preview the reduced frame
print(df_dropped_columns.head())


In [None]:
# Percentage of null values per column (two decimals) after dropping the
# columns that had more than 30% null values

(df_dropped_columns.isnull().mean() * 100).round(2)

Prospect ID                                       0.00
Lead Number                                       0.00
Lead Origin                                       0.00
Lead Source                                       0.39
Do Not Email                                      0.00
Do Not Call                                       0.00
Converted                                         0.00
TotalVisits                                       1.48
Total Time Spent on Website                       0.00
Page Views Per Visit                              1.48
Last Activity                                     1.11
Country                                          26.63
Specialization                                   15.56
How did you hear about X Education               23.89
What is your current occupation                  29.11
What matters most to you in choosing a course    29.32
Search                                            0.00
Magazine                                          0.00
Newspaper 

The columns below still have a high share of null values; let's check and handle them individually:
* Country   
* Specialization        
* How did you hear about X Education           
* What is your current occupation       
* What matters most to you in choosing a course    
* Lead Profile   
* City  

* Country: To impute missing values in the 'Country' column, we can replace them with the mode (most frequent value) since it's a categorical variable.

In [None]:
# Fill missing 'Country' values with the most frequent value; mode() returns a
# Series, [0] takes its first entry.
df_dropped_columns['Country'] = df_dropped_columns['Country'].fillna(df_dropped_columns['Country'].mode()[0])


* Specialization:
For the 'Specialization' column, we can replace missing values with the string "Not Specified" to indicate that the information was not provided.


In [None]:
# Missing specializations become their own explicit category
df_dropped_columns['Specialization'] = df_dropped_columns['Specialization'].fillna('Not Specified')

* How did you hear about X Education:
Similarly, for the 'How did you hear about X Education' column, we can replace missing values with the string "Not Specified".

In [None]:
# Missing answers become their own explicit category
df_dropped_columns['How did you hear about X Education'] = df_dropped_columns['How did you hear about X Education'].fillna('Not Specified')

* What is your current occupation:
For the 'What is your current occupation' column, we can replace missing values with the mode (most frequent value) since it's a categorical variable.

In [None]:
# Impute with the most frequent occupation, computed from the data (as is done
# for 'Country' and 'City') instead of hard-coding the expected label.
occupation_mode = df_dropped_columns['What is your current occupation'].mode()[0]
df_dropped_columns['What is your current occupation'] = df_dropped_columns['What is your current occupation'].fillna(occupation_mode)

* What matters most to you in choosing a course:
Since the 'What matters most to you in choosing a course' column has a high percentage of missing values, it might be better to drop this column.

In [None]:
# Removes the column in place; re-running this cell on the same frame raises
# KeyError because the column is already gone.
df_dropped_columns.drop('What matters most to you in choosing a course', axis=1, inplace=True)

* Lead Profile:
For the 'Lead Profile' column, we can replace missing values with the string "Not Specified".

In [None]:
# Missing lead profiles become their own explicit category
df_dropped_columns['Lead Profile'] = df_dropped_columns['Lead Profile'].fillna('Not Specified')

* City: For the 'City' column, we can replace missing values with the mode (most frequent value) since it's a categorical variable.

In [None]:
# Fill missing 'City' values with the most frequent value; mode() returns a
# Series, [0] takes its first entry.
df_dropped_columns['City'] = df_dropped_columns['City'].fillna(df_dropped_columns['City'].mode()[0])


In [None]:
# Drop 'Prospect ID': it is a unique identifier and carries no predictive signal.
# The column is named through the `columns=` keyword because pandas 2.0 removed
# the positional `axis` argument of DataFrame.drop (`drop([...], 1, ...)` fails there).
df_dropped_columns.drop(columns=['Prospect ID'], inplace=True)

In [None]:
# Percentage of null values per column after imputation, rounded to two decimals
(df_dropped_columns.isnull().mean() * 100).round(2)

Lead Number                                 0.00
Lead Origin                                 0.00
Lead Source                                 0.39
Do Not Email                                0.00
Do Not Call                                 0.00
Converted                                   0.00
TotalVisits                                 1.48
Total Time Spent on Website                 0.00
Page Views Per Visit                        1.48
Last Activity                               1.11
Country                                     0.00
Specialization                              0.00
How did you hear about X Education          0.00
What is your current occupation             0.00
Search                                      0.00
Magazine                                    0.00
Newspaper Article                           0.00
X Education Forums                          0.00
Newspaper                                   0.00
Digital Advertisement                       0.00
Through Recommendati

In [None]:
# The remaining missing values are below 2% per column (see the percentages
# above), so the affected rows are dropped in place.
df_dropped_columns.dropna(inplace = True)

In [None]:
# Percentage of null values per column after dropna(); every column should be 0
(df_dropped_columns.isnull().mean() * 100).round(2)

Lead Number                                 0.0
Lead Origin                                 0.0
Lead Source                                 0.0
Do Not Email                                0.0
Do Not Call                                 0.0
Converted                                   0.0
TotalVisits                                 0.0
Total Time Spent on Website                 0.0
Page Views Per Visit                        0.0
Last Activity                               0.0
Country                                     0.0
Specialization                              0.0
How did you hear about X Education          0.0
What is your current occupation             0.0
Search                                      0.0
Magazine                                    0.0
Newspaper Article                           0.0
X Education Forums                          0.0
Newspaper                                   0.0
Digital Advertisement                       0.0
Through Recommendations                 

In [None]:
# NOTE(review): this is the shape of the original `df`; the cleaned frame is
# `df_dropped_columns` (it is only assigned to `df` in the next cell) — confirm
# which one was meant to be inspected here.
df.shape

(9240, 37)

In [None]:
# From here on `df` refers to the cleaned frame; the raw frame read from the CSV
# is no longer reachable through this name.
df = df_dropped_columns

In [None]:
# Treat the placeholder value 'Select' as missing.
# NOTE(review): this runs after the imputation and dropna() cells above, so the
# NaNs created here are not covered by that cleanup — confirm the ordering.
df = df.replace('Select',np.nan)


In [None]:
def count_unique_values(dataframe):
    """Return a two-column frame listing each column and its distinct-value count.

    Counts come from ``DataFrame.nunique()``, so missing values are not counted.
    The result has the columns 'Column' and 'Unique Values', one row per input column.
    """
    per_column = dataframe.nunique()
    return pd.DataFrame({
        'Column': per_column.index,
        'Unique Values': per_column.values,
    })

# Check the number of unique values in each column
unique_values_df = count_unique_values(df)
print(unique_values_df)


                                    Column  Unique Values
0                              Lead Number           9074
1                              Lead Origin              4
2                              Lead Source             21
3                             Do Not Email              2
4                              Do Not Call              2
5                                Converted              2
6                              TotalVisits             41
7              Total Time Spent on Website           1717
8                     Page Views Per Visit            114
9                            Last Activity             17
10                                 Country             38
11                          Specialization             19
12      How did you hear about X Education             10
13         What is your current occupation              6
14                                  Search              2
15                       Newspaper Article              2
16            

In [None]:
# count_unique_values() is already defined in an earlier cell and is not
# redefined here; this helper reads the counts directly from nunique().
def remove_columns_with_single_unique_value(dataframe):
    """Return a copy of ``dataframe`` without its constant columns.

    A column is removed when ``DataFrame.nunique()`` reports exactly one
    distinct value for it (missing values are not counted). The input frame
    is not modified.
    """
    unique_counts = dataframe.nunique()
    single_unique_columns = unique_counts[unique_counts == 1].index
    return dataframe.drop(columns=single_unique_columns)

# Frame without the columns that hold a single distinct value
updated_df = remove_columns_with_single_unique_value(df)



In [None]:
# Continue the analysis on the frame without constant columns
df= updated_df

In [None]:
def find_columns_with_mixed_datatypes(df):
    """Return the names of object-dtype columns that cannot be parsed as numbers.

    Each object column is passed to ``pd.to_numeric``; a column is reported
    when the conversion fails. Both ``ValueError`` (unparseable strings) and
    ``TypeError`` (non-scalar objects such as lists) count as a failure, so a
    single odd column cannot abort the scan. Columns of any other dtype are
    skipped. The names are returned in column order.
    """
    mixed_columns = []
    for col in df.columns:
        if df[col].dtype == 'object':
            try:
                pd.to_numeric(df[col])
            except (ValueError, TypeError):
                mixed_columns.append(col)
    return mixed_columns

# List the object columns of the cleaned frame that are not numeric-parseable
mixed_columns = find_columns_with_mixed_datatypes(df)
print("Columns with mixed data types:")
print(mixed_columns)


Columns with mixed data types:
['Lead Origin', 'Lead Source', 'Do Not Email', 'Do Not Call', 'Last Activity', 'Country', 'Specialization', 'How did you hear about X Education', 'What is your current occupation', 'Search', 'Newspaper Article', 'X Education Forums', 'Newspaper', 'Digital Advertisement', 'Through Recommendations', 'Lead Profile', 'City', 'A free copy of Mastering The Interview', 'Last Notable Activity']


In [None]:
# Column dtypes after cleaning (bare expression, rendered as the cell output)
df.dtypes

Lead Number                                 int64
Lead Origin                                object
Lead Source                                object
Do Not Email                               object
Do Not Call                                object
Converted                                   int64
TotalVisits                               float64
Total Time Spent on Website                 int64
Page Views Per Visit                      float64
Last Activity                              object
Country                                    object
Specialization                             object
How did you hear about X Education         object
What is your current occupation            object
Search                                     object
Newspaper Article                          object
X Education Forums                         object
Newspaper                                  object
Digital Advertisement                      object
Through Recommendations                    object


<h1>Step 3 : Exploratory Data Analysis</h1>


* Univariate Analysis:

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_style("whitegrid")  # white background with grid lines

fig, ax = plt.subplots(figsize=(4, 3))

# Horizontal box plot of the raw TotalVisits distribution
sns.boxplot(x=df['TotalVisits'], color='skyblue', ax=ax)

ax.set_xlabel('Total Visits', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.set_title('Box Plot of Total Visits', fontsize=14)

plt.show()


To remove outliers from the 'TotalVisits' column and plot the box plot again, we can use the concept of interquartile range (IQR) and set a threshold to identify and remove the outliers. 

In [None]:

# Calculate the IQR
Q1 = df['TotalVisits'].quantile(0.25)
Q3 = df['TotalVisits'].quantile(0.75)
IQR = Q3 - Q1

# Set the threshold to identify outliers (Tukey's 1.5 * IQR fences)
threshold = 1.5 * IQR

# Filter out the outliers.
# .copy() makes df_no_outliers an independent frame rather than a slice of df:
# later cells add columns to it, which on a slice triggers
# SettingWithCopyWarning and may not write to the intended object.
df_no_outliers = df[(df['TotalVisits'] >= Q1 - threshold) & (df['TotalVisits'] <= Q3 + threshold)].copy()

# Plot the box plot without outliers
sns.set_style("whitegrid")
plt.figure(figsize=(4, 3))
sns.boxplot(x=df_no_outliers['TotalVisits'], color='skyblue')
plt.xlabel('Total Visits', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.title('Box Plot of Total Visits (without outliers)', fontsize=14)

plt.show()


The box ranges from approximately 1 to 4, indicating that the majority of the Total Visits values fall within this range.
The median, which is located at around 3, suggests that 50% of the Total Visits values are below this point and 50% are above.
The absence of outliers in the plot after removing them suggests that extreme or unusual values in Total Visits have been filtered out.

In [None]:
# All following cells work on the frame with TotalVisits outliers removed
df = df_no_outliers

In [None]:
# Horizontal box plot of the time-on-site distribution
fig, ax = plt.subplots(figsize=(5, 3))
sns.boxplot(x=df['Total Time Spent on Website'], ax=ax)
ax.set_xlabel('Total Time Spent on Website')
ax.set_title('Box Plot of Total Time Spent on Website')
plt.show()

The box plot suggests that the majority of the values for 'Total Time Spent on Website' range from approximately 0 to 800, with a median of around 250. This gives a first picture of the distribution and central tendency of the variable.

Calculate measures such as mean, median, and standard deviation.

In [None]:
# Summary statistics of page views per visit
page_views = df['Page Views Per Visit']
print('Mean:', page_views.mean())
print('Median:', page_views.median())
print('Standard Deviation:', page_views.std())


* On average, visitors tend to view approximately 2.28 pages per visit.
* The median of 2.0 indicates that half of the visits have 2 or fewer page views, while the other half have more than 2 page views.
* The standard deviation of 1.96 suggests that there is a moderate amount of variability in the number of page views per visit. Some visits may have significantly fewer or more page views than the average.

In [None]:
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(5, 3))

# Most frequent lead source first
order = df['Lead Source'].value_counts().index

sns.countplot(x='Lead Source', data=df, order=order, palette='terrain', ax=ax)
ax.set_xlabel('Lead Source', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.set_title('Bar Plot of Lead Source', fontsize=14)
plt.xticks(rotation=90)
plt.show()


We can observe most of the traffic coming from Google or direct

In [None]:
colors = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99']

# Absolute counts drive the wedge sizes; percentage shares go into the legend
profile_counts = df['Lead Profile'].value_counts()
profile_share = df['Lead Profile'].value_counts(normalize=True) * 100

plt.figure(figsize=(6, 6))

# Pie chart without percentage labels on the wedges
patches, _ = plt.pie(profile_counts, colors=colors)
plt.axis('equal')

# Legend entries carry the category name and its percentage
labels = [
    f'{label} ({percentage:.1f}%)'
    for label, percentage in zip(profile_counts.index, profile_share)
]
plt.legend(patches, labels, title='Lead Profile', loc='center')

plt.title('Pie Chart of Lead Profile')
plt.show()


The majority of leads have a profile labeled as "Select," indicating incomplete information. "Potential Lead" is the second most common profile, suggesting further nurturing is required. Other profiles, such as "Student of SomeSchool," "Lateral Student," and "Dual Specialization Student," represent smaller segments that may require tailored marketing approaches.

In [None]:
# Absolute and relative (percent) frequency of each city
city_column = df['City']
print(city_column.value_counts())
print(city_column.value_counts(normalize=True) * 100)


The majority of leads are from Mumbai, followed by a significant number of leads with the "Select" city label, indicating incomplete information. Thane & Outskirts, Other Cities, and Other Cities of Maharashtra also contribute to the lead count. Other Metro Cities and Tier II Cities have relatively smaller representation. It suggests a potential focus on targeting leads from Mumbai and further improving the quality and completeness of city information in the dataset.

* Bi variate Analysis:

In [None]:
fig, ax = plt.subplots(figsize=(6, 3))
sns.boxplot(y='TotalVisits', x='Converted', data=df_no_outliers, palette='Set3', ax=ax)
ax.set_title('Box Plot of Total Visits by Conversion')
ax.set_xlabel('Converted')
ax.set_ylabel('Total Visits')
plt.xticks([0, 1], ['Not Converted', 'Converted'])
ax.set_ylim(bottom=0)  # y-axis starts at 0
ax.grid(axis='y', linestyle='--', alpha=0.5)  # dashed horizontal grid lines
plt.tight_layout()  # fit labels inside the figure
plt.show()


The median of the 'Converted' group is slightly higher than that of the 'Not Converted' group, which suggests that 'TotalVisits' tends to be slightly higher for converted leads than for non-converted leads.

The vertical extent of the box indicates the range of values where the majority of the data points lie. For the 'Converted' group, the box extends from approximately 0 to 4.5, while for the 'Not Converted' group, the box extends from approximately 1 to 4. This means that the range of 'TotalVisits' values for the converted leads is slightly wider than the range for the non-converted leads.

In [None]:
# Conversion rate by time spent on the website.
# The raw column has well over a thousand distinct values, so grouping on it
# directly yields one noisy bar per value. The time is therefore cut into 10
# equal-width bins and the mean of the 0/1 target is taken per bin.
time_col = 'Total Time Spent on Website'
time_bins = pd.cut(df[time_col], bins=10)
conversion_rate_by_time = (
    df.groupby(time_bins, observed=True)['Converted']
    .mean()
    .reset_index()
)

# groupby returns the bins in ascending order; the Interval labels are turned
# into strings so they can serve as categorical tick labels.
conversion_rate_by_time[time_col] = conversion_rate_by_time[time_col].astype(str)

plt.figure(figsize=(7, 3))
# x carries the time bins and y the conversion rate, matching the axis labels
# (the two were previously swapped).
sns.barplot(x=time_col, y='Converted', data=conversion_rate_by_time, palette='viridis')
plt.xlabel('Total Time Spent on Website')
plt.ylabel('Conversion Rate')
plt.title('Conversion Rate by Total Time Spent on Website')
plt.xticks(rotation=90)
plt.show()


We can observe that the conversion rate increases significantly for users who spend more time on the website.

In [None]:
# Mean of the 0/1 target per last activity = conversion rate, highest first
conversion_rate_last_activity = (
    df.groupby('Last Activity')['Converted']
    .mean()
    .reset_index()
    .sort_values('Converted', ascending=False)
)

# Bar chart of the conversion rate per last activity
fig, ax = plt.subplots(figsize=(7, 3))
sns.barplot(x='Last Activity', y='Converted', data=conversion_rate_last_activity, palette='viridis_r', ax=ax)
ax.set_xlabel('Last Activity', fontsize=12)
ax.set_ylabel('Conversion Rate', fontsize=12)
ax.set_title('Conversion Rate by Last Activity', fontsize=14)
plt.xticks(rotation=90)
ax.set_ylim(0, 1)
plt.show()


The higher conversion rate for activities like "Approached upfront," "Email Marked Spam," "Resubscribed to emails," and "Emails Received" shows that these activities are associated with a higher likelihood of conversion: leads whose last activity falls in one of these categories are more likely to be converted into customers. This is an association, not evidence that the activity itself causes the conversion.

In [None]:
# Mean of the 0/1 target per specialization = conversion rate, highest first
conversion_rate_by_specialization = (
    df.groupby('Specialization')['Converted']
    .mean()
    .sort_values(ascending=False)
)

# Bar chart of the conversion rate per specialization
fig, ax = plt.subplots(figsize=(7, 3))
sns.barplot(
    x=conversion_rate_by_specialization.index,
    y=conversion_rate_by_specialization.values,
    palette='viridis',
    ax=ax,
)
ax.set_xlabel('Specialization', fontsize=12)
ax.set_ylabel('Conversion Rate', fontsize=12)
ax.set_title('Conversion Rate by Specialization', fontsize=14)
plt.xticks(rotation=90)
plt.show()


We can see that most specializations have a conversion rate above 30%, while some reach as high as 50%.

In [None]:

# Mean of the 0/1 target per occupation = conversion rate, highest first
conversion_rate_by_occupation = (
    df.groupby('What is your current occupation')['Converted']
    .mean()
    .sort_values(ascending=False)
)

# Bar chart of the conversion rate per occupation
fig, ax = plt.subplots(figsize=(7, 3))
sns.barplot(
    x=conversion_rate_by_occupation.index,
    y=conversion_rate_by_occupation.values,
    palette='coolwarm',
    ax=ax,
)
ax.set_xlabel('Occupation')
ax.set_ylabel('Conversion Rate')
ax.set_title('Conversion Rate by Occupation')
plt.xticks(rotation=90)
plt.show()


Housewives and working professionals have very high conversion rates.

In [None]:
# One panel per marketing-channel flag, laid out on a 2 x 3 grid
variables = ['Search', 'Newspaper Article', 'X Education Forums', 'Newspaper', 'Digital Advertisement', 'Through Recommendations']
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(9, 4))
axes = axes.flatten()

for ax, variable in zip(axes, variables):
    # Count of leads per flag value, split by conversion outcome
    sns.countplot(x=variable, hue='Converted', data=df, ax=ax, palette='turbo', edgecolor='gray')
    ax.set_xlabel(variable)
    ax.set_ylabel('Count')
    ax.set_title(f'Conversion by {variable}')
    ax.set_xticklabels([])  # hide the x tick labels
    # Leave 10% headroom above the largest category count
    ax.set_ylim(0, df[variable].value_counts().max() * 1.1)
    ax.legend(title='Converted', loc='upper right')

# Adjust layout and show the plot
plt.tight_layout()
plt.show()


Conversion rates across all the portals are more or less similar.

<h1>Step 4 : Data Preparation</h1>


* New metrics creation

create a new metric called "Conversion Rate" based on the "Converted" and "TotalVisits" variables:

In [None]:
# Per-column NaN counts before the new metrics are created
df.isnull().sum()

In [None]:
# NOTE(review): same check as the previous cell — looks like a leftover duplicate.
df.isnull().sum()

In [None]:
# Calculate Conversion Rate
# TotalVisits == 0 is replaced by 1 so the division never divides by zero.
# NOTE(review): this feature is computed from the target 'Converted'; if it stays
# among the model inputs it leaks the label — confirm it is excluded before training.
df['Conversion Rate'] = df['Converted'] / df['TotalVisits'].replace(0, 1)
# Print the first few rows to verify the new metric

print(df[['Converted', 'TotalVisits', 'Conversion Rate']].head())

Page Views Per Visit (Normalized):

Normalizing the "Page Views Per Visit" metric brings its values onto a common scale.

In [None]:
# Divide by the column maximum so the largest value becomes 1.
# The maximum is taken over the whole frame, i.e. before the train/test split
# in Step 5, and the new column is a rescaled copy of the original one.
max_page_views = df['Page Views Per Visit'].max()
df['Page Views Per Visit (Normalized)'] = df['Page Views Per Visit'] / max_page_views

* Data Conversion

Converting binary variables: binary variables represented as strings ('Yes'/'No', 'True'/'False') can be converted to a numerical representation (1/0).

Let's find out which of the columns have 2 unique values i.e Binary variables

In [None]:
# Columns with exactly two distinct (non-null) values are treated as binary
binary_variables = [column for column in df.columns if df[column].nunique() == 2]

# Print the binary variables
print("Binary Variables:")
for var in binary_variables:
    print(var)


In [None]:
# Convert binary variables to 1/0.
# Series.map() turns every value that is missing from the mapping into NaN, so
# a column is only recoded when all of its non-null values are covered by the
# mapping; any other two-valued column is left unchanged instead of being
# silently wiped out.
binary_mapping = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
for column in binary_variables:
    observed_values = set(df[column].dropna().unique())
    if observed_values <= set(binary_mapping):
        df[column] = df[column].map(binary_mapping)

* Dummy variable creation

Before creating dummy columns, we consider the number of unique values in each column. If a column has a large number of unique values, creating dummy variables for all of them may result in a large number of columns and potential sparsity in the data.

In [None]:
# Identify categorical columns
# (this value is overwritten by the cardinality-based selection in the next cell)
categorical_cols = df.select_dtypes(include=['object']).columns

In [None]:
# Calculate cardinality for each column
cardinality = df.select_dtypes(include='object').nunique()

# Filter columns based on the specified range (e.g., 5 <= cardinality <= 20)
# NOTE(review): object columns outside this range are neither encoded nor
# dropped, so they remain in df_encoded as raw text — confirm this is intended.
min_threshold = 5
max_threshold = 20
categorical_cols = cardinality[(cardinality >= min_threshold) & (cardinality <= max_threshold)].index

# Generate dummy variables for categorical columns.
# dtype='int64' is requested explicitly: by default get_dummies returns
# bool (or uint8 on older pandas) columns, which the later
# select_dtypes(include=['int64', 'float64']) filter would silently exclude
# from scaling and feature selection.
dummy_df = pd.get_dummies(df[categorical_cols], drop_first=True, dtype='int64')

# Concatenate the dummy variables with the original dataframe
df_encoded = pd.concat([df, dummy_df], axis=1)

# Drop the original categorical columns
df_encoded.drop(categorical_cols, axis=1, inplace=True)

In [None]:
# Preview the encoded frame
df_encoded.head()

In [None]:
# Check the shape of the dataframe after dummy encoding (rows, columns)
df_encoded.shape

<h1>Step 5 : Train-Test Split</h1>


In [None]:
from sklearn.model_selection import train_test_split

# Separate the features (X) and target variable (y).
# 'Conversion Rate' is computed as Converted / TotalVisits, i.e. directly from
# the target, so keeping it in X would leak the label into the model. It is
# removed together with the target itself; errors='ignore' keeps the cell
# working when that metric was never created.
# NOTE(review): 'Lead Number' looks like a row identifier as well — consider
# excluding it from the features.
leaky_columns = ['Conversion Rate']
X = df_encoded.drop(columns=['Converted'] + leaky_columns, errors='ignore')
y = df_encoded['Converted']

# Perform the train-test split (80% train / 20% test, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In this code, X represents the features (independent variables) of the dataset after encoding categorical variables, and y represents the target variable (dependent variable) that we want to predict (e.g., 'Converted' in this case). The test_size parameter determines the proportion of the data to be allocated for the test set (e.g., 0.2 for 20% test data), and random_state ensures reproducibility of the split.

<h1>Step 6:  Feature Scaling</h1>


Feature scaling is an important step in machine learning to standardize the range of independent variables or features in your dataset. It helps in bringing all the features to a similar scale, which can improve the performance of our machine learning algorithms.

The StandardScaler is applied only to the numerical columns, so we first select those columns and then fit the scaler on them.

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Create an instance of the StandardScaler
scaler = StandardScaler()

# Identify numerical columns.
# 'number' covers every integer/float width and 'bool' covers dummy variables
# that pd.get_dummies stores as booleans, so encoded features are not dropped.
numerical_columns = X_train.select_dtypes(include=['number', 'bool']).columns

# Report the columns that still contain missing values (a bare expression in
# the middle of a cell is not displayed, so the result is printed explicitly).
null_counts = X_train[numerical_columns].isnull().sum()
print("Numerical columns with missing values:")
print(null_counts[null_counts > 0])

# Scale the numerical features in X_train.
# The scaler is fitted on the training data only; the result is wrapped in a
# DataFrame so that column names and the row index stay attached.
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train[numerical_columns].astype('float64')),
    columns=numerical_columns,
    index=X_train.index,
)

# Apply the statistics learned on the training set to the test set
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test[numerical_columns].astype('float64')),
    columns=numerical_columns,
    index=X_test.index,
)


<h1>Step 7:  Feature Selection Using RFE</h1>


In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Create an instance of Logistic Regression
logreg = LogisticRegression()

# Create an instance of RFE with the desired number of features, never asking
# for more features than the scaled training matrix actually has
n_features_to_select = min(20, X_train_scaled.shape[1])
rfe = RFE(estimator=logreg, n_features_to_select=n_features_to_select)

# Fit RFE on the training data
rfe.fit(X_train_scaled, y_train)

# Get the selected feature indices (positions within X_train_scaled)
selected_features_indices = rfe.get_support(indices=True)

# Get the selected feature names.
# X_train_scaled was built from X_train[numerical_columns], so the positions
# must be resolved against numerical_columns; indexing X_train.columns with
# them would return the names of unrelated columns.
selected_features = numerical_columns[selected_features_indices]

# Print the selected features
print("Selected Features:")
print(selected_features)


<h1>Step 8:  Model Building</h1>

* Create User defined function to create and print models, Summary & VIF


In [None]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

def build_model(X, y):
    """Fit a statsmodels logistic regression and print its summary and VIF table.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; a constant column is added before fitting.
    y : pandas.Series or array-like
        Binary target. When both ``X`` and ``y`` are pandas objects with
        different indexes, ``y`` is restricted to the rows present in ``X``
        so that rows dropped from ``X`` during cleaning do not break the fit.

    Returns
    -------
    The fitted statsmodels result object, so callers can reuse the model
    (the summary and the VIF table are still printed).
    """
    # Keep the target aligned with the feature rows that survived cleaning
    if isinstance(X, pd.DataFrame) and isinstance(y, pd.Series):
        if not y.index.equals(X.index):
            y = y.loc[X.index]

    # Add constant to the features
    X = sm.add_constant(X)

    # Fit the model
    model = sm.Logit(y, X)
    result = model.fit()

    # Print model summary
    print(result.summary())

    # Calculate VIF for every column of the design matrix (constant included)
    vif = pd.DataFrame()
    vif["Features"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    print("\nVariance Inflation Factor (VIF):")
    print(vif)

    return result



In [None]:
# Column dtypes of the working frame at this point (bare expression, rendered as output)
df.dtypes

In [None]:
# 'Magazine' does not appear in the dtypes listing printed after the
# constant-column removal in Step 2, so a plain df['Magazine'] lookup raises
# KeyError on a fresh run. DataFrame.get() falls back to an empty Series, which
# yields an empty array when the column is gone.
df.get('Magazine', pd.Series(dtype=object)).unique()

In [None]:


# find_columns_with_mixed_datatypes() is already defined in the Step 2 cells;
# it is reused here instead of being defined a second time.
mixed_columns = find_columns_with_mixed_datatypes(df)
print("Columns with mixed data types:")
print(mixed_columns)


In [None]:
columns_with_mixed_types = ['Lead Origin', 'Lead Source', 'Last Activity', 'Country', 'Specialization', 'How did you hear about X Education', 'What is your current occupation', 'Magazine', 'X Education Forums', 'Receive More Updates About Our Courses', 'Update me on Supply Chain Content', 'Get updates on DM Content', 'Lead Profile', 'City', 'I agree to pay the amount through cheque', 'Last Notable Activity']

# The list above is hard-coded, while earlier cells remove constant columns and
# recode Yes/No columns to 1/0. Only columns that still exist and are still of
# object dtype are converted, so the cell neither raises KeyError for a removed
# column nor turns an already-numeric column back into text.
columns_with_mixed_types = [
    col for col in columns_with_mixed_types
    if col in df.columns and df[col].dtype == 'object'
]

# Convert columns with mixed data types to string
if columns_with_mixed_types:
    df[columns_with_mixed_types] = df[columns_with_mixed_types].astype(str)

In [None]:
import numpy as np

# Force the selected features to numeric; unparseable entries become NaN
X_train_selected_numeric = X_train[selected_features].apply(pd.to_numeric, errors='coerce')

# Replace infinite values with NaN first, so that they are imputed like any
# other missing value and cannot distort the column means used for imputation
X_train_selected_numeric_finite = X_train_selected_numeric.replace([np.inf, -np.inf], np.nan)

# Replace missing values with column means
X_train_selected_numeric_filled = X_train_selected_numeric_finite.fillna(X_train_selected_numeric_finite.mean())

# Drop rows that are still incomplete (e.g. a column with no valid value at all)
X_train_selected_numeric_cleaned = X_train_selected_numeric_filled.dropna()

# Keep the target aligned with the rows that survived the cleaning
y_train_cleaned = y_train.loc[X_train_selected_numeric_cleaned.index]

if X_train_selected_numeric_cleaned.shape[0] > 0:
    build_model(X_train_selected_numeric_cleaned, y_train_cleaned)
else:
    print("No rows remaining after data cleaning. Please review the data cleaning process.")

