In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier


In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Step 1: Load Data
def load_data(file_path):
    df = pd.read_csv(file_path)
    print(df.info())
    return df

# Step 2: Select Columns
def select_columns(df, columns):
    selected_df = df[columns]
    print(selected_df.head())
    return selected_df

# Step 3: Handle Missing Values
def fill_missing_country(df):
    # Extract unique City-Country mapping
    city_country_map = df.dropna(subset=['City', 'Country']).set_index('City')['Country'].to_dict()
    # Fill missing 'Country' values based on 'City'
    df['Country'] = df.apply(lambda row: city_country_map.get(row['City'], row['Country']), axis=1)
    print(city_country_map)
    return df

def handle_missing_specialization(df):
    # Fill missing values in 'Specialization'
    df['Specialization'] = df['Specialization'].fillna('Select')
    return df

# Step 4: Generate New Columns
def add_decision_authority(df):
    authority_levels = [5, 4, 3, 2, 1]
    probabilities = [0.05, 0.15, 0.25, 0.35, 0.20]
    df['Decision_Authority'] = np.random.choice(authority_levels, size=len(df), p=probabilities)
    return df

def add_job_position(df):
    job_titles_map = {
        5: ['CEO', 'CFO', 'CTO', 'COO', 'Chief Innovation Officer'],
        4: ['Vice President', 'Director of Operations', 'Head of Marketing', 'Senior Strategy Director'],
        3: ['Project Manager', 'Marketing Manager', 'Team Lead', 'Operations Manager'],
        2: ['Data Analyst', 'Software Engineer', 'Business Consultant', 'Technical Specialist'],
        1: ['Intern', 'Junior Developer', 'Assistant Coordinator', 'Associate Analyst']
    }
    df['Job Position'] = df['Decision_Authority'].apply(lambda x: np.random.choice(job_titles_map.get(x, ['Unknown'])))
    return df


def extract_decision_authority(df, job_titles_map):
    """
    Reads the 'Job Position' column and extracts the corresponding Decision Authority level
    based on the provided job_titles_map, creating a new column.

    Args:
    df (pd.DataFrame): The DataFrame containing the 'Job Position' column.
    job_titles_map (dict): A dictionary mapping Decision Authority levels to job titles.

    Returns:
    pd.DataFrame: The DataFrame with a new 'Extracted Decision Authority' column.
    """
    # Reverse the mapping: Job Title → Decision Authority
    reversed_map = {}
    for authority, titles in job_titles_map.items():
        for title in titles:
            reversed_map[title] = authority

    # Extract Decision Authority using the reversed mapping
    df['Extracted Decision Authority'] = df['Job Position'].map(reversed_map).fillna(0)  # Fill unknown titles with 0 or any default

    return df



# Step 5: Encode Binary Columns
def encode_binary_columns(df, columns, encoding_rules):
    for col in columns:
        encoded_col = f"{col} Encoded"
        df[encoded_col] = df[col].str.lower().map(encoding_rules).fillna(0)
    df.drop(columns, axis=1, inplace=True)
    return df

# Step 6: One-Hot Encoding
def one_hot_encode(df, columns):
    encoder = OneHotEncoder(sparse_output=False, drop='first')
    encoded_data = encoder.fit_transform(df[columns])
    encoded_df = pd.DataFrame(
        encoded_data,
        columns=encoder.get_feature_names_out(columns)
    )
    df = pd.concat([df.drop(columns, axis=1), encoded_df], axis=1)
    return df

# Step 7: Split Data
def split_data(df, target_column):
    X = df.drop(target_column, axis=1)
    y = df[target_column]
    return X, y

# Main Pipeline
def main_pipeline(file_path):
    # Define job_titles_map
    job_titles_map = {
        5: ['CEO', 'CFO', 'CTO', 'COO', 'Chief Innovation Officer'],
        4: ['Vice President', 'Director of Operations', 'Head of Marketing', 'Senior Strategy Director'],
        3: ['Project Manager', 'Marketing Manager', 'Team Lead', 'Operations Manager'],
        2: ['Data Analyst', 'Software Engineer', 'Business Consultant', 'Technical Specialist'],
        1: ['Intern', 'Junior Developer', 'Assistant Coordinator', 'Associate Analyst']
    }
    selected_columns = [
        'Lead Source', 'Do Not Email', 'Do Not Call', 'Country', 'City','Specialization',
         'Through Recommendations', 'Job Position'
         , 'Last Notable Activity', 'Converted'
    ]


    # Load and preprocess data
    df = load_data(file_path)
    df = add_decision_authority(df)
    df = add_job_position(df)
    df = select_columns(df, selected_columns)
    df = fill_missing_country(df)
    df = handle_missing_specialization(df)
    df = extract_decision_authority(df,job_titles_map)

    print(df.head(5))

    encoding_rules = {'yes': 1, 'y': 1, 'no': 0, 'n': 0}
    binary_columns = ['Through Recommendations', 'Do Not Call', 'Do Not Email']
    df = encode_binary_columns(df, binary_columns, encoding_rules)

    #print(df.head(5))

    # categorical_columns = ['Lead Source', 'Country', 'Specialization', 'City', 'Last Notable Activity']
    # df = one_hot_encode(df, categorical_columns)
    #
    # # Save processed data
    # df.to_csv('processed_leads.csv', index=False)
    #
    # # Split data for modeling
    # X, y = split_data(df, 'Converted')
    # return X, y

file_path = r'C:\Users\islam\Odoo Intern\CRM Project\CRM-Sales-Prediction\archive\Lead Scoring.csv'
main_pipeline(file_path)



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 37 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Prospect ID                                    9240 non-null   object 
 1   Lead Number                                    9240 non-null   int64  
 2   Lead Origin                                    9240 non-null   object 
 3   Lead Source                                    9204 non-null   object 
 4   Do Not Email                                   9240 non-null   object 
 5   Do Not Call                                    9240 non-null   object 
 6   Converted                                      9240 non-null   int64  
 7   TotalVisits                                    9103 non-null   float64
 8   Total Time Spent on Website                    9240 non-null   int64  
 9   Page Views Per Visit                           9103 

In [34]:
# Main Pipeline
def main_pipeline(file_path):
    # Load and preprocess data
    df = load_data(file_path)
    df = add_decision_authority(df)
    df = add_job_position(df)

    df = df.drop(['Decision_Authority'], axis=1)


    selected_columns = [
        'Lead Source', 'Do Not Email', 'Do Not Call', 'Country', 'Specialization',
        'Job Position', 'Through Recommendations', 'City', 'Last Notable Activity', 'Converted'
    ]


    df = select_columns(df, selected_columns)

    print(df.head(5))
    print(df.isnull().sum())


    df = fill_missing_country(df)
    print(df.isnull().sum())

    df = handle_missing_specialization(df)
    print(df.isnull().sum())


    df = extract_decision_authority(df,job_titles_map)
    print(df['Extracted Decision Authority'])
    df = df.drop(['Job Position'], axis=1)




    encoding_rules = {'yes': 1, 'y': 1, 'no': 0, 'n': 0}
    binary_columns = ['Through Recommendations', 'Do Not Call', 'Do Not Email']
    df = encode_binary_columns(df, binary_columns, encoding_rules)

    categorical_columns = ['Lead Source', 'Country', 'Specialization', 'City', 'Last Notable Activity']
    df = one_hot_encode(df, categorical_columns)

    # Save processed data
    df.to_csv('processed_leads.csv', index=False)

    print(df.head())





   # Define job_titles_map
job_titles_map = {
    5: ['CEO', 'CFO', 'CTO', 'COO', 'Chief Innovation Officer'],
    4: ['Vice President', 'Director of Operations', 'Head of Marketing', 'Senior Strategy Director'],
    3: ['Project Manager', 'Marketing Manager', 'Team Lead', 'Operations Manager'],
    2: ['Data Analyst', 'Software Engineer', 'Business Consultant', 'Technical Specialist'],
    1: ['Intern', 'Junior Developer', 'Assistant Coordinator', 'Associate Analyst']
}






file_path = r'C:\Users\islam\Odoo Intern\CRM Project\CRM-Sales-Prediction\archive\Lead Scoring.csv'


main_pipeline(file_path)






<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 37 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Prospect ID                                    9240 non-null   object 
 1   Lead Number                                    9240 non-null   int64  
 2   Lead Origin                                    9240 non-null   object 
 3   Lead Source                                    9204 non-null   object 
 4   Do Not Email                                   9240 non-null   object 
 5   Do Not Call                                    9240 non-null   object 
 6   Converted                                      9240 non-null   int64  
 7   TotalVisits                                    9103 non-null   float64
 8   Total Time Spent on Website                    9240 non-null   int64  
 9   Page Views Per Visit                           9103 

PermissionError: [Errno 13] Permission denied: 'processed_leads.csv'

Imports

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import  roc_auc_score
from sklearn.datasets import make_moons
from sklearn.ensemble import GradientBoostingClassifier




Splitting Data

In [28]:

df = load_data(r'C:\Users\islam\Odoo Intern\CRM Project\CRM-Sales-Prediction\archive\processed_leads.csv')

y = df['Converted']
X = df.drop('Converted', axis=1)  # All features except target



X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,    # 20% for testing
    random_state=42,  # Ensures reproducibility
    stratify=y        # Preserves class balance in splits
)# Targ

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9240 entries, 0 to 9239
Data columns (total 78 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   Converted                                           9240 non-null   int64  
 1   Extracted Decision Authority                        9240 non-null   int64  
 2   Through Recommendations Encoded                     9240 non-null   int64  
 3   Do Not Call Encoded                                 9240 non-null   int64  
 4   Do Not Email Encoded                                9240 non-null   int64  
 5   Lead Source_Direct Traffic                          9240 non-null   float64
 6   Lead Source_Facebook                                9240 non-null   float64
 7   Lead Source_Google                                  9240 non-null   float64
 8   Lead Source_Live Chat                               9240 non-null   float64
 9

In [32]:
print(X_train.head(5))
y_train.head(5)

      Extracted Decision Authority  Through Recommendations Encoded  \
9067                             4                                0   
6093                             3                                0   
855                              5                                0   
6053                             4                                0   
292                              3                                0   

      Do Not Call Encoded  Do Not Email Encoded  Lead Source_Direct Traffic  \
9067                    0                     0                         0.0   
6093                    0                     0                         0.0   
855                     0                     0                         0.0   
6053                    0                     0                         0.0   
292                     0                     0                         1.0   

      Lead Source_Facebook  Lead Source_Google  Lead Source_Live Chat  \
9067                   0.

9067    1
6093    0
855     1
6053    0
292     0
Name: Converted, dtype: int64

Logistic Regression

In [29]:
model = LogisticRegression(max_iter=1000)  # Increase max_iter if needed
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

print(classification_report(y_test, y_pred))
# 3. Get probabilities
y_prob = model.predict_proba(X_test)[:, 1]  # P(conversion)

# 4. Add probabilities back to DataFrame (for analysis)
test_df = X_test.copy()
test_df['conversion_prob'] = y_prob
test_df['actual'] = y_test  # Optional: Compare with ground truth

print(test_df[['conversion_prob', 'actual']].head(20))



Accuracy: 0.7310606060606061
              precision    recall  f1-score   support

           0       0.74      0.87      0.80      1136
           1       0.71      0.51      0.59       712

    accuracy                           0.73      1848
   macro avg       0.72      0.69      0.70      1848
weighted avg       0.73      0.73      0.72      1848

              precision    recall  f1-score   support

           0       0.74      0.87      0.80      1136
           1       0.71      0.51      0.59       712

    accuracy                           0.73      1848
   macro avg       0.72      0.69      0.70      1848
weighted avg       0.73      0.73      0.72      1848

      conversion_prob  actual
683          0.374197       1
1931         0.216412       1
6950         0.688843       0
2996         0.307172       0
3902         0.272824       0
6828         0.313542       0
4906         0.489472       0
5796         0.201957       0
6388         0.088975       0
3636         0.14

RandomForests

In [30]:
# Initialize and train
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
# Get raw probabilities (uncalibrated)

# Get raw probabilities (uncalibrated)
y_prob_uncalibrated = rf.predict_proba(X_test)[:, 1]

print(y_prob_uncalibrated)

# 4. Add probabilities back to DataFrame (for analysis)
test_df = X_test.copy()
test_df['conversion_prob'] = y_prob_uncalibrated
test_df['actual'] = y_test  # Optional: Compare with ground truth

print(test_df[['conversion_prob', 'actual']].head(20))

Accuracy: 0.7175324675324676
              precision    recall  f1-score   support

           0       0.76      0.80      0.78      1136
           1       0.65      0.59      0.62       712

    accuracy                           0.72      1848
   macro avg       0.70      0.69      0.70      1848
weighted avg       0.71      0.72      0.71      1848

[0.54       0.1525203  0.25185897 ... 0.22514313 0.10543651 0.12456091]
      conversion_prob  actual
683          0.540000       1
1931         0.152520       1
6950         0.251859       0
2996         0.255325       0
3902         0.364667       0
6828         0.792286       0
4906         0.901374       0
5796         0.010000       0
6388         0.080737       0
3636         0.013270       0
3685         0.603223       1
3279         0.148870       0
3799         0.456162       0
9158         0.561009       1
5766         0.509620       0
4341         0.061833       1
3043         0.058696       0
1528         0.809705       1
18

Trying out both models on synthetic non-linear dataset, RF outperformed LR as we see.

In [None]:

from sklearn.metrics import  roc_auc_score
from sklearn.datasets import make_moons
X_nl, y_nl = make_moons(n_samples=1000, noise=0.3, random_state=42)

# Train both models
lr_nl = LogisticRegression().fit(X_nl, y_nl)
rf_nl = RandomForestClassifier(random_state=42).fit(X_nl, y_nl)

# Compare AUC
print("LR AUC (Non-linear):", roc_auc_score(y_nl, lr_nl.predict_proba(X_nl)[:, 1]))  # Likely poor
print("RF AUC (Non-linear):", roc_auc_score(y_nl, rf_nl.predict_proba(X_nl)[:, 1]))  # Should be >0.9

GradientBoosting

In [31]:
from sklearn.ensemble import GradientBoostingClassifier


# Initialize the Gradient Boosting Regressor
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Fit the model to your training data
gb_model.fit(X_train, y_train)

# Make predictions
y_pred = gb_model.predict(X_test)

# Evaluate the model
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Get predicted probabilities
proba = gb_model.predict_proba(X_test)

# Extract the probability of the positive class (e.g., lead becoming a sale)
lead_sale_prob = proba[:, 1]

# 4. Add probabilities back to DataFrame (for analysis)
test_df = X_test.copy()
test_df['conversion_prob'] = y_prob
test_df['actual'] = y_test  # Optional: Compare with ground truth

print(test_df[['conversion_prob', 'actual']].head(20))





Accuracy: 0.735
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.86      0.80      1136
           1       0.71      0.53      0.61       712

    accuracy                           0.73      1848
   macro avg       0.73      0.70      0.70      1848
weighted avg       0.73      0.73      0.73      1848

      conversion_prob  actual
683          0.374197       1
1931         0.216412       1
6950         0.688843       0
2996         0.307172       0
3902         0.272824       0
6828         0.313542       0
4906         0.489472       0
5796         0.201957       0
6388         0.088975       0
3636         0.145730       0
3685         0.651288       1
3279         0.104081       0
3799         0.454617       0
9158         0.770943       1
5766         0.431324       0
4341         0.341087       1
3043         0.106813       0
1528         0.760533       1
1811         0.177686       0
900          0.284213       1


Considering that all the models have similar outputs. Random forests seem to be an optimal choice since they capture non-linearity and are less computationally expensive than gradient boosting. Logistic regression yielded good scores but, in the future, if the dataset is more complex, it will probably fail to capture its patterns.

We will further fine-tune and calibrate it.

In [None]:
# # first accuracy:  0.9128
# first f1: 0.88


#  2nd accuracy: 0.9134 , n_estimators=200
#  2nd f1: 0.89

# 3rd accuracy:0.9139  , n_estimators=500
# 3rd f1: 0.89

# according to available litrature, the obtained f1 score is considered good.

from sklearn.ensemble import RandomForestClassifier

# Initialize and train
rf = RandomForestClassifier(n_estimators=500, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
# Get raw probabilities (uncalibrated)

Calibrating Random Forests