In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'

In [4]:
!wget $data

--2025-12-14 16:34:30--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80876 (79K) [text/plain]
Saving to: ‘course_lead_scoring.csv.1’


2025-12-14 16:34:31 (32.2 MB/s) - ‘course_lead_scoring.csv.1’ saved [80876/80876]



In [5]:
df = pd.read_csv(data)
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [6]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [7]:
string_columns = list(df.dtypes[df.dtypes == 'object'].index)
string_columns

['lead_source', 'industry', 'employment_status', 'location']

In [8]:
for col in string_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')
    df[col] = df[col].str.replace('-','_')

In [9]:
df.columns

Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score',
       'converted'],
      dtype='object')

In [10]:
df[string_columns] = df[string_columns].fillna('NA')
df[string_columns]

Unnamed: 0,lead_source,industry,employment_status,location
0,paid_ads,,unemployed,south_america
1,social_media,retail,employed,south_america
2,events,healthcare,unemployed,australia
3,paid_ads,retail,,australia
4,referral,education,self_employed,europe
...,...,...,...,...
1457,referral,manufacturing,self_employed,north_america
1458,referral,technology,student,europe
1459,paid_ads,technology,student,north_america
1460,referral,,self_employed,north_america


In [11]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [12]:
df['annual_income'] = df['annual_income'].fillna(0.0)
df['annual_income']

0       79450.0
1       46992.0
2       78796.0
3       83843.0
4       85012.0
         ...   
1457        0.0
1458    65259.0
1459    45688.0
1460    71016.0
1461    92855.0
Name: annual_income, Length: 1462, dtype: float64

In [13]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [14]:
df['industry'].value_counts()

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

In [15]:
num_columns = df[['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']]
num_columns

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
0,1,79450.0,4,0.94
1,1,46992.0,1,0.80
2,5,78796.0,3,0.69
3,2,83843.0,1,0.87
4,3,85012.0,3,0.62
...,...,...,...,...
1457,1,0.0,4,0.53
1458,3,65259.0,2,0.24
1459,1,45688.0,3,0.02
1460,5,71016.0,0,0.25


In [16]:
cre = df[['interaction_count', 'lead_score']].corr()
cre

Unnamed: 0,interaction_count,lead_score
interaction_count,1.0,0.009888
lead_score,0.009888,1.0


In [17]:
cro = df[['number_of_courses_viewed', 'lead_score']].corr()
cro

Unnamed: 0,number_of_courses_viewed,lead_score
number_of_courses_viewed,1.0,-0.004879
lead_score,-0.004879,1.0


In [18]:
cri = df[['number_of_courses_viewed', 'interaction_count']].corr()
cri

Unnamed: 0,number_of_courses_viewed,interaction_count
number_of_courses_viewed,1.0,-0.023565
interaction_count,-0.023565,1.0


In [19]:
cru = df[['annual_income', 'interaction_count']].corr()
cru

Unnamed: 0,annual_income,interaction_count
annual_income,1.0,0.027036
interaction_count,0.027036,1.0


Annual income and interaction count have the highest correlation

In [38]:
#import scikit learn to set up validation framework
from sklearn.model_selection import train_test_split

In [39]:
#spliting the full data set into training and testing set
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [40]:
#splitting the training dataset into training and validation sets
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

In [42]:
y_train = df_train.converted.values
y_val = df_val.converted.values

In [43]:
df_train

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
1077,paid_ads,retail,0,58472.0,student,middle_east,5,0.03,0
463,organic_search,manufacturing,3,71738.0,student,middle_east,6,0.77,1
842,paid_ads,technology,3,81973.0,employed,north_america,2,0.59,1
835,,technology,1,74956.0,employed,europe,3,0.34,1
837,organic_search,retail,3,59335.0,student,australia,1,0.98,1
...,...,...,...,...,...,...,...,...,...
725,organic_search,other,1,43907.0,employed,australia,4,0.33,1
401,social_media,retail,3,64969.0,employed,north_america,1,0.18,0
957,,education,3,89042.0,employed,asia,4,0.75,1
992,social_media,manufacturing,1,0.0,self_employed,europe,1,0.65,0


In [44]:
#creating categorical columns group
categorical = df_train[['lead_source', 'industry', 'employment_status', 'location']]
categorical

Unnamed: 0,lead_source,industry,employment_status,location
1077,paid_ads,retail,student,middle_east
463,organic_search,manufacturing,student,middle_east
842,paid_ads,technology,employed,north_america
835,,technology,employed,europe
837,organic_search,retail,student,australia
...,...,...,...,...
725,organic_search,other,employed,australia
401,social_media,retail,employed,north_america
957,,education,employed,asia
992,social_media,manufacturing,self_employed,europe


In [45]:
from sklearn.metrics import mutual_info_score

In [46]:
#calculating the mutual information between converted column and other categorical columns
#wrapping mutal information method into one function
def calculate_mi(series):
    return mutual_info_score(series, df_train.converted)

#Applying the function to all categorical columns
df_mi = categorical.apply(calculate_mi)
df_mi.sort_values(ascending=False).round(2)

lead_source          0.04
employment_status    0.01
industry             0.01
location             0.00
dtype: float64

The categorical column with the highest importance is lead_source

In [47]:
#deleting the converted columns in the train, test and validation dataset
del df_train['converted']
del df_val['converted']
del df_test['converted']

In [48]:
#one hot encoding for categorical columns, turn into a dictionary and transform to a vector
from sklearn.feature_extraction import DictVectorizer

In [49]:
#turn training data into dictionary
df_train_dict = df_train.to_dict(orient='records')

In [50]:
df_train_dict[0]

{'lead_source': 'paid_ads',
 'industry': 'retail',
 'number_of_courses_viewed': 0,
 'annual_income': 58472.0,
 'employment_status': 'student',
 'location': 'middle_east',
 'interaction_count': 5,
 'lead_score': 0.03}

In [51]:
#create an instance of vectorizer
dv = DictVectorizer(sparse=False)

In [52]:
#fit and transform train data for one hot encoding
One_hot_df_train = dv.fit_transform(df_train_dict)
One_hot_df_train

array([[5.8472e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [7.1738e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [8.1973e+04, 0.0000e+00, 1.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        3.0000e+00],
       ...,
       [8.9042e+04, 0.0000e+00, 1.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [5.0259e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        4.0000e+00]], shape=(876, 31))

In [53]:
df_val_dict = df_val.to_dict(orient='records')

In [57]:
#only transform validation data for one hot encoding
One_hot_df_val = dv.transform(df_val_dict)
One_hot_df_val

array([[5.2220e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [5.9656e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [5.7134e+04, 0.0000e+00, 0.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        0.0000e+00],
       ...,
       [7.4166e+04, 0.0000e+00, 1.0000e+00, ..., 0.0000e+00, 1.0000e+00,
        1.0000e+00],
       [3.9103e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        2.0000e+00],
       [4.7129e+04, 1.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.0000e+00,
        1.0000e+00]], shape=(293, 31))

In [58]:
#training with logistic regression
from sklearn.linear_model import LogisticRegression

In [59]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(One_hot_df_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [61]:
#to get the soft predeictions for the train dataset
model.predict_proba(One_hot_df_train)

array([[0.42085658, 0.57914342],
       [0.12716509, 0.87283491],
       [0.41183895, 0.58816105],
       ...,
       [0.25265786, 0.74734214],
       [0.3302157 , 0.6697843 ],
       [0.14407824, 0.85592176]], shape=(876, 2))

In [66]:
#getting prediction on validation dataset
y_pred = model.predict_proba(One_hot_df_val)[:, 1]
y_pred

array([0.61192162, 0.79982616, 0.53021342, 0.47131479, 0.5706613 ,
       0.44227166, 0.87127669, 0.84883114, 0.83290037, 0.614978  ,
       0.54968025, 0.78153087, 0.69039784, 0.77017121, 0.52659438,
       0.91706424, 0.53170633, 0.42123047, 0.30146454, 0.84881582,
       0.79488652, 0.73670373, 0.44527209, 0.64838383, 0.41768818,
       0.75393417, 0.90166115, 0.33903047, 0.43181429, 0.9680681 ,
       0.92018714, 0.37487987, 0.65230099, 0.90650057, 0.75164115,
       0.6420212 , 0.82250074, 0.83375553, 0.65911599, 0.30978853,
       0.78942264, 0.35546364, 0.96517758, 0.63389304, 0.51274194,
       0.53230532, 0.82287784, 0.744074  , 0.73452312, 0.68955216,
       0.46964441, 0.84539251, 0.55635242, 0.92637871, 0.65258021,
       0.61526271, 0.63816994, 0.28304016, 0.48049823, 0.57890616,
       0.35497341, 0.62175049, 0.38960775, 0.61156055, 0.85304277,
       0.75430135, 0.89185953, 0.71946457, 0.95387623, 0.89209517,
       0.75277086, 0.33850137, 0.61376593, 0.51622273, 0.64088

In [69]:
#getting the rows of converted 
converted_decision = (y_pred >= 0.5)
converted_decision

array([ True,  True,  True, False,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
       False,  True,  True,  True, False,  True, False,  True,  True,
       False, False,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True, False,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True, False, False,  True, False,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True, False, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True, False,  True,
       False,  True,  True, False,  True,  True, False,  True,  True,
       False,  True,

In [85]:
#getting the accuracy of the model
(y_val == converted_decision).mean()

np.float64(0.7030716723549488)

The Accuracy on the validation data is 0.7 (rounded to 2 decimals)

In [74]:
dict(zip(dv.get_feature_names_out(), model.coef_[0].round(3)))

{'annual_income': np.float64(-0.0),
 'employment_status=NA': np.float64(-0.015),
 'employment_status=employed': np.float64(0.034),
 'employment_status=self_employed': np.float64(0.003),
 'employment_status=student': np.float64(0.012),
 'employment_status=unemployed': np.float64(-0.103),
 'industry=NA': np.float64(-0.025),
 'industry=education': np.float64(0.049),
 'industry=finance': np.float64(-0.02),
 'industry=healthcare': np.float64(-0.013),
 'industry=manufacturing': np.float64(-0.003),
 'industry=other': np.float64(-0.009),
 'industry=retail': np.float64(-0.032),
 'industry=technology': np.float64(-0.016),
 'interaction_count': np.float64(0.311),
 'lead_score': np.float64(0.051),
 'lead_source=NA': np.float64(0.02),
 'lead_source=events': np.float64(-0.012),
 'lead_source=organic_search': np.float64(-0.012),
 'lead_source=paid_ads': np.float64(-0.115),
 'lead_source=referral': np.float64(0.08),
 'lead_source=social_media': np.float64(-0.03),
 'location=NA': np.float64(0.004),
 'l

In [75]:
#Elimnating features one by one to find the least useful feature
df_train

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score
1077,paid_ads,retail,0,58472.0,student,middle_east,5,0.03
463,organic_search,manufacturing,3,71738.0,student,middle_east,6,0.77
842,paid_ads,technology,3,81973.0,employed,north_america,2,0.59
835,,technology,1,74956.0,employed,europe,3,0.34
837,organic_search,retail,3,59335.0,student,australia,1,0.98
...,...,...,...,...,...,...,...,...
725,organic_search,other,1,43907.0,employed,australia,4,0.33
401,social_media,retail,3,64969.0,employed,north_america,1,0.18
957,,education,3,89042.0,employed,asia,4,0.75
992,social_media,manufacturing,1,0.0,self_employed,europe,1,0.65


In [106]:
#subset without lead_source
subset_without_lead_source = ['industry', 'number_of_courses_viewed', 'annual_income', 'employment_status', 'location', 'interaction_count', 'lead_score']
df_train[subset_without_lead_source]

Unnamed: 0,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score
1077,retail,0,58472.0,student,middle_east,5,0.03
463,manufacturing,3,71738.0,student,middle_east,6,0.77
842,technology,3,81973.0,employed,north_america,2,0.59
835,technology,1,74956.0,employed,europe,3,0.34
837,retail,3,59335.0,student,australia,1,0.98
...,...,...,...,...,...,...,...
725,other,1,43907.0,employed,australia,4,0.33
401,retail,3,64969.0,employed,north_america,1,0.18
957,education,3,89042.0,employed,asia,4,0.75
992,manufacturing,1,0.0,self_employed,europe,1,0.65


In [116]:
#Converting train dataset to dictionary
train_dict_without_lead_source = df_train[subset_without_lead_source].to_dict(orient='records')
dv_without_lead_source = DictVectorizer(sparse=False)
One_hot_df_train_without_lead_source = dv_without_lead_source.fit_transform(train_dict_without_lead_source)
#for validation dataset
df_val_dict_without_lead_source = df_val[subset_without_lead_source].to_dict(orient='records')
One_hot_df_val_without_lead_source = dv_without_lead_source.transform(df_val_dict_without_lead_source)

In [117]:
#training the model without lead_source
model_without_lead_source = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model_without_lead_source.fit(One_hot_df_train_without_lead_source, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [118]:
#getting the prediction with the validation dataset
y_pred_without_lead_source = model_without_lead_source.predict_proba(One_hot_df_val_without_lead_source)[:, 1]

In [119]:
#getting the accuracy without lead_source
converted_decision_without_lead_source = (y_pred_without_lead_source >= 0.5)
(y_val == converted_decision_without_lead_source).mean()

np.float64(0.7030716723549488)

In [113]:
#subset without employment_status
subset_without_emp_status = ['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income', 'location', 'interaction_count', 'lead_score']
df_train[subset_without_emp_status]

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,location,interaction_count,lead_score
1077,paid_ads,retail,0,58472.0,middle_east,5,0.03
463,organic_search,manufacturing,3,71738.0,middle_east,6,0.77
842,paid_ads,technology,3,81973.0,north_america,2,0.59
835,,technology,1,74956.0,europe,3,0.34
837,organic_search,retail,3,59335.0,australia,1,0.98
...,...,...,...,...,...,...,...
725,organic_search,other,1,43907.0,australia,4,0.33
401,social_media,retail,3,64969.0,north_america,1,0.18
957,,education,3,89042.0,asia,4,0.75
992,social_media,manufacturing,1,0.0,europe,1,0.65


In [114]:
#Converting train dataset to dictionary
train_dict_without_emp_status = df_train[subset_without_emp_status].to_dict(orient='records')
dv_without_emp_status = DictVectorizer(sparse=False)
One_hot_df_train_without_emp_status = dv_without_emp_status.fit_transform(train_dict_without_emp_status)
#for validation dataset
df_val_dict_without_emp_status = df_val[subset_without_emp_status].to_dict(orient='records')
One_hot_df_val_without_emp_status = dv_without_emp_status.transform(df_val_dict_without_emp_status)

In [120]:
#training the model without emp status
model_without_emp_status = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model_without_emp_status.fit(One_hot_df_train_without_emp_status, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [121]:
#getting the prediction with the validation dataset
y_pred_without_emp_status = model_without_emp_status.predict_proba(One_hot_df_val_without_emp_status)[:, 1]

In [122]:
#getting the accuracy without emp status
converted_decision_without_emp_status = (y_pred_without_emp_status >= 0.5)
(y_val == converted_decision_without_emp_status).mean()

np.float64(0.6962457337883959)

In [123]:
#subset without industry
subset_without_industry = ['lead_source', 'number_of_courses_viewed', 'annual_income','employment_status', 'location', 'interaction_count', 'lead_score']
df_train[subset_without_industry]

Unnamed: 0,lead_source,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score
1077,paid_ads,0,58472.0,student,middle_east,5,0.03
463,organic_search,3,71738.0,student,middle_east,6,0.77
842,paid_ads,3,81973.0,employed,north_america,2,0.59
835,,1,74956.0,employed,europe,3,0.34
837,organic_search,3,59335.0,student,australia,1,0.98
...,...,...,...,...,...,...,...
725,organic_search,1,43907.0,employed,australia,4,0.33
401,social_media,3,64969.0,employed,north_america,1,0.18
957,,3,89042.0,employed,asia,4,0.75
992,social_media,1,0.0,self_employed,europe,1,0.65


In [124]:
#Converting train dataset to dictionary
train_dict_without_industry = df_train[subset_without_industry].to_dict(orient='records')
dv_without_industry = DictVectorizer(sparse=False)
One_hot_df_train_without_industry = dv_without_industry.fit_transform(train_dict_without_industry)
#for validation dataset
df_val_dict_without_industry = df_val[subset_without_industry].to_dict(orient='records')
One_hot_df_val_without_industry = dv_without_industry.transform(df_val_dict_without_industry)

In [125]:
#training the model without industry
model_without_industry = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model_without_industry.fit(One_hot_df_train_without_industry, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [126]:
#getting the prediction with the validation dataset
y_pred_without_industry = model_without_industry.predict_proba(One_hot_df_val_without_industry)[:, 1]

In [127]:
#getting the accuracy without industry
converted_decision_without_industry = (y_pred_without_industry >= 0.5)
(y_val == converted_decision_without_industry).mean()

np.float64(0.6996587030716723)

The feature with the smallest difference is 'Lead_score'

In [136]:
#Training a regularized logistic regression
#Converting train dataset to dictionary
train_dict_reg = df_train.to_dict(orient='records')
dv_reg = DictVectorizer(sparse=False)
One_hot_df_train_reg = dv_reg.fit_transform(train_dict_reg)
#for validation dataset
df_val_dict_reg = df_val.to_dict(orient='records')
One_hot_df_val_reg = dv_reg.transform(df_val_dict_reg)

In [137]:
#training with C = 0.01
model_reg_001 = LogisticRegression(solver='liblinear', C=0.01, max_iter=1000, random_state=42)
model_reg_001.fit(One_hot_df_train_reg, y_train)
#getting the prediction with the validation dataset
y_pred_reg_001 = model_reg_001.predict_proba(One_hot_df_val_reg)[:, 1]
#getting the accuracy without industry
converted_decision_reg_001 = (y_pred_reg_001 >= 0.5)
(y_val == converted_decision_reg_001).mean()

np.float64(0.6996587030716723)

In [138]:
#training with C = 0.1
model_reg_01 = LogisticRegression(solver='liblinear', C=0.1, max_iter=1000, random_state=42)
model_reg_01.fit(One_hot_df_train_reg, y_train)
#getting the prediction with the validation dataset
y_pred_reg_01 = model_reg_01.predict_proba(One_hot_df_val_reg)[:, 1]
#getting the accuracy without industry
converted_decision_reg_01 = (y_pred_reg_01 >= 0.5)
(y_val == converted_decision_reg_01).mean()

np.float64(0.6996587030716723)

In [139]:
#training with 1
model_reg_1 = LogisticRegression(solver='liblinear', C=1, max_iter=1000, random_state=42)
model_reg_1.fit(One_hot_df_train_reg, y_train)
#getting the prediction with the validation dataset
y_pred_reg_1 = model_reg_1.predict_proba(One_hot_df_val_reg)[:, 1]
#getting the accuracy without industry
converted_decision_reg_1 = (y_pred_reg_01 >= 0.5)
(y_val == converted_decision_reg_1).mean()

np.float64(0.6996587030716723)

In [None]:
#training with 10
model_reg_1 = LogisticRegression(solver='liblinear', C=1, max_iter=1000, random_state=42)
model_reg_1.fit(One_hot_df_train_reg, y_train)
#getting the prediction with the validation dataset
y_pred_reg_1 = model_reg_1.predict_proba(One_hot_df_val_reg)[:, 1]
#getting the accuracy without industry
converted_decision_reg_1 = (y_pred_reg_01 >= 0.5)
(y_val == converted_decision_reg_1).mean()