In [3]:
import pandas as pd 
import numpy as np

In [4]:
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv')

In [5]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [6]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [7]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [8]:
df[df.select_dtypes(include='object').columns] = df.select_dtypes(include='object').fillna('NA')
df.isnull().sum()

lead_source                   0
industry                      0
number_of_courses_viewed      0
annual_income               181
employment_status             0
location                      0
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [9]:
df[df.select_dtypes(include='float64').columns] = df.select_dtypes(include='float64').fillna('0')
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [10]:
mode_industry = df['industry'].mode()[0]
print (mode_industry)

retail


In [11]:
numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']


In [12]:
df[numerical]

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
0,1,79450.0,4,0.94
1,1,46992.0,1,0.80
2,5,78796.0,3,0.69
3,2,83843.0,1,0.87
4,3,85012.0,3,0.62
...,...,...,...,...
1457,1,0,4,0.53
1458,3,65259.0,2,0.24
1459,1,45688.0,3,0.02
1460,5,71016.0,0,0.25


In [13]:
df[numerical].corr()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879
annual_income,0.00977,1.0,0.027036,0.01561
interaction_count,-0.023565,0.027036,1.0,0.009888
lead_score,-0.004879,0.01561,0.009888,1.0


In [14]:
from sklearn.model_selection import train_test_split 

In [15]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [16]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [17]:
len(df_full_train), len(df_test), len(df_val)

(1169, 293, 293)

In [18]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [19]:
y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

In [20]:
del df_train['converted']
del df_val['converted']
del df_test['converted']

In [21]:
df_train

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score
0,paid_ads,retail,0,58472.0,student,middle_east,5,0.03
1,organic_search,manufacturing,3,71738.0,student,middle_east,6,0.77
2,paid_ads,technology,3,81973.0,employed,north_america,2,0.59
3,,technology,1,74956.0,employed,europe,3,0.34
4,organic_search,retail,3,59335.0,student,australia,1,0.98
...,...,...,...,...,...,...,...,...
871,organic_search,other,1,43907.0,employed,australia,4,0.33
872,social_media,retail,3,64969.0,employed,north_america,1,0.18
873,,education,3,89042.0,employed,asia,4,0.75
874,social_media,manufacturing,1,0,self_employed,europe,1,0.65


In [22]:
from sklearn.metrics import mutual_info_score

In [23]:
categorical = ['lead_source', 'industry', 'employment_status', 'location']

In [24]:
def mutual_info_converted_score(c):
    return round(mutual_info_score(c, df_full_train.converted),2)

In [25]:
df_full_train[categorical].apply(mutual_info_converted_score)

lead_source          0.03
industry             0.01
employment_status    0.01
location             0.00
dtype: float64

In [26]:
from sklearn.feature_extraction import DictVectorizer

In [27]:
train_dicts = df_train[categorical + numerical].to_dict(orient='records')


In [28]:
dv = DictVectorizer(sparse=False)



In [29]:
X_train = dv.fit_transform(train_dicts)

In [30]:
val_dicts = df_val[categorical + numerical].to_dict(orient='records')

In [31]:
X_val = dv.fit_transform(val_dicts)

In [32]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [33]:
from sklearn.linear_model import LogisticRegression

In [34]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)


In [35]:
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [36]:
model.intercept_[0]

np.float64(-0.063871454675127)

In [37]:
model.coef_[0].round(3)

array([-0.   , -0.033, -0.014,  0.033,  0.003,  0.012, -0.098, -0.024,
        0.048, -0.019, -0.012, -0.003, -0.009, -0.03 , -0.015,  0.317,
        0.051,  0.02 , -0.011, -0.011, -0.111,  0.077, -0.028,  0.004,
       -0.011, -0.011, -0.005,  0.009,  0.006, -0.032, -0.024,  0.45 ])

In [38]:
y_pred = model.predict_proba(X_val)[:,1]

In [39]:
conv_decision = (y_pred > 0.5)

In [40]:
(y_val == conv_decision).mean()

np.float64(0.6996587030716723)

In [41]:
from sklearn.metrics import accuracy_score

In [42]:
y_pred_class = (y_pred >= 0.5).astype(int)

In [43]:
acc = accuracy_score(y_val, y_pred_class)

print("Validation accuracy:", round(acc, 2))

Validation accuracy: 0.7


In [45]:
original_acc = acc
features_t = ['industry', 'employment_status', 'lead_score']
results = []
diffs = {}



In [46]:
for i, f in enumerate(feature_names):
    # Supprimer la colonne i
    X_train_drop = np.delete(X_train, i, axis=1)
    X_val_drop = np.delete(X_val, i, axis=1)
    
    # Entraîner le modèle
    m = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    m.fit(X_train_drop, y_train)
    
    # Accuracy sur la validation
    y_pred = m.predict(X_val_drop)
    acc_drop = accuracy_score(y_val, y_pred)
    
    # Différence avec l'original
    diff = original_acc - acc_drop
    results.append((f, round(acc_drop, 3), round(diff, 3)))


NameError: name 'feature_names' is not defined

In [47]:
df_results = pd.DataFrame(results, columns=['feature', 'accuracy', 'diff'])
print(df_results.sort_values('diff'))

Empty DataFrame
Columns: [feature, accuracy, diff]
Index: []


In [48]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Valeurs de C à tester
C_values = [0.01, 0.1, 1, 10, 100]

acc_C = {}

for C in C_values:
    # Créer et entraîner le modèle
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)  # ou df_train si on a utilisé get_dummies
    
    # Prédiction sur validation
    y_pred = model.predict(X_val)
    
    # Calculer l'accuracy et arrondir à 3 décimales
    acc_C[C] = accuracy_score(y_val, y_pred)

# Afficher les résultats
for C, acc in acc_C.items():
    print(f"C={C:6} | Validation accuracy = {acc}")

# Trouver la meilleure C
best_acc = max(acc_C.values())
best_C = min([C for C, acc in acc_C.items() if acc == best_acc])  # plus petit C si plusieurs options
print("\nBest C:", best_C, "| Best validation accuracy:", best_acc)


C=  0.01 | Validation accuracy = 0.6996587030716723
C=   0.1 | Validation accuracy = 0.6996587030716723
C=     1 | Validation accuracy = 0.6996587030716723
C=    10 | Validation accuracy = 0.6996587030716723
C=   100 | Validation accuracy = 0.6996587030716723

Best C: 0.01 | Best validation accuracy: 0.6996587030716723
