### Logistic Regression

`adults.csv` dataset

In [26]:
import pandas as pd
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

In [27]:
data = pd.read_csv("data/adult.csv.zip")

In [28]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [29]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [30]:
# arrange X and y
X = data.drop(columns=["income"])
y = data["income"] #our target

# train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=99) #30% testing // 70% training

# fing categorical columns TO DO OHE
X_train_cat = X_train.select_dtypes("O")
X_train_cat 

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,gender,native-country
5765,Self-emp-inc,Prof-school,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States
2336,Private,Assoc-voc,Married-civ-spouse,Sales,Husband,White,Male,United-States
22156,Self-emp-not-inc,Some-college,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States
38574,Self-emp-not-inc,Bachelors,Married-civ-spouse,Farming-fishing,Husband,White,Male,United-States
43755,Private,Assoc-acdm,Married-civ-spouse,Craft-repair,Husband,White,Male,United-States
...,...,...,...,...,...,...,...,...
42697,?,10th,Never-married,?,Own-child,White,Male,United-States
36008,Private,Bachelors,Never-married,Sales,Not-in-family,White,Male,?
46265,State-gov,Masters,Never-married,Tech-support,Not-in-family,White,Male,United-States
23587,Local-gov,Bachelors,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States


In [31]:
# apply OHE to categorical columns
ohe = OneHotEncoder(sparse_output=False)
cat_data_ohe = ohe.fit_transform(X_train_cat)
cat_data_ohe = pd.DataFrame(cat_data_ohe, columns=ohe.get_feature_names_out())

# join OHEed columns to the original X matrix
X_train_full = pd.concat([X_train.reset_index(drop=True), cat_data_ohe], axis=1)
X_train_full = X_train_full.drop(columns=X_train_cat.columns)

##  Build the model again--> I include cross validation in the training process using KFold cross validation

In [32]:
from sklearn.model_selection import cross_validate

In [33]:
lr = LogisticRegression()

In [34]:
results = cross_validate(
    estimator=lr, #THE MODEL HERE!
    X=X_train_full, #the one with OHE!!
    y=y_train,
    cv=5,
    n_jobs=-1,
    return_train_score=True
)

In [35]:
results #from array to a DF

{'fit_time': array([0.47555208, 0.47362018, 0.56044292, 0.54690123, 0.54160523]),
 'score_time': array([0.02323008, 0.02882719, 0.02495885, 0.02368522, 0.02407217]),
 'test_score': array([0.80330506, 0.79189822, 0.79379936, 0.79730915, 0.79757203]),
 'train_score': array([0.79481555, 0.79821579, 0.79795986, 0.79649739, 0.79650483])}

In [36]:
results = pd.DataFrame(results)
results

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.475552,0.02323,0.803305,0.794816
1,0.47362,0.028827,0.791898,0.798216
2,0.560443,0.024959,0.793799,0.79796
3,0.546901,0.023685,0.797309,0.796497
4,0.541605,0.024072,0.797572,0.796505


In [37]:
results = cross_validate(
    estimator=lr,
    X=X_train_full,
    y=y_train,
    cv=5, #5 folds!
    scoring=["precision","recall","f1"],
    n_jobs=-1,
    return_train_score=True
)

In [38]:
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, roc_auc_score

In [39]:
# In this case we have to build the scorers with "make_scorer" because of the "pos_label>50K". The vanilla scorer only accepts a "1" as "pos_label" !!!

metrics = {
    "precision": make_scorer(precision_score, pos_label=">50K"),
    "recall": make_scorer(recall_score, pos_label=">50K"),
    "f1": make_scorer(f1_score, pos_label=">50K"),
    "roc_auc": make_scorer(roc_auc_score, needs_proba=True)
}

The pos_label parameter is used to define which class should be considered as the positive class when calculating these metrics. In most binary classification problems, there are two classes: a positive class (usually labeled as 1) and a negative class (usually labeled as 0). However, there are cases where the positive class is labeled differently, such as ">50K" in your example.

The make_scorer function is used to create custom scoring functions for these metrics, especially when the default behavior doesn't match your specific problem. 

In [40]:
results = cross_validate(
    estimator=lr,
    X=X_train_full,
    y=y_train,
    cv=5,
    scoring=metrics, #PUT HERE THE METRICS!!!
    n_jobs=-1,
    return_train_score=True
)

In [41]:
results = pd.DataFrame(results)
results

Unnamed: 0,fit_time,score_time,test_precision,train_precision,test_recall,train_recall,test_f1,train_f1,test_roc_auc,train_roc_auc
0,1.078526,0.435277,0.75082,0.710941,0.277408,0.252915,0.40513,0.373101,0.584597,0.57505
1,1.149309,0.381448,0.685065,0.723967,0.255603,0.265334,0.372298,0.388341,0.572464,0.569718
2,1.271846,0.384601,0.711775,0.721879,0.245306,0.265334,0.364865,0.38804,0.561657,0.5785
3,1.214307,0.389257,0.721202,0.712936,0.26166,0.262911,0.384,0.384156,0.574348,0.573889
4,1.216376,0.382555,0.71521,0.711837,0.267879,0.264082,0.389771,0.385244,0.574689,0.571815


In [42]:
results.T

Unnamed: 0,0,1,2,3,4
fit_time,1.078526,1.149309,1.271846,1.214307,1.216376
score_time,0.435277,0.381448,0.384601,0.389257,0.382555
test_precision,0.75082,0.685065,0.711775,0.721202,0.71521
train_precision,0.710941,0.723967,0.721879,0.712936,0.711837
test_recall,0.277408,0.255603,0.245306,0.26166,0.267879
train_recall,0.252915,0.265334,0.265334,0.262911,0.264082
test_f1,0.40513,0.372298,0.364865,0.384,0.389771
train_f1,0.373101,0.388341,0.38804,0.384156,0.385244
test_roc_auc,0.584597,0.572464,0.561657,0.574348,0.574689
train_roc_auc,0.57505,0.569718,0.5785,0.573889,0.571815



In Python, T is often used as an attribute or method to transpose a data structure. Specifically, when you see .T used with an array or a matrix, it means to transpose the data, which involves swapping rows and columns.

Let's get the charts

In [43]:
px.bar(
    results,
    y=["test_precision","test_recall","test_f1","test_roc_auc"],
    barmode="group",
    labels={"value":"Score", "index":"Round"},
    template="none"
)

##  train  model with the entire training set and evaluate it with the test set

In [44]:
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, roc_auc_score

In [45]:
lr.fit(X_train_full, y_train)

### Prepare the test set

In [46]:
# fing categorical columns
X_test_cat = X_test.select_dtypes("O")

# apply OHE to categorical columns
cat_data_ohe = ohe.transform(X_test_cat)
cat_data_ohe = pd.DataFrame(cat_data_ohe, columns=ohe.get_feature_names_out())

# join OHEed columns to the original X matrix
X_test_full = pd.concat([X_test.reset_index(drop=True), cat_data_ohe], axis=1)
X_test_full = X_test_full.drop(columns=X_test_cat.columns)

Use the model for prediction

In [47]:
pred = lr.predict(X_test_full)
probas = lr.predict_proba(X_test_full)

Calculate metrics on the test set

In [48]:
precision_test = precision_score(y_test, pred, pos_label=">50K")
recall_test = recall_score(y_test, pred, pos_label=">50K")
f1_test = f1_score(y_test, pred, pos_label=">50K")
roc_auc_test = roc_auc_score(y_test, probas[:,1])

In [49]:
print(f"Test Precision: {precision_test}")
print(f"Test Recall: {recall_test}")
print(f"Test F1: {f1_test}")
print(f"Test ROC_AUC: {roc_auc_test}")

Test Precision: 0.7074164629176855
Test Recall: 0.25284008156131665
Test F1: 0.37253218884120176
Test ROC_AUC: 0.5762631281890719


In [50]:
import joblib
joblib.dump(lr, "lr3.pkl")
my_model = joblib.load("lr3.pkl")