# Week 4 Checkin - Logistic Regression

In [10]:
%pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [11]:
%pip install openpyxl

Note: you may need to restart the kernel to use updated packages.


In [12]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import matplotlib.pyplot as plt


In [13]:
logistic_data_orig = pd.read_excel('../hannah/clean_data.xlsx')
logistic_data_orig

FileNotFoundError: [Errno 2] No such file or directory: 'clean_data.xlsx'

In [4]:
logistic_data_orig.sample(7)

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
70478,86820,1wDTIaOD00oYLgQn37E1SC,Social Distortion,White Light White Heat White Trash,Through These Eyes,36,194093,False,0.424,0.923,...,-3.242,0,0.0373,5.6e-05,0.176,0.0546,0.742,94.977,4,punk
19782,21215,3MhdH8PxqH1FuQp3HBptUI,Sean Paul;Sasha;Jeremy Harding;Murray Elias,Dutty Rock,I'm Still in Love with You (feat. Sasha),71,273360,False,0.765,0.666,...,-5.384,1,0.172,0.102,0.0,0.116,0.756,87.002,4,dancehall
52256,61477,7ozdvdj2ap3UzD9LiqfH0l,Nogizaka46,走れ!Bicycle TypeA,走れ!Bicycle,22,222320,False,0.507,0.942,...,-2.124,1,0.0689,0.111,1e-06,0.246,0.635,165.074,4,j-idol
44724,50932,74cHpjY6ak43Crxn0LAzVD,OConnor,Yerba Mala Nunca Muere,La Sopa del Diablo,19,284373,False,0.573,0.754,...,-7.295,1,0.0248,7e-06,0.00525,0.279,0.309,112.007,4,heavy-metal
34237,38386,2AYlutp7ULT1xjhYo3ZO67,Los Diabólicos,Los Diabólicos,Muerte y Destrucción,30,108921,False,0.2,0.851,...,-5.757,1,0.053,0.011,0.977,0.888,0.381,92.95,4,garage
26870,28687,4Asr5wGaWknEpK9dKcSH5v,Horace Andy,Serious Times,Rastafari,28,234360,False,0.732,0.424,...,-10.875,1,0.208,0.163,0.0,0.338,0.582,83.699,4,dub
34705,38866,6lnnaGN20kl0jEYJSxCgU9,The Strokes,Room On Fire,You Talk Way Too Much,53,186266,False,0.304,0.971,...,-3.226,1,0.0351,0.0594,0.955,0.0799,0.695,127.856,4,garage



---


## Fitting logistic regression to the full training dataset


Next, compute a logistic regression fit for the entire training dataset:

In [5]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Separate the features and target variable
X = logistic_data_orig[["instrumentalness", "speechiness", "energy", "valence", "danceability", "acousticness"]]

y = (logistic_data_orig['popularity'] > 30).astype(int)  # Convert to binary: 1 if popularity > 33, else 0

# Apply LabelEncoder to each categorical column in X
for col in X.select_dtypes(include='object').columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

# Split data into training, validation, and test sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the logistic regression model with binary target
lr_all = LogisticRegression(solver='liblinear')
lr_all.fit(X_train, y_train)

# View intercept and coefficients
print("Intercept:", lr_all.intercept_)
print("Coefficients:", lr_all.coef_)




Intercept: [0.17933063]
Coefficients: [[-0.96508481 -2.19179382 -0.10068603 -0.76492574  1.34050641 -0.04720315]]


In [6]:
# Randomly select 50 sample indices from X_val
sample_indices = np.random.choice(X_val.index, size=50, replace=False)

# Select samples from X_val and y_val using the random indices
X_val_sample = X_val.loc[sample_indices, :]
y_val_sample = y_val.loc[sample_indices]

# Predict probabilities and classes for the sample
y_val_pred_probs = lr_all.predict_proba(X_val_sample)[:, 1]  # Probability for the positive class
y_val_pred_classes = lr_all.predict(X_val_sample)

# Create a DataFrame to display actual vs. predicted
evaluation_df = pd.DataFrame({
    'Is popular': y_val_sample,
    'Predicted Probability': y_val_pred_probs,
    'Predicted Popularity': y_val_pred_classes
})

# Display the evaluation DataFrame
print(evaluation_df)


       Is popular  Predicted Probability  Predicted Popularity
21651           1               0.602443                     1
69414           0               0.522899                     1
3716            1               0.475896                     0
86373           1               0.426351                     0
2844            1               0.596442                     1
14051           0               0.640599                     1
45711           0               0.570921                     1
36385           0               0.620737                     1
77980           0               0.594682                     1
45569           0               0.574490                     1
21428           0               0.546410                     1
44218           0               0.525007                     1
20285           1               0.528660                     1
77026           1               0.674518                     1
3715            1               0.394899               

In [7]:
y_val_pred = lr_all.predict(X_val)
conf_lr = metrics.confusion_matrix(y_true=y_val, y_pred=y_val_pred)
conf_lr

array([[2943, 5169],
       [2195, 7440]])

Prediction accuracy:

In [18]:
# Logistic regression
print((conf_lr[0, 0] + conf_lr[1, 1]) / conf_lr.sum())

0.5850566292894573


In [19]:
# true positive rate
tpr= (conf_lr[1,1])/conf_lr[1,:].sum()
# true negative rate
tnr= (conf_lr[0,0])/conf_lr[0,:].sum()

print("True Positive Rate:",tpr)
print("True Negative Rate:",tnr)

True Positive Rate: 0.772184743124027
True Negative Rate: 0.36279585798816566


***Predicted probability densities***


In [9]:
px.histogram(evaluation_df, x='Predicted Probability', color='Is popular',
             nbins=20, opacity=0.5,
             barmode='overlay')



***ROC Curve***

In [10]:
lr_fpr_sample, lr_tpr_sample, lr_thresholds_sample = metrics.roc_curve(evaluation_df['Is popular'], evaluation_df['Predicted Probability'])

lr_thresholds_sample


array([       inf, 0.6888668 , 0.65112905, 0.64964232, 0.64419215,
       0.64059887, 0.62301651, 0.62073686, 0.59602678, 0.59468227,
       0.58608228, 0.57092076, 0.551005  , 0.54259074, 0.52765725,
       0.51820062, 0.51549355, 0.49625784, 0.47589613, 0.43125544,
       0.42635092, 0.4079434 , 0.39489945, 0.28273001, 0.19259143])

In [11]:
roc_lr_sample = pd.DataFrame({
    'False Positive Rate': lr_fpr_sample,
    'True Positive Rate': lr_tpr_sample,
    'Model': 'Logistic Regression'
}, index=lr_thresholds_sample)


roc_sample_df = pd.concat([roc_lr_sample])


px.line(roc_sample_df, y='True Positive Rate', x='False Positive Rate',
        color='Model',
        width=700, height=500
)

In [12]:
# Logistic regression
lr_auc_sample = metrics.roc_auc_score(evaluation_df['Is popular'], evaluation_df['Predicted Probability'])
print('Logistic regression AUC:', lr_auc_sample.round(3))

Logistic regression AUC: 0.701


***USING CV FOR MODEL PERFORMANCE***

In [13]:
from sklearn.model_selection import cross_val_score


# Separate the features and target variable
X = logistic_data_orig[["instrumentalness", "speechiness", "energy", "valence", "danceability", "acousticness"]]

y = (logistic_data_orig['popularity'] > 30).astype(int)  # Convert to binary: 1 if popularity > 33, else 0

# This does stratified Kfolds for us...
cross_val_score(lr_all, X, y, cv=5, scoring='roc_auc')

array([0.61194458, 0.61632333, 0.62607415, 0.53816304, 0.48793798])

In [14]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

# Use the shuffle and random state if want data shuffled before splitting
#skfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
skfolds = StratifiedKFold(n_splits=5)
i = 1
for train_index, test_index in skfolds.split(X, y):
    clone_lr = clone(lr_all)
    X_train_folds = X.iloc[train_index]
    y_train_folds = y.iloc[train_index]
    X_test_fold = X.iloc[test_index]
    print(test_index)
    clone_lr.fit(X_train_folds, y_train_folds)
    y_pred = clone_lr.predict(X_test_fold)

    auc_sample = metrics.roc_auc_score(y.iloc[test_index], y_pred)
    print('Fold: ', i)
    print('AUC: ', auc_sample)
    print('Accuracy: ', metrics.accuracy_score(y.iloc[test_index], y_pred))
    i += 1

[    0     1     2 ... 19686 19687 19689]
Fold:  1
AUC:  0.5793129726060839
Accuracy:  0.5916492928382262
[17120 17121 17122 ... 36795 36796 36797]
Fold:  2
AUC:  0.583511244232712
Accuracy:  0.599199864765876
[34994 34995 34996 ... 57555 57556 57557]
Fold:  3
AUC:  0.5884538204495349
Accuracy:  0.6043051955370224
[50360 50361 50362 ... 73050 73056 73057]
Fold:  4
AUC:  0.5262817018206203
Accuracy:  0.543615462639468
[67736 67737 67738 ... 88729 88730 88731]
Fold:  5
AUC:  0.4887287537481254
Accuracy:  0.5067620872309253


In [5]:
# Side note: forward selection code 
from sklearn.feature_selection import SequentialFeatureSelector

# Only look at a subset of the data to speed up the computation
logistic_data_train_sub_X = X_train.iloc[:1000]
logistic_data_train_sub_y = y_train.iloc[:1000]

selector = SequentialFeatureSelector(
    lr_all,
    n_features_to_select=4,
    direction='forward',
    scoring='neg_mean_squared_error',
    cv = 5
)

selector.fit(X=logistic_data_train_sub_X.drop(columns="track_genre"), y=logistic_data_train_sub_y["track_genre"])
selector.get_feature_names_out()

NameError: name 'X_train' is not defined