In [1]:
import pandas as pd

# Locations of the two input CSV files, relative to the working directory.
tf_mini_path, log_mini_path = 'tf_mini.csv', 'log_mini.csv'

# Read both tables in the same order as before: tf_mini first, then log_mini.
tf_mini, log_mini = (
    pd.read_csv(csv_path) for csv_path in (tf_mini_path, log_mini_path)
)

# Keep a default-sized (five-row) preview of each table.
tf_mini_head, log_mini_head = tf_mini.head(), log_mini.head()

# Last expression of the cell: display both previews together as a tuple.
(tf_mini_head, log_mini_head)

(                                 track_id    duration  release_year  \
 0  t_a540e552-16d4-42f8-a185-232bd650ea7d  109.706673          1950   
 1  t_67965da0-132b-4b1e-8a69-0ef99b32287c  187.693329          1950   
 2  t_0614ecd3-a7d5-40a1-816e-156d5872a467  160.839996          1951   
 3  t_070a63a0-744a-434e-9913-a97b02926a29  175.399994          1951   
 4  t_d6990e17-9c31-4b01-8559-47d9ce476df1  369.600006          1951   
 
    us_popularity_estimate  acousticness  beat_strength  bounciness  \
 0               99.975414      0.458040       0.519497    0.504949   
 1               99.969430      0.916272       0.419223    0.545530   
 2               99.602549      0.812884       0.425890    0.508280   
 3               99.665018      0.396854       0.400934    0.359990   
 4               99.991764      0.728831       0.371328    0.335115   
 
    danceability  dyn_range_mean    energy  ...  time_signature   valence  \
 0      0.399767        7.511880  0.817709  ...              

In [2]:
# Join every log row to the matching row of tf_mini: the log's `track_id_clean`
# is matched against tf_mini's `track_id`. No `how` is given, so pandas uses its
# default inner join and log rows without a matching track are not kept.
merged_data = log_mini.merge(
    tf_mini,
    left_on='track_id_clean',
    right_on='track_id',
)

# Five-row preview of the joined frame.
merged_data_head = merged_data.head()

# Number of missing entries in each column of the joined frame.
missing_values = merged_data.isna().sum()

# Last expression of the cell: display the preview and the per-column counts.
(merged_data_head, missing_values)

(                               session_id  session_position  session_length  \
 0  0_00006f66-33e5-4de7-a324-2d18e439fc1e                 1              20   
 1  0_00079a23-1600-486a-91bd-5208be0c745a                 7              12   
 2  0_012b0fb4-0cc3-429f-9a78-cc6e622153fb                 6              20   
 3  0_013cc010-c476-4ad2-8972-73449e0b2ef4                 9              13   
 4  0_01a5f0dc-9938-48c9-92f1-c7e51f34d290                 7              12   
 
                            track_id_clean  skip_1  skip_2  skip_3  \
 0  t_0479f24c-27d2-46d6-a00c-7ec928f2b539   False   False   False   
 1  t_0479f24c-27d2-46d6-a00c-7ec928f2b539   False   False    True   
 2  t_0479f24c-27d2-46d6-a00c-7ec928f2b539   False   False   False   
 3  t_0479f24c-27d2-46d6-a00c-7ec928f2b539   False   False   False   
 4  t_0479f24c-27d2-46d6-a00c-7ec928f2b539   False   False    True   
 
    not_skipped  context_switch  no_pause_before_play  ...  time_signature  \
 0         True   

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler

# Random forest skip prediction: fit one classifier per skip target.
#
# The same recipe runs for every target: one-hot encode the feature frame, hold
# out 20% of the rows, standardise using statistics from the training rows only,
# fit the forest, then print accuracy, the per-class report and the ten largest
# feature importances.
#
# NOTE(review): the `hist_user_behavior_reason_end_*` dummies rank highest in the
# printed importances. That column looks like it records how playback of the
# *current* track ended, which would leak the skip outcome into the features -
# confirm before reading these scores as predictive performance.
# NOTE(review): rows are split at random, so rows sharing a `session_id` can fall
# on both sides of the split - confirm whether a grouped split is needed.

SKIP_TARGETS = ['skip_1', 'skip_2', 'skip_3']

# Outcome, identifier and date columns that are excluded from the predictors.
NON_FEATURE_COLUMNS = SKIP_TARGETS + ['not_skipped', 'track_id_clean', 'session_id', 'track_id', 'date']

# The encoded feature frame does not depend on the target, so build it once.
features = pd.get_dummies(merged_data.drop(columns=NON_FEATURE_COLUMNS))
feature_names = features.columns

# Keep every fitted model and importance table, keyed by target name, instead of
# letting each run overwrite the previous one.
random_forest_models = {}
random_forest_importances = {}

for target_name in SKIP_TARGETS:
    target = merged_data[target_name]

    # Same test_size and random_state for every target.
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

    # Fit the scaler on the training rows only, then reuse it for the test rows.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)
    random_forest_model.fit(X_train, y_train)

    y_pred = random_forest_model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    classification_report_result = classification_report(y_test, y_pred)

    # Label each section so the three sets of metrics can be told apart.
    print("=== Random forest | target:", target_name, "===")
    print("Accuracy:", accuracy)
    print("Classification Report:\n", classification_report_result)

    importances = random_forest_model.feature_importances_
    feature_importances = pd.DataFrame(
        importances, index=feature_names, columns=["importance"]
    ).sort_values("importance", ascending=False)

    print(feature_importances.head(10))

    random_forest_models[target_name] = random_forest_model
    random_forest_importances[target_name] = feature_importances

0.8779783178460805
              precision    recall  f1-score   support

       False       0.91      0.88      0.89     19424
        True       0.84      0.88      0.86     14152

    accuracy                           0.88     33576
   macro avg       0.87      0.88      0.88     33576
weighted avg       0.88      0.88      0.88     33576

                                           importance
hist_user_behavior_reason_end_trackdone      0.186029
hist_user_behavior_reason_start_trackdone    0.114909
hist_user_behavior_reason_end_fwdbtn         0.105944
hist_user_behavior_reason_start_fwdbtn       0.079070
hist_user_behavior_reason_end_backbtn        0.040902
session_position                             0.029611
hour_of_day                                  0.028854
hist_user_behavior_reason_end_endplay        0.017806
hist_user_behavior_reason_start_backbtn      0.015967
session_length                               0.013750
0.8732725756492733
              precision    recall  f1-sco

In [4]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting skip prediction: fit one classifier per skip target.
#
# The same recipe runs for every target: one-hot encode the feature frame, hold
# out 20% of the rows, standardise using statistics from the training rows only,
# fit the booster, then print accuracy, the per-class report and the ten largest
# feature importances.
#
# NOTE(review): the `hist_user_behavior_reason_end_*` dummies rank highest in the
# printed importances. That column looks like it records how playback of the
# *current* track ended, which would leak the skip outcome into the features -
# confirm before reading these scores as predictive performance.
# NOTE(review): rows are split at random, so rows sharing a `session_id` can fall
# on both sides of the split - confirm whether a grouped split is needed.

SKIP_TARGETS = ['skip_1', 'skip_2', 'skip_3']

# Outcome, identifier and date columns that are excluded from the predictors.
NON_FEATURE_COLUMNS = SKIP_TARGETS + ['not_skipped', 'track_id_clean', 'session_id', 'track_id', 'date']

# The encoded feature frame does not depend on the target, so build it once.
features = pd.get_dummies(merged_data.drop(columns=NON_FEATURE_COLUMNS))
feature_names = features.columns

# Keep every fitted model and importance table, keyed by target name, instead of
# letting each run overwrite the previous one.
gradient_boosting_models = {}
gradient_boosting_importances = {}

for target_name in SKIP_TARGETS:
    target = merged_data[target_name]

    # Same test_size and random_state for every target.
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

    # Fit the scaler on the training rows only, then reuse it for the test rows.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    gradient_boosting_model = GradientBoostingClassifier(
        n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42
    )
    gradient_boosting_model.fit(X_train, y_train)

    y_pred_gb = gradient_boosting_model.predict(X_test)

    accuracy_gb = accuracy_score(y_test, y_pred_gb)
    classification_report_gb = classification_report(y_test, y_pred_gb)

    # Label each section so the three sets of metrics can be told apart.
    print("=== Gradient boosting | target:", target_name, "===")
    print("Accuracy:", accuracy_gb)
    print("Classification Report:\n", classification_report_gb)

    importances = gradient_boosting_model.feature_importances_
    feature_importances = pd.DataFrame(
        importances, index=feature_names, columns=["importance"]
    ).sort_values("importance", ascending=False)

    print(feature_importances.head(10))

    gradient_boosting_models[target_name] = gradient_boosting_model
    gradient_boosting_importances[target_name] = feature_importances

Accuracy: 0.8812246842983084
Classification Report:
               precision    recall  f1-score   support

       False       0.92      0.87      0.89     19424
        True       0.84      0.89      0.86     14152

    accuracy                           0.88     33576
   macro avg       0.88      0.88      0.88     33576
weighted avg       0.88      0.88      0.88     33576

                                           importance
hist_user_behavior_reason_end_trackdone      0.595709
hist_user_behavior_reason_start_trackdone    0.214098
hist_user_behavior_reason_end_endplay        0.073276
hist_user_behavior_n_seekback                0.022389
hist_user_behavior_reason_end_backbtn        0.021545
hist_user_behavior_reason_start_fwdbtn       0.019423
no_pause_before_play                         0.014522
hist_user_behavior_reason_start_clickrow     0.013683
hist_user_behavior_n_seekfwd                 0.009766
hist_user_behavior_reason_end_fwdbtn         0.006536
Accuracy: 0.87532761496306

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Logistic regression skip prediction: fit one classifier per skip target.
#
# The same recipe runs for every target: one-hot encode the feature frame, hold
# out 20% of the rows, standardise using statistics from the training rows only,
# fit the model, then print the AUC-ROC on the held-out rows and the coefficient
# table ordered by absolute coefficient size.
#
# NOTE(review): `hist_user_behavior_reason_end_trackdone` has the largest
# coefficient in the printed table. That column looks like it records how
# playback of the *current* track ended, which would leak the skip outcome into
# the features - confirm before reading these scores as predictive performance.
# NOTE(review): rows are split at random, so rows sharing a `session_id` can fall
# on both sides of the split - confirm whether a grouped split is needed.

SKIP_TARGETS = ['skip_1', 'skip_2', 'skip_3']

# Outcome, identifier and date columns that are excluded from the predictors.
NON_FEATURE_COLUMNS = SKIP_TARGETS + ['not_skipped', 'track_id_clean', 'session_id', 'track_id', 'date']

# The encoded feature frame does not depend on the target, so build it once.
features = pd.get_dummies(merged_data.drop(columns=NON_FEATURE_COLUMNS))
feature_names = features.columns

# Keep every fitted model and coefficient table, keyed by target name, instead of
# letting each run overwrite the previous one.
logistic_regression_models = {}
logistic_regression_coefficients = {}

for target_name in SKIP_TARGETS:
    target = merged_data[target_name]

    # Same test_size and random_state for every target.
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

    # Fit the scaler on the training rows only, then reuse it for the test rows.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    logistic_regression_model = LogisticRegression(max_iter=1000, random_state=42)
    logistic_regression_model.fit(X_train, y_train)

    # Second column of predict_proba: the score used for the ROC curve.
    y_pred_proba_lr = logistic_regression_model.predict_proba(X_test)[:, 1]

    auc_roc_lr = roc_auc_score(y_test, y_pred_proba_lr)

    # Label each section so the three sets of results can be told apart.
    print("=== Logistic regression | target:", target_name, "===")
    print("AUC-ROC Score:", auc_roc_lr)

    # coef_ has one row for a binary problem; take it as a flat vector.
    coefficients = logistic_regression_model.coef_[0]

    feature_importances_lr = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

    # Order rows by |coefficient|, largest first, keeping the signed values.
    rows_by_magnitude = feature_importances_lr['Coefficient'].abs().sort_values(ascending=False).index
    feature_importances_lr = feature_importances_lr.reindex(rows_by_magnitude)

    print(feature_importances_lr)

    logistic_regression_models[target_name] = logistic_regression_model
    logistic_regression_coefficients[target_name] = feature_importances_lr

AUC-ROC Score: 0.9388417224429889
                                      Feature  Coefficient
60    hist_user_behavior_reason_end_trackdone    -2.091337
7               hist_user_behavior_n_seekback    -1.444635
54      hist_user_behavior_reason_end_backbtn     1.249476
57       hist_user_behavior_reason_end_fwdbtn     1.158101
52  hist_user_behavior_reason_start_trackdone    -0.862700
..                                        ...          ...
62                                 mode_minor     0.003636
28                                      tempo    -0.003218
27                                speechiness    -0.002671
43                         context_type_radio     0.000612
26                                   organism    -0.000145

[63 rows x 2 columns]
AUC-ROC Score: 0.9445686147632921
                                       Feature  Coefficient
60     hist_user_behavior_reason_end_trackdone    -2.081464
57        hist_user_behavior_reason_end_fwdbtn     1.120314
54       hist_user_be