In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from catboost import CatBoostClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

import warnings
# Blanket-suppress every warning category for the rest of the kernel session.
# NOTE(review): this hides ALL warnings (deprecations, pandas/sklearn notices),
# including ones that could point at real problems in later cells.
warnings.filterwarnings("ignore")

In [2]:
# Input files: training rows with a pre-assigned `kfold` column, the test rows,
# and the submission template whose target column is filled in at the end.
TRAIN_FOLDS_CSV = "../input/song-popularity-folds/train_folds.csv"
TEST_CSV = "../input/song-popularity-prediction/test.csv"
SAMPLE_SUBMISSION_CSV = "../input/song-popularity-prediction/sample_submission.csv"

df = pd.read_csv(TRAIN_FOLDS_CSV)
df_test = pd.read_csv(TEST_CSV)
sample_submission = pd.read_csv(SAMPLE_SUBMISSION_CSV)

In [3]:
# Preview the first five training rows (features, target and fold label).
df.head(n=5)

Unnamed: 0,id,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence,song_popularity,kfold
0,9847,132499.0,0.577092,0.460796,0.45078,0.005078,0.0,0.166867,-13.361975,1,0.022589,93.467197,3,0.879988,0,0
1,17422,171966.0,0.117289,0.638875,0.60189,,7.0,0.134333,-7.038682,1,0.058911,85.637958,3,0.743561,1,0
2,23004,229096.0,0.066573,0.283908,0.948761,0.003192,5.0,0.141574,-4.944055,0,0.050572,183.835886,4,0.809488,1,0
3,32546,158198.0,0.004572,0.729196,0.85585,0.001077,1.0,0.167778,-4.74753,0,0.189456,175.64938,4,0.742967,1,0
4,21743,,0.011991,0.352736,0.497838,0.105577,9.0,0.109678,,0,0.026269,87.55265,3,0.096049,1,0


In [4]:
# Every column except the row id, the target and the fold label is a model input.
non_feature_cols = ("id", "song_popularity", "kfold")
useful_features = [col for col in df.columns if col not in non_feature_cols]

# Discrete-valued inputs: imputed with the most frequent value and ordinal-encoded
# by the fold loop.
object_cols = ['key', 'audio_mode', 'time_signature']

# Continuous inputs: imputed with the median and robust-scaled by the fold loop.
numerical_cols = [
    'song_duration_ms',
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'audio_valence',
]

# Restrict the test frame to the model inputs, in the same column order as training.
df_test = df_test[useful_features]

In [5]:
# Five-fold cross-validation. For each fold: fit the preprocessing and a CatBoost
# classifier on the other four folds, report hold-out metrics on the fold itself,
# and keep that model's hard-label predictions for the test rows.
#
# NOTE: after the loop, `model`, `X_valid`, `y_valid`, `preds_valid` and `X_test`
# still refer to the LAST fold (fold 4); later cells read them.
final_test_predictions = []  # one array of predicted test labels per fold
for fold in range(5):
    train_rows = df[df.kfold != fold].reset_index(drop=True)
    valid_rows = df[df.kfold == fold].reset_index(drop=True)

    y_train = train_rows["song_popularity"]
    y_valid = valid_rows["song_popularity"]

    # Explicit copies: the frames are modified column-by-column below, and writing
    # into a bare column selection triggers pandas' SettingWithCopyWarning.
    X_train = train_rows[useful_features].copy()
    X_valid = valid_rows[useful_features].copy()
    X_test = df_test.copy()

    # All transformers are fitted on the training folds only and then applied
    # unchanged to the validation fold and the test rows.

    # Missing continuous values -> training median.
    imputer_num = SimpleImputer(strategy="median")
    X_train[numerical_cols] = imputer_num.fit_transform(X_train[numerical_cols])
    X_valid[numerical_cols] = imputer_num.transform(X_valid[numerical_cols])
    X_test[numerical_cols] = imputer_num.transform(X_test[numerical_cols])

    # Missing discrete values -> most frequent training value.
    imputer_obj = SimpleImputer(strategy="most_frequent")
    X_train[object_cols] = imputer_obj.fit_transform(X_train[object_cols])
    X_valid[object_cols] = imputer_obj.transform(X_valid[object_cols])
    X_test[object_cols] = imputer_obj.transform(X_test[object_cols])

    scaler = preprocessing.RobustScaler()
    X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
    X_valid[numerical_cols] = scaler.transform(X_valid[numerical_cols])
    X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

    # With default settings the encoder raises if the validation/test rows contain
    # a category value that never occurred in the training folds.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
    X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])
    X_test[object_cols] = ordinal_encoder.transform(X_test[object_cols])

    # Balance the classes by random oversampling -- training rows only, so the
    # validation fold keeps its original class distribution.
    sample = RandomOverSampler(random_state=fold)
    X_sample, y_sample = sample.fit_resample(X_train, y_train)

    # task_type="GPU" needs a GPU-enabled kernel. No eval_set is passed to fit(),
    # so the AUC in the training log is computed on the training data ("learn").
    model = CatBoostClassifier(
        random_state=fold,
        task_type="GPU",
        eval_metric="AUC",
        verbose=1000,
    )

    model.fit(X_sample, y_sample)
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)

    final_test_predictions.append(test_preds)

    print("accuracy is:",accuracy_score(y_valid, preds_valid))
    print(classification_report(y_valid, preds_valid))

Learning rate set to 0.027504
0:	learn: 0.5616230	total: 57.2ms	remaining: 57.1s
999:	learn: 0.7277157	total: 47.7s	remaining: 0us
accuracy is: 0.553375
              precision    recall  f1-score   support

           0       0.69      0.56      0.62      5195
           1       0.40      0.54      0.46      2805

    accuracy                           0.55      8000
   macro avg       0.55      0.55      0.54      8000
weighted avg       0.59      0.55      0.56      8000

Learning rate set to 0.027493
0:	learn: 0.5607895	total: 53.9ms	remaining: 53.8s
999:	learn: 0.7267221	total: 48.4s	remaining: 0us
accuracy is: 0.54825
              precision    recall  f1-score   support

           0       0.67      0.57      0.61      5058
           1       0.41      0.52      0.46      2942

    accuracy                           0.55      8000
   macro avg       0.54      0.54      0.53      8000
weighted avg       0.57      0.55      0.56      8000

Learning rate set to 0.02749
0:	learn: 0.

In [6]:
# Compare the model's default class predictions with a custom probability cut-off.
# Both reports are computed on the VALIDATION rows of the last fold: `model`,
# `X_valid`, `y_valid` and `preds_valid` are whatever the final iteration of the
# fold loop left behind. No test-set rows are involved (they have no labels here).
threshold = 0.52  # predict class 1 when its predicted probability is >= threshold
valid_proba = model.predict_proba(X_valid)[:, 1]  # column 1 = probability of class 1
predictions = (valid_proba >= threshold).astype(int)

print('Evaluation on the last validation fold (default predictions):')
print(classification_report(y_valid, preds_valid))
print("----------------------------------------------------------------------")
print('Evaluation on the last validation fold (new_threshold):')
print(classification_report(y_valid, predictions))

Valuation for test data only:
              precision    recall  f1-score   support

           0       0.68      0.57      0.62      5078
           1       0.41      0.53      0.47      2922

    accuracy                           0.55      8000
   macro avg       0.55      0.55      0.54      8000
weighted avg       0.58      0.55      0.56      8000

----------------------------------------------------------------------
Valuation for test data only  (new_threshold):
              precision    recall  f1-score   support

           0       0.67      0.66      0.66      5078
           1       0.42      0.42      0.42      2922

    accuracy                           0.57      8000
   macro avg       0.54      0.54      0.54      8000
weighted avg       0.58      0.57      0.57      8000



In [7]:
# Build the submission labels from ALL fold models instead of only the model left
# over from the last fold. `final_test_predictions` holds one array of hard 0/1
# test labels per fold, so the row-wise mean is the share of fold models that
# voted for class 1.
fold_votes = np.column_stack(final_test_predictions)  # one column per fold
vote_share = fold_votes.mean(axis=1)

# `threshold` is applied to the vote share; with five folds and threshold = 0.52
# this is a simple majority vote (at least 3 of 5 models predict class 1).
predictions_test = (vote_share >= threshold).astype(int)

# Kept as a single-column 2-D array, the same shape this cell produced before.
f_list = [predictions_test]
preds = np.column_stack(f_list)

In [8]:
# Overwrite the template's target column with the final test labels and write
# the submission file next to the notebook (no index column).
submission_path = "submission.csv"
sample_submission["song_popularity"] = preds
sample_submission.to_csv(submission_path, index=False)