# Importo librerie

In [1]:
!pip install smogn

Collecting smogn
  Downloading https://files.pythonhosted.org/packages/fe/a6/a3f78f5e2e18fa302fd23a32019908657113faa917463a9d49cbe9d20625/smogn-0.1.2-py3-none-any.whl
Installing collected packages: smogn
Successfully installed smogn-0.1.2


In [2]:
import pandas as pd
import numpy as np
import itertools
import concurrent.futures

import matplotlib.pyplot as plt
import seaborn as sns

from time import time

from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix, mean_squared_error, r2_score, precision_recall_fscore_support
from sklearn.svm import LinearSVC, SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.utils import shuffle
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

import smogn



In [3]:
import warnings
from sklearn.exceptions import FitFailedWarning, ConvergenceWarning
# Silence noisy warning categories so the per-model training log stays readable.
# NOTE(review): ignoring FitFailedWarning / ConvergenceWarning can hide models that
# did not fit or converge properly - re-enable them when debugging results.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=FitFailedWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Definizione funzioni

In [4]:
# Confusion-matrix heat map

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : array-like with ``.sum``, ``.astype``, ``.max`` and ``.shape``
        Confusion matrix; rows are indexed as true labels, columns as predictions.
    classes : sequence
        Tick labels, used for both axes.
    normalize : bool
        When True each row is divided by its row sum and cells are printed
        with two decimals; otherwise cells are printed as integers.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap passed to ``plt.imshow``.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    if normalize:
        cell_format = '.2f'
    else:
        cell_format = 'd'

    # Cells darker than half of the maximum get white text for contrast.
    half_max = cm.max() / 2.
    for row in range(cm.shape[0]):
        for column in range(cm.shape[1]):
            value = cm[row, column]
            text_color = "white" if value > half_max else "black"
            plt.text(column, row, format(value, cell_format),
                     horizontalalignment="center",
                     color=text_color)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [5]:
# Print the elapsed time

def print_exec_time(start):
  """Print the number of seconds elapsed since ``start``.

  ``start`` must be a timestamp obtained from ``time()`` (the function this
  notebook imports with ``from time import time``). The previous
  implementation called ``time.perf_counter()``, which raises AttributeError
  because ``time`` is bound to the function, not to the module.
  """
  print("\nEsecuzione completata in %.2f secondi" % (time() - start))

In [6]:
# Build a sub-DataFrame with balanced target classes; when `col` is given, the
# values of that column are also equally represented within each target class.

def undersample(df, label, col=None):
  """Randomly undersample the negative class so it matches the positive class.

  Parameters
  ----------
  df : pandas.DataFrame
      Input data.
  label : str
      Name of the binary target column (1 = positive, 0 = negative).
  col : str, optional
      Stratification column. When given, for every value of ``col`` found
      among the negatives, as many negatives are drawn as there are positives
      with that same value. When omitted, negatives are drawn globally.

  Returns
  -------
  pandas.DataFrame
      All positives followed by the sampled negatives.

  Raises
  ------
  ValueError
      Raised by ``DataFrame.sample`` when more negatives are requested than
      are available (sampling is without replacement).
  """
  hits = df[df[label] == 1]
  non_hits = df[df[label] == 0]

  if col is None:
    non_hits_sampled = non_hits.sample(hits.shape[0])
  else:
    sampled_groups = []
    for value in non_hits[col].unique():
      # Count rows directly instead of relying on a hard-coded `id` column.
      n_hits = len(hits[hits[col] == value])
      sampled_groups.append(non_hits[non_hits[col] == value].sample(n_hits))

    # pd.concat raises on an empty list; with no negatives there is nothing to add.
    non_hits_sampled = pd.concat(sampled_groups) if sampled_groups else non_hits

  return pd.concat([hits, non_hits_sampled])

In [7]:
def over_under_balancing(X, Y, oversample_ratio, oversample_algorithm):
  '''
  Balance a dataset by oversampling first and randomly undersampling afterwards.

  oversample_algorithm --> one of [SMOTE, BorderlineSMOTE, SVMSMOTE, ADASYN]
                           (the class itself, it is instantiated here)
  oversample_ratio     --> passed as `sampling_strategy` to the oversampler,
                           e.g. 0.5 grows the minority class (hit) up to 50%
                           of the majority class (non-hit)

  Returns the resampled (X, Y) pair.
  '''
  resampler = Pipeline(steps=[
      # step 1: synthesize new minority-class examples
      ('o', oversample_algorithm(sampling_strategy=oversample_ratio)),
      # step 2: randomly drop majority-class examples until both classes have the same size
      ('u', RandomUnderSampler(sampling_strategy=1)),
  ])

  X_balanced, Y_balanced = resampler.fit_resample(X, Y)

  return X_balanced, Y_balanced

In [8]:
# Value for the `new_features_params` field of a results row
def insert_new_features_params(row, new_features_params):
    """Return ``new_features_params`` unless the row used only the standard features.

    ``row`` is any object exposing a ``features`` attribute; rows whose
    ``features`` equals 'standard features' get ``None``.
    """
    uses_only_standard = (row.features == 'standard features')
    return None if uses_only_standard else new_features_params

In [9]:
# Choose how many LDA components to keep

def select_n_components(var_ratio, goal_var: float) -> int:
    """Return the smallest number of leading components reaching ``goal_var``.

    The explained-variance ratios in ``var_ratio`` are accumulated in order;
    the count of components consumed when the running total first reaches
    ``goal_var`` is returned. If the goal is never reached, every component is
    counted; an empty ``var_ratio`` yields 0.
    """
    components_used = 0
    running_total = 0.0

    for components_used, ratio in enumerate(var_ratio, start=1):
        running_total += ratio
        if running_total >= goal_var:
            break

    return components_used

In [10]:
def best_ml_model(ml_model, params, df, features, oversample_algorithm, test_size):
    """Train one model configuration and return its metrics as a one-row DataFrame.

    Parameters
    ----------
    ml_model : class
        Estimator class (not an instance). Must belong to one of the three
        groups defined below, otherwise ValueError is raised.
    params : dict
        Keyword arguments used to instantiate ``ml_model``.
    df : pandas.DataFrame
        Data containing the ``features`` columns plus the targets 'hit' and
        'weeks_enc' (and 'year_YYYY' for the regression group).
    features : list of str
        Feature columns. Must include 'month' (dropped for groups A and C)
        and 'sin(month)' / 'cos(month)' (dropped for group B).
    oversample_algorithm : class
        imblearn oversampler class used to balance the classification train set.
    test_size : float
        Fraction of the data held out by ``train_test_split``.

    Returns
    -------
    pandas.DataFrame
        A single row with algorithm name, parameters, balancing strategy,
        problem type, train/test metrics and execution time. Metrics that do
        not apply to the problem type are None.

    Raises
    ------
    ValueError
        If ``ml_model`` is not one of the supported estimator classes.

    Notes
    -----
    Classifiers must expose ``predict_proba`` (needed for the log loss).
    NOTE(review): LinearSVC is listed in group A but has no ``predict_proba``;
    SVC needs ``probability=True`` in ``params``.
    No random seed is set, so splits and resampling differ between runs.
    """
    print('training ' + str(ml_model.__name__) + '...')
    time_0 = time()

    # The supported estimators fall into three groups:
    # A) 'month' encoded as (cos, sin); target 'hit'; oversampling + random undersampling
    A = [LogisticRegression, LinearSVC, SVC, KNeighborsClassifier]
    # B) 'month' kept as a single numeric column; target 'hit'; oversampling + random undersampling
    B = [DecisionTreeClassifier, RandomForestClassifier]
    # C) 'month' encoded as (cos, sin); target 'weeks_enc'; year-weighted undersampling
    C = [LinearRegression, ElasticNet]

    # Build the numpy arrays for the selected group.
    if ml_model in A:
        X = df[features].drop(['month'], axis=1).values
        Y = df['hit'].values

        problem = 'classification'

    elif ml_model in B:
        X = df[features].drop(['sin(month)', 'cos(month)'], axis=1).values
        Y = df['hit'].values

        problem = 'classification'

    elif ml_model in C:
        # Balance hits / non-hits per release year before building the arrays.
        df = undersample(df, 'hit', 'year_YYYY')

        X = df[features].drop(['month'], axis=1).values
        Y = df['weeks_enc'].values

        problem = 'regression'

    else:
        # Previously this fell through and failed later with an UnboundLocalError.
        raise ValueError('unsupported ml_model: ' + str(ml_model.__name__))

    is_classification = (problem == 'classification')

    # Train / test split.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size)

    if is_classification:

        class_balancing = oversample_algorithm.__name__ + ' + random undersampling'

        # Balance the train set (oversample the minority class, then undersample the majority class).
        X_train, Y_train = over_under_balancing(X_train, Y_train, oversample_ratio=0.6, oversample_algorithm=oversample_algorithm)

        # Balance the test set by random undersampling only (no synthetic test examples).
        X_test, Y_test = RandomUnderSampler(sampling_strategy=1).fit_resample(X_test, Y_test)

        # LDA: a first fit with all components is used only to read the explained
        # variance ratios, then LDA is refit keeping enough components for 95%.
        lda = LDA(n_components=None)
        lda.fit(X_train, Y_train)

        lda_var_ratios = lda.explained_variance_ratio_

        n_components = select_n_components(lda_var_ratios, 0.95)

        lda = LDA(n_components=n_components)

        X_train = lda.fit_transform(X_train, Y_train)
        X_test = lda.transform(X_test)

    else:
        # Regression: the only balancing is the undersample() call above.
        # A SMOGN oversampling step on the train set was tried here and is disabled.
        class_balancing = 'year-weighted undersample'

    # Instantiate the model with the given fixed hyper-parameters and fit it.
    model = ml_model(**params)
    model.fit(X_train, Y_train)

    # METRICS

    if is_classification:
        # test set
        Y_pred = model.predict(X_test)
        Y_pred_proba = model.predict_proba(X_test)

        accuracy = accuracy_score(Y_test, Y_pred)
        loss = log_loss(Y_test, Y_pred_proba)

        conf_matrix = confusion_matrix(Y_test, Y_pred)

        precision, recall, fscore, support = precision_recall_fscore_support(Y_test, Y_pred)

        # train set
        Y_pred_train = model.predict(X_train)
        Y_pred_proba_train = model.predict_proba(X_train)

        accuracy_train = accuracy_score(Y_train, Y_pred_train)
        loss_train = log_loss(Y_train, Y_pred_proba_train)

        conf_matrix_train = confusion_matrix(Y_train, Y_pred_train)

        precision_train, recall_train, fscore_train, support_train = precision_recall_fscore_support(Y_train, Y_pred_train)

        # Regression metrics do not apply.
        MSE = None
        r2 = None
        MSE_train = None
        r2_train = None

    else:
        # test set
        Y_pred = model.predict(X_test)

        MSE = mean_squared_error(Y_test, Y_pred)
        r2 = r2_score(Y_test, Y_pred)

        # train set
        Y_pred_train = model.predict(X_train)

        MSE_train = mean_squared_error(Y_train, Y_pred_train)
        r2_train = r2_score(Y_train, Y_pred_train)

        # Classification metrics do not apply; per-class metrics keep the
        # two-slot shape so the indexing below works for both problem types.
        accuracy = None
        loss = None
        precision = [None, None]
        recall = [None, None]
        fscore = [None, None]
        conf_matrix = None
        accuracy_train = None
        loss_train = None
        precision_train = [None, None]
        recall_train = [None, None]
        fscore_train = [None, None]
        conf_matrix_train = None

    tot_time = round((time() - time_0), 2)

    result_df = pd.DataFrame(
              [[ml_model.__name__,
              params,
              class_balancing,
              problem,
              accuracy,
              loss,
              conf_matrix,
              precision[0],
              precision[1],
              recall[0],
              recall[1],
              fscore[0],
              fscore[1],
              accuracy_train,
              loss_train,
              conf_matrix_train,
              precision_train[0],
              precision_train[1],
              recall_train[0],
              recall_train[1],
              fscore_train[0],
              fscore_train[1],
              MSE,
              r2,
              MSE_train,
              r2_train,
              tot_time]],
        columns=['algorithm',
                 'parameters',
                 'class_balancing',
                 'problem',
                 'test_accuracy',
                 'test_log_loss',
                 'test_confusion_matrix',
                 'test_precision_0',
                 'test_precision_1',
                 'test_recall_0',
                 'test_recall_1',
                 'test_fscore_0',
                 'test_fscore_1',
                 'train_accuracy',
                 'train_log_loss',
                 'train_confusion_matrix',
                 'train_precision_0',
                 'train_precision_1',
                 'train_recall_0',
                 'train_recall_1',
                 'train_fscore_0',
                 'train_fscore_1',
                 'test_MSE',
                 'test_r2',
                 'train_MSE',
                 'train_r2',
                 'exec_time'])

    print('...training completed for ' + str(ml_model.__name__) + ' in ' + str(tot_time) + ' seconds')

    return result_df

In [11]:
def exec_per_year_range(year_range, df, category, oversample_algorithm, new_features_params):
  """Train every configured model on the songs released within ``year_range``.

  Parameters
  ----------
  year_range : sequence
      ``(first_year, last_year)``, both inclusive, matched against ``df.year_YYYY``.
  df : pandas.DataFrame
      Full dataset.
  category : str
      Value written to the 'category' column of the result.
  oversample_algorithm : class
      imblearn oversampler class forwarded to ``best_ml_model``.
  new_features_params
      Value written to the 'new_features_params' column of the result.

  Returns
  -------
  pandas.DataFrame
      One row per model, as produced by ``best_ml_model``, prefixed with the
      columns 'category', 'new_features_params', 'features' and 'year_range'.

  Notes
  -----
  Reads the module-level globals ``features_select`` and ``current_features``;
  both must be defined before this function is called.
  """
  year_start = year_range[0]
  year_end = year_range[1]

  # Use a larger test fraction when the range covers few years.
  test_size = 0.2 if (year_end - year_start) > 10 else 0.3

  # Select the rows in the year range with a single combined mask. Chaining
  # two masks made pandas reindex the second one and emit a UserWarning.
  in_range = (df.year_YYYY >= year_start) & (df.year_YYYY <= year_end)
  sub_df = df[in_range]

  # # # ML MODELS # # #
  models = [{'ml_model': LogisticRegression, 'params': {'penalty':'l2', 'solver':'lbfgs', 'C':1, 'max_iter':1000}},
            {'ml_model': SVC, 'params': {'kernel':'linear', 'C':0.1, 'probability':True}},
            {'ml_model': SVC, 'params': {'kernel':'rbf', 'C':10, 'gamma':0.1, 'probability':True}},
            {'ml_model': KNeighborsClassifier, 'params': {'n_neighbors':30, 'metric':'manhattan'}},
            {'ml_model': DecisionTreeClassifier, 'params': {'criterion':'gini', 'max_depth':5}},
            {'ml_model': RandomForestClassifier, 'params': {'n_estimators':25, 'max_depth':8}},
            {'ml_model': ElasticNet, 'params': {'alpha':0.01, 'l1_ratio':0.1}}]

  results = [best_ml_model(model['ml_model'], model['params'], sub_df, features_select[current_features], oversample_algorithm, test_size)
             for model in models]

  df_part = pd.concat(results)

  # Prepend the run-level fields. Explicit lists are used because year_range
  # is itself a sequence and must not be spread across the rows.
  n_rows = df_part.shape[0]
  df_part.insert(0, 'year_range', [year_range] * n_rows)
  df_part.insert(0, 'features', [current_features] * n_rows)
  df_part.insert(0, 'new_features_params', [new_features_params] * n_rows)
  # Honour the `category` argument instead of a hard-coded literal.
  df_part.insert(0, 'category', [category] * n_rows)

  return df_part

# Importo Dataset

In [12]:
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user, then build a PyDrive client (`drive`) that uses
# the application-default Google credentials obtained by that authentication.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [13]:
# Download the dataset CSV from Google Drive into the working directory, then
# load it, discarding the stale index column that was saved with the file.
drive.CreateFile({'id':'1-0o81KniM9hNtC8zqBaYyQWAmGdTYSS5'}).GetContentFile('dataset_final_4.0.csv')
df = pd.read_csv("dataset_final_4.0.csv").drop(columns='Unnamed: 0')

In [14]:
# Preview the first rows of the loaded dataset.
df.head()

Unnamed: 0,id,name,artists,release_date,year_YYYY,month_mm,month,year,cos(month),sin(month),valence,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence_new_0,acousticness_new_0,danceability_new_0,duration_ms_new_0,energy_new_0,instrumentalness_new_0,liveness_new_0,loudness_new_0,speechiness_new_0,tempo_new_0,valence_new_1,acousticness_new_1,danceability_new_1,duration_ms_new_1,energy_new_1,instrumentalness_new_1,liveness_new_1,loudness_new_1,speechiness_new_1,tempo_new_1,...,instrumentalness_new_3,liveness_new_3,loudness_new_3,speechiness_new_3,tempo_new_3,explicit,key_0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11,mode,explicit_new_0,key_new_0,mode_new_0,explicit_new_1,key_new_1,mode_new_1,explicit_new_2,key_new_2,mode_new_2,explicit_new_3,key_new_3,mode_new_3,season_1,season_2,season_3,season_4,hit,weeks,weeks_enc,past_pop_n_hit,past_pop_n_weeks
0,74Rr0afCO2T2v0Xmaq7Shz,Forgive Myself,['Sam Smith'],2020-11-24,2020,11,0.909091,0.983607,0.933013,0.25,0.207,0.917,0.758,0.03814,0.155,0.0,0.116,0.780434,0.0447,0.501119,0.261879,0.756754,0.015625,3e-05,0.475522,0.000701,0.007359,0.003661,0.01046,8.4e-05,0.230196,0.718261,0.023557,2.2e-05,0.459413,0.002985,0.006806,0.002818,0.008365,4e-06,...,0.007513,0.008539,0.002504,0.004098,7.7e-05,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0.0,0.057823,0.081376
1,0fGnjXBhmNfWGbL50VhkMd,Fix You - Live,['Sam Smith'],2020-11-24,2020,11,0.909091,0.983607,0.933013,0.25,0.302,0.805,0.345,0.045083,0.34,1.7e-05,0.125,0.794789,0.0342,0.559405,0.124093,0.523696,0.238106,0.000162,0.171905,0.0007,0.005583,0.001864,0.01297,0.011106,0.102616,0.491759,0.225623,0.000142,0.159867,0.002983,0.005089,0.001276,0.010614,0.013413,...,0.00751,0.006594,0.001054,0.005713,0.014593,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0.0,0.057823,0.081376
2,18cHFfnBjVpcQvy9eapiyU,ROCKSTAR,"['DaBaby', 'Roddy Ricch']",2020-11-23,2020,11,0.909091,0.983607,0.933013,0.25,0.497,0.247,0.746,0.030995,0.69,0.0,0.101,0.816453,0.164,0.369513,0.000622,2.5e-05,0.01149,4e-06,0.01115,0.000701,0.010864,0.000289,0.00088,0.071743,4.8e-05,0.000749,0.018285,8e-06,0.015888,0.002985,0.010221,9e-05,0.001595,0.065206,...,0.007513,0.01234,3.5e-05,0.004548,0.059579,1,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,37,10.5187,0.132653,0.071446
3,6zgPxJ3HlrD8Vqv7OVLG0Y,"Nacchite Ye Panaina (From ""Dohchay"")",['Arijit Singh'],2020-11-23,2020,11,0.909091,0.983607,0.933013,0.25,0.548,0.222,0.654,0.036355,0.713,0.0,0.312,0.822572,0.0533,0.554337,0.003682,0.001414,0.000862,1.3e-05,0.019538,0.000701,0.024102,9.3e-05,0.008605,0.009106,0.008566,0.003602,2.3e-05,8e-06,0.025795,0.002985,0.025777,4e-06,0.006722,0.011217,...,0.007513,0.023001,2e-06,0.002975,0.012336,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0.0,0.0,0.0
4,3y3lDeOb2c3FYCniav00EY,8 Figures (feat. Meek Mill),"['DaBaby', 'Meek Mill']",2020-11-20,2020,11,0.909091,0.983607,0.933013,0.25,0.702,0.0382,0.703,0.030478,0.555,0.0,0.246,0.812641,0.382,0.72772,0.101884,0.077065,0.001881,6e-06,0.009036,0.000701,0.005488,0.000467,0.073312,0.190277,0.123223,0.090012,0.00487,1.1e-05,0.005897,0.002985,0.006195,0.000199,0.078422,0.197857,...,0.007513,0.004827,0.000112,0.094652,0.197344,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0.0,0.20068,0.055704


In [15]:
# Column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123212 entries, 0 to 123211
Data columns (total 95 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   id                      123212 non-null  object 
 1   name                    123212 non-null  object 
 2   artists                 123212 non-null  object 
 3   release_date            123212 non-null  object 
 4   year_YYYY               123212 non-null  int64  
 5   month_mm                123212 non-null  int64  
 6   month                   123212 non-null  float64
 7   year                    123212 non-null  float64
 8   cos(month)              123212 non-null  float64
 9   sin(month)              123212 non-null  float64
 10  valence                 123212 non-null  float64
 11  acousticness            123212 non-null  float64
 12  danceability            123212 non-null  float64
 13  duration_ms             123212 non-null  float64
 14  energy              

In [16]:
# Class balance of the binary target: absolute counts and percentage of hits.
num_hit = int((df.hit == 1).sum())
num_non_hit = int((df.hit == 0).sum())

print('non_hit = %d \nhit = %d' % (num_non_hit, num_hit))
print('percentuale hit = %.2f%%' % ((num_hit / (num_hit + num_non_hit))*100))

non_hit = 99654 
hit = 23558
percentuale hit = 19.12%


In [17]:
# (rows, columns) of the loaded dataset.
df.shape

(123212, 95)

# ML

In [18]:
# Experiment configuration
category = 'machine learning'
oversample_algorithm = SMOTE

df_part_array = []

# Which variant of the "new" features to use: the numeric suffix of the
# `*_new_<x>` columns (each computed over a different neighbourhood of years),
# or 'all' to use every variant.
x = 3

# # # FEATURES SELECT # # #

std_features_list = ['valence','acousticness','danceability','duration_ms','energy','instrumentalness','liveness','loudness','speechiness','tempo','explicit','key_0','key_1','key_2','key_3','key_4','key_5','key_6','key_7','key_8','key_9','key_10','key_11','mode']

# Base names of the "new" features; the variant suffix is appended below so the
# list of names is written only once.
new_features = ['valence_new','acousticness_new','danceability_new','duration_ms_new','energy_new','instrumentalness_new','liveness_new','loudness_new','speechiness_new','tempo_new','explicit_new','key_new','mode_new']

if(x == 'all'):
  new_feature_suffixes = range(4)
else:
  new_feature_suffixes = [x]

new_features_list = [feat + '_' + str(suffix) for suffix in new_feature_suffixes for feat in new_features]

extra_features = ['month','year','cos(month)','sin(month)','season_1','season_2','season_3','season_4','past_pop_n_hit','past_pop_n_weeks']
targets_list = ['hit', 'weeks_enc']

features_select = {'standard features': std_features_list+extra_features, 'standard + new features': std_features_list+new_features_list+extra_features, 'new features': new_features_list+extra_features}

# # # YEAR RANGE SELECT # # #

# Whole period first, then one range per decade. Defined once, outside the loop.
year_range_select = [(1960,2020), (1960,1969), (1970,1979), (1980,1989), (1990,1999), (2000,2009), (2010,2020)]

# NOTE(review): exec_per_year_range reads `current_features` and `features_select`
# as module globals. The worker processes see the right values only if they
# inherit this module's state (fork start method) - confirm on other platforms.
for current_features in ['standard + new features', 'new features']: # features_select.keys()

    new_features_params = x if current_features != 'standard features' else None

    # One task per year range; results are gathered in completion order.
    with concurrent.futures.ProcessPoolExecutor() as executor:
        results = [executor.submit(exec_per_year_range, year_range, df, category, oversample_algorithm, new_features_params) for year_range in year_range_select]

        output = [future.result() for future in concurrent.futures.as_completed(results)]

    df_part_array.append(pd.concat(output))

df_tot = pd.concat(df_part_array)

training LogisticRegression...
training LogisticRegression...
...training completed for LogisticRegression in 1.01 seconds
training SVC...
...training completed for SVC in 29.34 seconds
training SVC...
...training completed for LogisticRegression in 37.5 seconds
training SVC...
...training completed for SVC in 79.58 seconds
training KNeighborsClassifier...
...training completed for KNeighborsClassifier in 1.89 seconds
training DecisionTreeClassifier...
...training completed for DecisionTreeClassifier in 0.87 seconds
training RandomForestClassifier...
...training completed for RandomForestClassifier in 1.44 seconds
training ElasticNet...
...training completed for ElasticNet in 0.11 seconds


  from ipykernel import kernelapp as app


training LogisticRegression...
...training completed for LogisticRegression in 1.06 seconds
training SVC...
...training completed for SVC in 27.78 seconds
training SVC...
...training completed for SVC in 80.96 seconds
training KNeighborsClassifier...
...training completed for KNeighborsClassifier in 1.94 seconds
training DecisionTreeClassifier...
...training completed for DecisionTreeClassifier in 1.02 seconds
training RandomForestClassifier...
...training completed for RandomForestClassifier in 1.56 seconds
training ElasticNet...
...training completed for ElasticNet in 0.12 seconds


  from ipykernel import kernelapp as app


training LogisticRegression...
...training completed for LogisticRegression in 1.19 seconds
training SVC...
...training completed for SVC in 26.18 seconds
training SVC...
...training completed for SVC in 69.97 seconds
training KNeighborsClassifier...
...training completed for KNeighborsClassifier in 2.05 seconds
training DecisionTreeClassifier...
...training completed for DecisionTreeClassifier in 1.16 seconds
training RandomForestClassifier...
...training completed for RandomForestClassifier in 1.6 seconds
training ElasticNet...
...training completed for ElasticNet in 0.11 seconds


  from ipykernel import kernelapp as app


training LogisticRegression...
...training completed for LogisticRegression in 1.36 seconds
training SVC...
...training completed for SVC in 27.32 seconds
training SVC...
...training completed for SVC in 82.13 seconds
training KNeighborsClassifier...
...training completed for KNeighborsClassifier in 2.23 seconds
training DecisionTreeClassifier...
...training completed for DecisionTreeClassifier in 1.24 seconds
training RandomForestClassifier...
...training completed for RandomForestClassifier in 1.85 seconds
training ElasticNet...
...training completed for ElasticNet in 0.12 seconds


  from ipykernel import kernelapp as app


training LogisticRegression...
...training completed for LogisticRegression in 1.66 seconds
training SVC...
...training completed for SVC in 24.17 seconds
training SVC...
...training completed for SVC in 70.41 seconds
training KNeighborsClassifier...
...training completed for KNeighborsClassifier in 2.53 seconds
training DecisionTreeClassifier...
...training completed for DecisionTreeClassifier in 1.63 seconds
training RandomForestClassifier...
...training completed for RandomForestClassifier in 2.12 seconds
training ElasticNet...
...training completed for ElasticNet in 0.15 seconds


  from ipykernel import kernelapp as app


training LogisticRegression...
...training completed for LogisticRegression in 2.18 seconds
training SVC...
...training completed for SVC in 30.26 seconds
training SVC...
...training completed for SVC in 87.87 seconds
training KNeighborsClassifier...
...training completed for KNeighborsClassifier in 3.45 seconds
training DecisionTreeClassifier...
...training completed for DecisionTreeClassifier in 2.21 seconds
training RandomForestClassifier...
...training completed for RandomForestClassifier in 2.9 seconds
training ElasticNet...
...training completed for ElasticNet in 0.17 seconds
...training completed for SVC in 1145.81 seconds
training SVC...
...training completed for SVC in 2882.4 seconds
training KNeighborsClassifier...
...training completed for KNeighborsClassifier in 24.4 seconds
training DecisionTreeClassifier...
...training completed for DecisionTreeClassifier in 19.7 seconds
training RandomForestClassifier...
...training completed for RandomForestClassifier in 22.51 seconds
t

  from ipykernel import kernelapp as app


training LogisticRegression...
...training completed for LogisticRegression in 0.67 seconds
training SVC...
...training completed for SVC in 32.48 seconds
training SVC...
...training completed for SVC in 93.32 seconds
training KNeighborsClassifier...
...training completed for KNeighborsClassifier in 1.5 seconds
training DecisionTreeClassifier...
...training completed for DecisionTreeClassifier in 0.54 seconds
training RandomForestClassifier...
...training completed for RandomForestClassifier in 1.13 seconds
training ElasticNet...
...training completed for ElasticNet in 0.13 seconds


  from ipykernel import kernelapp as app


training LogisticRegression...
...training completed for LogisticRegression in 0.52 seconds
training SVC...
...training completed for SVC in 28.6 seconds
training SVC...
...training completed for SVC in 81.0 seconds
training KNeighborsClassifier...
...training completed for KNeighborsClassifier in 1.42 seconds
training DecisionTreeClassifier...
...training completed for DecisionTreeClassifier in 0.46 seconds
training RandomForestClassifier...
...training completed for RandomForestClassifier in 1.05 seconds
training ElasticNet...
...training completed for ElasticNet in 0.1 seconds


  from ipykernel import kernelapp as app


training LogisticRegression...
...training completed for LogisticRegression in 0.54 seconds
training SVC...
...training completed for SVC in 30.41 seconds
training SVC...
...training completed for SVC in 89.14 seconds
training KNeighborsClassifier...
...training completed for KNeighborsClassifier in 1.6 seconds
training DecisionTreeClassifier...
...training completed for DecisionTreeClassifier in 0.58 seconds
training RandomForestClassifier...
...training completed for RandomForestClassifier in 1.13 seconds
training ElasticNet...
...training completed for ElasticNet in 0.11 seconds


  from ipykernel import kernelapp as app


training LogisticRegression...
...training completed for LogisticRegression in 0.7 seconds
training SVC...
...training completed for SVC in 26.96 seconds
training SVC...
...training completed for SVC in 74.29 seconds
training KNeighborsClassifier...
...training completed for KNeighborsClassifier in 1.58 seconds
training DecisionTreeClassifier...
...training completed for DecisionTreeClassifier in 0.66 seconds
training RandomForestClassifier...
...training completed for RandomForestClassifier in 1.21 seconds
training ElasticNet...
...training completed for ElasticNet in 0.15 seconds


  from ipykernel import kernelapp as app


training LogisticRegression...
...training completed for LogisticRegression in 0.81 seconds
training SVC...
...training completed for SVC in 32.31 seconds
training SVC...
...training completed for SVC in 95.63 seconds
training KNeighborsClassifier...
...training completed for KNeighborsClassifier in 1.9 seconds
training DecisionTreeClassifier...
...training completed for DecisionTreeClassifier in 0.92 seconds
training RandomForestClassifier...
...training completed for RandomForestClassifier in 1.52 seconds
training ElasticNet...
...training completed for ElasticNet in 0.16 seconds
...training completed for SVC in 1135.93 seconds
training SVC...
...training completed for SVC in 2841.48 seconds
training KNeighborsClassifier...
...training completed for KNeighborsClassifier in 11.0 seconds
training DecisionTreeClassifier...
...training completed for DecisionTreeClassifier in 5.52 seconds
training RandomForestClassifier...
...training completed for RandomForestClassifier in 8.87 seconds
t

In [19]:
# Show the 50 runs with the highest test accuracy.
df_tot.sort_values('test_accuracy', ascending=False).head(50)

Unnamed: 0,category,new_features_params,features,year_range,algorithm,parameters,class_balancing,problem,test_accuracy,test_log_loss,test_confusion_matrix,test_precision_0,test_precision_1,test_recall_0,test_recall_1,test_fscore_0,test_fscore_1,train_accuracy,train_log_loss,train_confusion_matrix,train_precision_0,train_precision_1,train_recall_0,train_recall_1,train_fscore_0,train_fscore_1,test_MSE,test_r2,train_MSE,train_r2,exec_time
0,machine learning,3,standard + new features,"(1980, 1989)",LogisticRegression,"{'penalty': 'l2', 'solver': 'lbfgs', 'C': 1, '...",SMOTE + random undersampling,classification,0.716571,0.555613,"[[628, 247], [249, 626]]",0.716078,0.717068,0.717714,0.715429,0.716895,0.716247,0.735515,0.534943,"[[4812, 2109], [1552, 5369]]",0.756128,0.717973,0.695275,0.775755,0.724426,0.745746,,,,,1.19
0,machine learning,3,standard + new features,"(1980, 1989)",SVC,"{'kernel': 'linear', 'C': 0.1, 'probability': ...",SMOTE + random undersampling,classification,0.709357,0.57335,"[[549, 306], [191, 664]]",0.741892,0.684536,0.642105,0.776608,0.688401,0.727671,0.724924,0.538418,"[[4418, 2491], [1310, 5599]]",0.771299,0.692089,0.639456,0.810392,0.699217,0.746583,,,,,26.18
0,machine learning,3,standard + new features,"(1960, 1969)",SVC,"{'kernel': 'linear', 'C': 0.1, 'probability': ...",SMOTE + random undersampling,classification,0.707317,0.57495,"[[421, 235], [149, 507]]",0.738596,0.683288,0.641768,0.772866,0.686786,0.725322,0.703701,0.564036,"[[4309, 2878], [1381, 5806]]",0.757293,0.668586,0.599555,0.807848,0.669255,0.731649,,,,,29.34
0,machine learning,3,standard + new features,"(1980, 1989)",RandomForestClassifier,"{'n_estimators': 25, 'max_depth': 8}",SMOTE + random undersampling,classification,0.704778,0.610477,"[[623, 256], [263, 616]]",0.70316,0.706422,0.70876,0.700796,0.705949,0.703598,0.747111,0.498126,"[[5018, 1906], [1596, 5328]]",0.758694,0.736522,0.724726,0.769497,0.741321,0.752649,,,,,1.6
0,machine learning,3,standard + new features,"(1980, 1989)",KNeighborsClassifier,"{'n_neighbors': 30, 'metric': 'manhattan'}",SMOTE + random undersampling,classification,0.703991,0.746228,"[[622, 280], [254, 648]]",0.710046,0.698276,0.689579,0.718404,0.699663,0.708197,0.735044,0.516129,"[[4906, 2031], [1645, 5292]]",0.748893,0.722655,0.707222,0.762866,0.727461,0.742216,,,,,2.05
0,machine learning,3,standard + new features,"(1960, 1969)",RandomForestClassifier,"{'n_estimators': 25, 'max_depth': 8}",SMOTE + random undersampling,classification,0.697227,0.61119,"[[408, 241], [152, 497]]",0.728571,0.673442,0.628659,0.765794,0.674938,0.716655,0.727064,0.523077,"[[4501, 2682], [1239, 5944]]",0.784146,0.68908,0.626618,0.827509,0.696587,0.751977,,,,,1.44
0,machine learning,3,standard + new features,"(1960, 1969)",DecisionTreeClassifier,"{'criterion': 'gini', 'max_depth': 5}",SMOTE + random undersampling,classification,0.696541,0.578819,"[[411, 225], [161, 475]]",0.718531,0.678571,0.646226,0.746855,0.680464,0.711078,0.719373,0.545907,"[[4415, 2760], [1267, 5908]]",0.777015,0.681587,0.615331,0.823415,0.686785,0.745818,,,,,0.87
0,machine learning,3,standard + new features,"(1980, 1989)",DecisionTreeClassifier,"{'criterion': 'gini', 'max_depth': 5}",SMOTE + random undersampling,classification,0.69397,0.638907,"[[553, 326], [212, 667]]",0.722876,0.671702,0.629124,0.758817,0.672749,0.712607,0.73433,0.517037,"[[4608, 2316], [1363, 5561]]",0.77173,0.705979,0.665511,0.803148,0.714696,0.751436,,,,,1.16
0,machine learning,3,standard + new features,"(1960, 1969)",LogisticRegression,"{'penalty': 'l2', 'solver': 'lbfgs', 'C': 1, '...",SMOTE + random undersampling,classification,0.69076,0.576071,"[[426, 245], [170, 501]]",0.714765,0.671582,0.634873,0.746647,0.672455,0.707128,0.711089,0.564499,"[[4674, 2522], [1636, 5560]]",0.740729,0.687949,0.649528,0.772651,0.692137,0.727844,,,,,1.01
0,machine learning,3,standard + new features,"(1980, 1989)",SVC,"{'kernel': 'rbf', 'C': 10, 'gamma': 0.1, 'prob...",SMOTE + random undersampling,classification,0.689966,0.595677,"[[563, 324], [226, 661]]",0.713561,0.671066,0.634724,0.745209,0.671838,0.706197,0.735855,0.534114,"[[4365, 2563], [1097, 5831]]",0.799158,0.694663,0.630052,0.841657,0.7046,0.761128,,,,,69.97


# Export

In [20]:
# Export the aggregated results to Google Drive.

# Imported under an alias so that it does not overwrite the PyDrive client
# named `drive` that the dataset download cell relies on.
from google.colab import drive as colab_drive

# Mount Google Drive inside the Colab runtime.
colab_drive.mount('/content/drive',force_remount=True)

# The file name carries the new-features variant `x` used for this run.
df_tot.to_csv('/content/drive/My Drive/Colab Notebooks/datasets/results_ML_4.1_' + str(x) + '.csv')

Mounted at /content/drive
