In [3]:
!pip install imblearn

Collecting imblearn
  Using cached https://files.pythonhosted.org/packages/81/a7/4179e6ebfd654bd0eac0b9c06125b8b4c96a9d0a8ff9e9507eb2a26d2d7e/imblearn-0.0-py2.py3-none-any.whl
Collecting imbalanced-learn (from imblearn)
  Using cached https://files.pythonhosted.org/packages/c8/81/8db4d87b03b998fda7c6f835d807c9ae4e3b141f978597b8d7f31600be15/imbalanced_learn-0.7.0-py3-none-any.whl
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.7.0 imblearn-0.0


In [4]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.tree import export_graphviz, DecisionTreeClassifier, _tree, DecisionTreeRegressor, plot_tree
from bokeh.plotting import figure
from sklearn import tree
from sklearn import metrics
from collections import Counter
from imblearn.over_sampling import SMOTE
from bokeh.io import show
import pandas as pd

In [5]:
##### DECISON TREE (POPULARITY) #####

# Loading the file
df = pd.read_csv('data.csv', sep=",", encoding = "ISO-8859-1") 

# Drop non numerical variables
df = df.drop(['artists'], axis=1)
df = df.drop(['name'], axis=1)
df = df.drop(['id'], axis=1)
df = df.drop(['release_date'], axis=1)
df = df.drop(['year'], axis=1)


# Sample to avoid oversampling
df_ok = df[df['popularity'] > 80]
df_ko = df.sample(df_ok.shape[0])
df_test = pd.concat([df_ok, df_ko])

# Create the famous variable 
df_test['famous'] = (df_test['popularity'] > 80).astype(int)

# Delete popularity
df_test = df_test.drop(['popularity'], axis=1)

# split
xtrain, xtest, ytrain, ytest = train_test_split(df_test.loc[:,df_test.columns != 'famous'],
                                            df_test['famous'],
                                            test_size =0.2,
                                            random_state =42)

print('Original data shape %s' % Counter(ytrain))

# learn
DR = DecisionTreeClassifier(criterion = "gini", max_depth = 5, random_state=12, min_samples_split=5)
DR = DR.fit(xtrain, ytrain)

# score
y_pred = DR.predict(xtest)
print(metrics.classification_report(ytest,y_pred))
metrics.confusion_matrix(ytest,y_pred)
export_graphviz(DR,
            out_file="Arbre.dot",
            feature_names= xtest.columns,
            class_names=['famous','not_famous'],
            rounded =True,
            proportion =False,
            node_ids = True,
            filled =True)

variables = list(xtest.columns)
counts = DR.feature_importances_

#global feature importance
x_range = sorted(variables, key =lambda x: counts[variables.index(x)], reverse=True)
print(x_range)
p = figure(x_range=x_range,
           plot_height=420,
           plot_width =1000,
           title="Features importance")

p.vbar(x = variables, top = counts, width = 0.5)
p.xaxis.major_label_orientation = 0.5
show(p)

Original data shape Counter({0: 366, 1: 365})
              precision    recall  f1-score   support

           0       0.85      0.84      0.84        87
           1       0.86      0.86      0.86        96

    accuracy                           0.85       183
   macro avg       0.85      0.85      0.85       183
weighted avg       0.85      0.85      0.85       183

['loudness', 'danceability', 'valence', 'energy', 'speechiness', 'duration_ms', 'explicit', 'instrumentalness', 'mode', 'liveness', 'acousticness', 'key', 'tempo']


In [7]:
##### DECISON TREE (HIT SONGS) #####

# DataSet de départ
data = pd.read_csv("data.csv", encoding="iso-8859-1")

# DataSet des Top Hit (jusqu'à 2017)
top_songs = pd.read_csv("top_data.csv", encoding="iso-8859-1")

# On filtre donc notre DataSet de départ pour conserver seuelement les musiques avec date > 2017
data_2017 = data[data["year"]<2017]

# On recherche l'existence des musiques dans le DataSet des Hit Songs pour créer une nouvelle variable de succès 
data_2017["top"] = data_2017["name"].isin(top_songs["title"])
                                                   
# Affichage de la nouvelle variable
print(data_2017["top"].value_counts())
print(data_2017["name"][data_2017["top"]==True])


# Drop non numerical variables
data_2017 = data_2017.drop(['artists'], axis=1)
data_2017 = data_2017.drop(['name'], axis=1)
data_2017 = data_2017.drop(['id'], axis=1)
data_2017 = data_2017.drop(['release_date'], axis=1)
data_2017 = data_2017.drop(['year'], axis=1)


# Sample to avoid oversampling
df_ok = data_2017[data_2017['top'] == 1]
df_ko = data_2017.sample(df_ok.shape[0])
df_test = pd.concat([df_ok, df_ko])

# Delete popularity
df_test = df_test.drop(['popularity'], axis=1)

# split
xtrain, xtest, ytrain, ytest = train_test_split(df_test.loc[:,df_test.columns != 'top'],
                                            df_test['top'],
                                            test_size =0.2,
                                            random_state =42)

print('Original data shape %s' % Counter(ytrain))

# learn
DR = DecisionTreeClassifier(criterion = "gini", max_depth = 5, random_state=12, min_samples_split=5)
DR = DR.fit(xtrain, ytrain)

# score
y_pred = DR.predict(xtest)
print(metrics.classification_report(ytest,y_pred))
metrics.confusion_matrix(ytest,y_pred)
export_graphviz(DR,
            out_file="Arbre.dot",
            feature_names= xtest.columns,
            class_names=['famous','not_famous'],
            rounded =True,
            proportion =False,
            node_ids = True,
            filled =True)

variables = list(xtest.columns)
counts = DR.feature_importances_

#global feature importance
x_range = sorted(variables, key =lambda x: counts[variables.index(x)], reverse=True)
print(x_range)
p = figure(x_range=x_range,
           plot_height=420,
           plot_width =1000,
           title="Features importance")

p.vbar(x = variables, top = counts, width = 0.5)
p.xaxis.major_label_orientation = 0.5
show(p)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


False    141307
True      20846
Name: top, dtype: int64
32                 Loving You
336          Indian Love Call
760                    Always
828             Runaway Train
883                     Babel
                 ...         
169479            Be Yourself
169483    Can I Get A Witness
169497             I Miss You
169500             Hallelujah
169502       She's Mine Pt. 2
Name: name, Length: 20846, dtype: object
Original data shape Counter({True: 18844, False: 14509})
              precision    recall  f1-score   support

       False       0.71      0.46      0.56      3593
        True       0.68      0.86      0.76      4746

    accuracy                           0.69      8339
   macro avg       0.69      0.66      0.66      8339
weighted avg       0.69      0.69      0.67      8339

['acousticness', 'duration_ms', 'speechiness', 'instrumentalness', 'danceability', 'liveness', 'valence', 'energy', 'loudness', 'explicit', 'key', 'mode', 'tempo']
