In [5]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.tree import export_graphviz, DecisionTreeClassifier, _tree, DecisionTreeRegressor, plot_tree
from bokeh.plotting import figure
from sklearn import tree
from sklearn import metrics
from collections import Counter
from imblearn.over_sampling import SMOTE
from bokeh.io import show
import pandas as pd

In [6]:
# Read the raw table from disk (comma-separated, Latin-1 encoded).
df = pd.read_csv('data.csv', sep=",", encoding="ISO-8859-1")

# Binary target: 1 when the popularity score is strictly greater than 50, else 0.
df['famous'] = df['popularity'].gt(50).astype(int)

# Columns kept out of the feature set. 'popularity' has to go because the
# target above is derived directly from it (keeping it would leak the label).
df = df.drop(columns=['artists', 'name', 'id', 'release_date', 'popularity', 'year'])

# Hold out 20% of the rows for evaluation; every column except 'famous' is a
# feature. The fixed seed keeps the split identical from run to run.
xtrain, xtest, ytrain, ytest = train_test_split(df.loc[:, df.columns != 'famous'],
                                                df['famous'],
                                                test_size=0.2,
                                                random_state=42)

print('Original data shape %s' % Counter(ytrain))

# Oversample the training split only; the test split is left untouched.
# SMOTE is used here; ADASYN and RandomOverSampler are alternative choices.
# SMOTE draws synthetic samples at random, so it is seeded: without a seed the
# resampled set (and every metric computed downstream) changes between runs
# even though the split and the tree themselves are seeded.
sm = SMOTE(random_state=42)
xtrain, ytrain = sm.fit_resample(xtrain, ytrain)
print('Resampled data shape %s' % Counter(ytrain))

# Fit a depth-limited decision tree (Gini criterion) on the resampled training data.
DR = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=12, min_samples_split=5)
DR = DR.fit(xtrain, ytrain)

# Score on the hold-out split.
y_pred = DR.predict(xtest)
print(metrics.classification_report(ytest, y_pred))
# Printed explicitly: a bare expression that is not the last line of a cell is
# evaluated and thrown away, so the matrix was never shown.
print(metrics.confusion_matrix(ytest, y_pred))

# Write the fitted tree to a Graphviz .dot file.
export_graphviz(DR,
                out_file="Arbre.dot",
                # Passed as a plain list rather than a pandas Index so the
                # argument is accepted regardless of how strictly it is validated.
                feature_names=list(xtest.columns),
                # class_names must be given in ascending label order:
                # label 0 -> not famous, label 1 -> famous.
                class_names=['not_famous', 'famous'],
                rounded=True,
                proportion=False,
                node_ids=True,
                filled=True)

variables = list(xtest.columns)
counts = DR.feature_importances_

# Global feature importance.
# Order the categorical x axis from most to least important feature. Sorting
# (name, importance) pairs avoids a list.index() lookup per feature; the sort is
# stable, so features with equal importance keep their original column order.
ranked_variables = [name for name, _ in sorted(zip(variables, counts),
                                               key=lambda pair: pair[1],
                                               reverse=True)]

# `height` / `width` replace `plot_height` / `plot_width`, which were deprecated
# in Bokeh 2.4 and removed in Bokeh 3.0.
p = figure(x_range=ranked_variables,
           height=420, width=1000,
           title="Features importance")

# Bars are keyed by feature name, so they land on the right category even
# though `variables` itself is not sorted.
p.vbar(x=variables, top=counts, width=0.5)
# Tilt the tick labels (angle in radians) so long feature names do not overlap.
p.xaxis.major_label_orientation = 0.5
# Rendering is left disabled; uncomment to display the chart.
#show(p)

Original data shape Counter({0: 107235, 1: 28692})
Resampled data shape Counter({0: 107235, 1: 107235})
              precision    recall  f1-score   support

           0       0.91      0.75      0.82     26816
           1       0.43      0.71      0.54      7166

    accuracy                           0.74     33982
   macro avg       0.67      0.73      0.68     33982
weighted avg       0.81      0.74      0.76     33982

