# Preprocessing

In [14]:
import os
import io
import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import warnings
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
warnings.filterwarnings('ignore')

##### Import data + artist popularity ###

In [15]:
# Load the track-level data and the per-artist aggregates; only the artist
# name and its popularity are needed from the latter.
dfData = pd.read_csv("data.csv")
dataArtist = pd.read_csv("data_by_artist.csv")
dataArtist = dataArtist[["artists", "popularity"]]

# `artists` holds a stringified list such as "['A', 'B']". Remove the list
# punctuation as literal characters (regex=False: "[" and "]" are regex
# metacharacters and the default of `regex` differs between pandas versions),
# then split into one row per artist.
artist_names = dfData["artists"]
for token in ("[", "]", "'"):
    artist_names = artist_names.str.replace(token, "", regex=False)
dfData["artists"] = artist_names.str.split(",")
dfData = dfData.explode("artists")
# Splitting "A, B" on "," yields " B": without this strip, every artist after
# the first keeps a leading space and can never match `dataArtist`.
dfData["artists"] = dfData["artists"].str.strip()
# NOTE(review): artist names that themselves contain a comma or an apostrophe
# are still mangled by this text-based parsing - confirm whether that matters.

# Attach the artist popularity to each (track, artist) row. Both frames have a
# `popularity` column, so pandas suffixes them with _x (track) and _y (artist).
df = pd.merge(dfData, dataArtist, on="artists")
df = df.rename(columns={"popularity_x": "popularity", "popularity_y": "artistPop"})
# Back to one row per track: keep the first matched artist for each track id.
df = df.drop_duplicates(subset=["id"])
data = df

##### Add a new column flagging top 100 hits ###

In [16]:
# Top-hits dataset (covers songs up to 2017).
top_songs = pd.read_csv("top_data.csv", encoding="iso-8859-1")

# Keep only the tracks released before 2017, i.e. the period the top-hits
# dataset covers. `.copy()` makes this an independent frame so that adding a
# column below is not a chained assignment on a slice of `data`.
data_2017 = data[data["year"] < 2017].copy()

# Success flag: True when the track name appears verbatim among the top-hit
# titles (exact string match on the name only).
data_2017["top"] = data_2017["name"].isin(top_songs["title"])
df = data_2017

##### Add a grade for song names

In [17]:
# Song-name grades; the "Unnamed: 0" column is discarded right after loading
# (presumably a saved index column - TODO confirm).
name_good = pd.read_csv("name_grade.csv").drop(columns=["Unnamed: 0"])

# Inner join on the track name, then collapse back to a single row per track id.
df = (
    df.merge(name_good, on=["name"], how="inner")
      .drop_duplicates(subset=["id"])
)

##### Keep only the last n years of a DataFrame, counting back from a given year

In [18]:
def split_df_by_year(df, range_max, year):
    """
        Keep only the rows whose 'year' falls in the `range_max` years ending at `year`.

        df : input dataframe; must contain a 'year' column
        range_max : number of years to keep, counting back from `year`
        year : release year of the song to predict (last year kept, inclusive)
        Returns a new dataframe with the rows where
        year - range_max < df['year'] <= year; `df` itself is left untouched.
    """
    first_excluded_year = year - range_max
    mask = (df['year'] > first_excluded_year) & (df['year'] <= year)

    # Return an independent copy rather than a slice of `df`, so callers can
    # add or rename columns on the result without chained-assignment warnings.
    return df.loc[mask].copy()
         
# Restrict the dataset to a 5-year window ending in 2015.
df = split_df_by_year(df, range_max=5, year=2015)

##### Creating train/test datasets

In [19]:
# Capitalised names for four of the audio features. Reassign instead of using
# inplace=True: `df` comes from a filtered frame, and rebinding keeps this cell
# free of in-place mutation of a slice.
df = df.rename(columns={
    'energy': 'Energy',
    "danceability": "Danceability",
    "loudness": "Loudness(dB)",
    "acousticness": "Acousticness",
})

# Columns fed to the model, in the order they appear in the feature matrix.
feature_columns = [
    'Energy', 'Danceability', 'Loudness(dB)', 'Acousticness',
    "duration_ms", "instrumentalness", "speechiness", "tempo", "valence",
    "year", "artistPop", "name_good",
]

x = df.loc[:, feature_columns].values
y = df.loc[:, 'top'].values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

# NOTE: class rebalancing is currently disabled. If it is re-enabled
# (SMOTE for over-sampling or RandomUnderSampler for under-sampling), it has
# to run here - after the split, on x_train / y_train only - and through
# `fit_resample`; the older `fit_sample` name no longer exists in recent
# imbalanced-learn releases.

# Fit the scaler on the training part only, then apply the same scaling to the
# test part so no test-set statistics leak into training.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

## Apply a model now