# Modélisation

In [1]:
import pandas as pd
import numpy as np
import missingno as msno


In [2]:
df = pd.read_csv('tracks.csv')
df.release_date = pd.to_datetime(df.release_date)


In [None]:
df_2000 = df[df.release_date > '2000']
df_2000.drop(['id'], axis=1, inplace=True)
df_2000.drop_duplicates(inplace=True)
df_2000['Year'] = pd.to_datetime(df_2000.release_date).dt.year
df_2000['month'] = pd.to_datetime(df_2000.release_date).dt.month


In [28]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, learning_curve, cross_val_predict, cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, ElasticNet, SGDRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import BayesianRidge
from sklearn.kernel_ridge import KernelRidge
from catboost import CatBoostRegressor
R_algorithm = [LinearRegression(), ElasticNet(), SGDRegressor(
), XGBRegressor(), SVR(), BayesianRidge(), KernelRidge(), CatBoostRegressor()]


In [6]:
X = df_2000.drop(['popularity', 'name', 'artists',
                 'release_date', 'id_artists'], axis=1)
y = df_2000['popularity']

# Convert the list column to string column

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Preprocess the training data
num_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols = X.select_dtypes(include='object').columns.tolist()

num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(transformers=[('num', num_transformer, num_cols),
                                               ('cat', cat_transformer, cat_cols)])


MAE: 13.284096741596448


In [None]:
import os
import warnings
import sys
import mlflow.sklearn
import logging
from urllib.parse import urlparse


# MLFLOW

In [9]:

experiment_id = mlflow.create_experiment("Spotify_prediction")


In [13]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
eval_data = X_train
eval_data["target"] = y_train


In [26]:
for algorithm in R_algorithm:

    pipe = make_pipeline(preprocessor, PolynomialFeatures(), algorithm)
    pipe.fit(X_train, y_train)
    print("l'algorithme {} est terminé, enregistrement dans mlflow ").__format__(
        str(algorithm))
    with mlflow.start_run(experiment_id=experiment_id):

        model_info = mlflow.sklearn.log_model(pipe, "model")
        for k, v in pipe.named_steps[-1].get_params().items():
            mlflow.log_param(k, v)
        result = mlflow.evaluate(
            model_info.model_uri,
            eval_data,
            targets="target",
            model_type="regressor",
            evaluators=["default"],
        )


[Pipeline] . (step 1 of 3) Processing columntransformer, total=   0.1s
[Pipeline]  (step 2 of 3) Processing polynomialfeatures, total=   0.2s
[Pipeline] ........ (step 3 of 3) Processing elasticnet, total=   2.6s


2023/02/21 15:52:16 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/02/21 15:52:18 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Permutation is used.
Permutation explainer: 2001it [08:41,  3.80it/s]                          
Unable to serialize underlying model using MLflow, will use SHAP serialization


#GENRE

In [9]:
genre = pd.read_csv('data_110000k_lignes.csv')


In [10]:
liste_genre = genre.genre.unique().tolist()

In [11]:
import requests

def get_genre(track_name):

    base_url = 'http://ws.audioscrobbler.com/2.0/'
    params = {'method': 'track.search',
            'track': track_name,
            'api_key': 'd30646344918494a4e45ea08ad6fc629',
            'format': 'json'}

    # Make the request to LastFM
    response = requests.get(base_url, params=params)

    # Check to make sure the request was successful
    if response.status_code == 200:
        # Get the tags from the response
        data = response.json()
        results = data['results']['trackmatches']['track']
        try :
            artist_name = results[0]['artist'] 
        except :
            artist_name = 'None'
    
    base_url = 'http://ws.audioscrobbler.com/2.0/'
    params = {'method': 'track.gettoptags',
            'artist': artist_name,
            'track': track_name,
            'api_key': 'd30646344918494a4e45ea08ad6fc629',
            'format': 'json'}

    # Make the request to LastFM
    response = requests.get(base_url, params=params)

    # Check to make sure the request was successful
    if response.status_code == 200:
        # Get the tags from the response
        tags = response.json().get('toptags', {}).get('tag', [])
        genre = 
        # Print out the tags
        for tag in tags:
            var = tag.get('name')
            if var in liste_genre:
                genre = var
                break
            return genre
    else :
        return 'Others'
    


In [12]:
get_genre('Thriller')

'pop'

In [13]:
import pandas as pd


In [14]:
df1 = pd.read_csv('df_projet_recapitulatif_200.csv')


In [15]:
df1.drop_duplicates('track_id', inplace=True)
df1.drop_duplicates(['track_duration', 'track_release_date', 'nombre_artist',
                     'track_release_month', 'danceability', 'energy', 'key', 'loudness',
                     'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
                     'valence', 'tempo', 'time_signature', 'pays_du_producteur',
                     'popularity'], inplace=True)
df1.drop(['track_id', 'album_type'], axis=1, inplace=True)



In [16]:
df1['genre']= 'None'

In [17]:
list_of_all = []

In [23]:
for index, row in df1.iterrows():
    row.genre=1


KeyboardInterrupt: 

In [24]:
df1.genre

0         None
1         None
2         None
3         None
4         None
          ... 
230841    None
230842    None
230843    None
230844    None
230845    None
Name: genre, Length: 225542, dtype: object

In [None]:
df1.genre.value_counts()

None    225542
Name: genre, dtype: int64