# Modélisation

In [1]:
import pandas as pd
import numpy as np
import missingno as msno


In [2]:
df = pd.read_csv('tracks.csv')
df.release_date = pd.to_datetime(df.release_date)


In [None]:
df_2000 = df[df.release_date > '2000']
df_2000.drop(['id'], axis=1, inplace=True)
df_2000.drop_duplicates(inplace=True)
df_2000['Year'] = pd.to_datetime(df_2000.release_date).dt.year
df_2000['month'] = pd.to_datetime(df_2000.release_date).dt.month


In [28]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, learning_curve, cross_val_predict, cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, ElasticNet, SGDRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import BayesianRidge
from sklearn.kernel_ridge import KernelRidge
from catboost import CatBoostRegressor
R_algorithm = [LinearRegression(), ElasticNet(), SGDRegressor(
), XGBRegressor(), SVR(), BayesianRidge(), KernelRidge(), CatBoostRegressor()]


In [6]:
X = df_2000.drop(['popularity', 'name', 'artists',
                 'release_date', 'id_artists'], axis=1)
y = df_2000['popularity']

# Convert the list column to string column

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Preprocess the training data
num_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols = X.select_dtypes(include='object').columns.tolist()

num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(transformers=[('num', num_transformer, num_cols),
                                               ('cat', cat_transformer, cat_cols)])


MAE: 13.284096741596448


In [None]:
import os
import warnings
import sys
import mlflow.sklearn
import logging
from urllib.parse import urlparse


# MLFLOW

In [9]:

experiment_id = mlflow.create_experiment("Spotify_prediction")


In [13]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
eval_data = X_train
eval_data["target"] = y_train


In [26]:
for algorithm in R_algorithm:

    pipe = make_pipeline(preprocessor, PolynomialFeatures(), algorithm)
    pipe.fit(X_train, y_train)
    print("l'algorithme {} est terminé, enregistrement dans mlflow ").__format__(
        str(algorithm))
    with mlflow.start_run(experiment_id=experiment_id):

        model_info = mlflow.sklearn.log_model(pipe, "model")
        for k, v in pipe.named_steps[-1].get_params().items():
            mlflow.log_param(k, v)
        result = mlflow.evaluate(
            model_info.model_uri,
            eval_data,
            targets="target",
            model_type="regressor",
            evaluators=["default"],
        )


[Pipeline] . (step 1 of 3) Processing columntransformer, total=   0.1s
[Pipeline]  (step 2 of 3) Processing polynomialfeatures, total=   0.2s
[Pipeline] ........ (step 3 of 3) Processing elasticnet, total=   2.6s


2023/02/21 15:52:16 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/02/21 15:52:18 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Permutation is used.
Permutation explainer: 2001it [08:41,  3.80it/s]                          
Unable to serialize underlying model using MLflow, will use SHAP serialization


#GENRE

In [3]:
import pandas as pd

In [4]:
genre = pd.read_csv('data_110000k_lignes.csv')


In [5]:
liste_genre = genre.genre.unique().tolist()

In [49]:
base_url = 'http://ws.audioscrobbler.com/2.0/'
params = {'method': 'track.search',
        'track': 'DO NOT DISTURB MODE (feat. Ourealgoat, 365lit & Blase)',
        'api_key': 'd30646344918494a4e45ea08ad6fc629',
        'format': 'json'}

# Make the request to LastFM
response = requests.get(base_url, params=params)

# Check to make sure the request was successful
if response.status_code == 200:
    # Get the tags from the response
    data = response.json()
    results = data['results']['trackmatches']['track']
results

[]

In [47]:
base_url = 'http://ws.audioscrobbler.com/2.0/'
params = {'method': 'track.gettoptags',
        'artist': 'Trapt',
        'track': 'LOVE HATE RELATIONSHIP',
        'api_key': 'd30646344918494a4e45ea08ad6fc629',
        'format': 'json'}

# Make the request to LastFM
response = requests.get(base_url, params=params)
# Check to make sure the request was successful
if response.status_code == 200:
    # Get the tags from the response
    tags = response.json().get('toptags', {}).get('tag', [])
tags

[{'count': 100, 'name': '2013', 'url': 'https://www.last.fm/tag/2013'},
 {'count': 75, 'name': 'rock', 'url': 'https://www.last.fm/tag/rock'},
 {'count': 50,
  'name': 'post-grunge',
  'url': 'https://www.last.fm/tag/post-grunge'},
 {'count': 50,
  'name': 'love at first listen',
  'url': 'https://www.last.fm/tag/love+at+first+listen'},
 {'count': 50,
  'name': 'Music Choice: Rock',
  'url': 'https://www.last.fm/tag/Music+Choice:+Rock'},
 {'count': 25, 'name': '10s', 'url': 'https://www.last.fm/tag/10s'},
 {'count': 25, 'name': '2010s', 'url': 'https://www.last.fm/tag/2010s'},
 {'count': 25,
  'name': 'san jose sharks',
  'url': 'https://www.last.fm/tag/san+jose+sharks'},
 {'count': 25,
  'name': 'good left undone',
  'url': 'https://www.last.fm/tag/good+left+undone'}]

In [60]:
def get_genre(track_name,artist_name):

    base_url = 'http://ws.audioscrobbler.com/2.0/'
    params = {'method': 'track.gettoptags',
            'artist': artist_name,
            'track': track_name,
            'api_key': 'd30646344918494a4e45ea08ad6fc629',
            'format': 'json'}

    # Make the request to LastFM
    response = requests.get(base_url, params=params)
    # Check to make sure the request was successful
    if response.status_code == 200:
        # Get the tags from the response
        tags = response.json().get('toptags', {}).get('tag', [])
        for tag in tags:
            
            if tag.get('name') in liste_genre:
                return tag.get('name')

In [63]:
get_genre('Shape of you','ed Sheeran')

'pop'

In [62]:
get_genre('The Final Command - Live 1984')

TypeError: get_genre() missing 1 required positional argument: 'artist_name'

In [57]:
import pandas as pd
import os 
client_ID = os.getenv('CLIENT_ID')
client_secret = os.getenv('CLIENT_SECRET')
import base64

In [58]:
def get_access_token(client_id, client_secret):

    client_str = f"{client_id}:{client_secret}"
    client_b64 = base64.b64encode(client_str.encode()).decode()

    headers = {"Authorization": f"Basic {client_b64}"}
    data = {"grant_type": "client_credentials"}

    
    response = requests.post("https://accounts.spotify.com/api/token", headers=headers, data=data)

    token_data = response.json()
    
    return token_data['access_token']

In [59]:
get_access_token(client_ID,client_secret)

ConnectionError: HTTPSConnectionPool(host='accounts.spotify.com', port=443): Max retries exceeded with url: /api/token (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f57d0b543a0>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution'))

In [31]:
df1 = pd.read_csv('df_projet_recapitulatif_200.csv')


In [32]:
df1.drop_duplicates('track_id', inplace=True)
df1.drop_duplicates(['track_duration', 'track_release_date', 'nombre_artist',
                     'track_release_month', 'danceability', 'energy', 'key', 'loudness',
                     'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
                     'valence', 'tempo', 'time_signature', 'pays_du_producteur',
                     'popularity'], inplace=True)
df1.drop(['track_id', 'album_type'], axis=1, inplace=True)


In [33]:
df1['genre']= 'None'

In [34]:
list_of_all = []

In [43]:
for index, row in df1.iterrows():
    row.genre=row.genre.replace('genre',get_genre(row.track_name))


Tiempo es la Cuestión
Aggressive Perfector - Live 1984
Face The Slayer - Live 1984
The Final Command - Live 1984
Captor Of Sin - Live 1984
Necrophiliac - Live 1984
Crionics - Live 1984
Fight Till Death - Live 1984
The Antichrist - Live 1984
Haunting The Chapel - Live 1984
Black Magic - Live 1984
Die By The Sword - Live 1984
Evil Has No Boundaries - Live 1984
Show No Mercy - Live 1984
LV COCO
De Ti Me Enamoré
REMINISCE
Used To
FOTO DI NOI (feat. Drast)
Doidona
Estrellita Dónde Estás
Brahms Cancion de Cuna (a Dormir)
Arrorró Mi Niño
La Lechuza
Calla Pequeño
La Araña Pequeñita
María Tenía un Corderito
El Viejo Mac Donald
DON'T WAKE ME UP (feat. Jayci yucca)
KISS YOU
DUMB DUMB DUMB
LOVE HATE RELATIONSHIP


IndexError: list index out of range

In [36]:
df1.genre

0         None
1         None
2         None
3         None
4         None
          ... 
230841    None
230842    None
230843    None
230844    None
230845    None
Name: genre, Length: 225542, dtype: object

In [None]:
df1.genre.value_counts()

None    225542
Name: genre, dtype: int64