# Regression Model

**Imports**

In [1]:
import numpy as np
import pandas as pd

**Load the dataset**

In [2]:
# Read the audio-features table; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer="DATA/spotify-audio-features-final.csv")

**Split data into X feature columns and y label column**

In [3]:
# Target the model learns to predict.
y = df["popularity"]
# Every other column is kept as an input feature.
X = df.drop(columns="popularity")

**Transformer**

In [4]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

categorical_features = ["genre"]
# Every feature column that is not categorical is standardised as a number.
numerical_features = X.columns.drop(labels=categorical_features)

transformer = ColumnTransformer([
        ("num", StandardScaler(), numerical_features),
        # handle_unknown="ignore" encodes a genre that was absent when the
        # encoder was fitted as an all-zero row instead of raising. Without it,
        # a rare genre missing from one CV training fold (or first seen at
        # inference time) makes transform/predict fail.
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
])

**Polynomial Converter**

In [5]:
from sklearn.preprocessing import PolynomialFeatures
# degree and include_bias are written out at their library defaults; both are
# overridden by the grid search defined further down.
polynomial_converter = PolynomialFeatures(degree=2, include_bias=True)

**Estimator**

In [6]:
from sklearn.linear_model import ElasticNet
# alpha and l1_ratio are written out at their library defaults; the grid search
# defined further down tunes both, together with max_iter.
estimator = ElasticNet(alpha=1.0, l1_ratio=0.5)

**Pipeline**

In [7]:
from sklearn.pipeline import Pipeline
# Steps run in order: column-wise preprocessing, polynomial expansion, regressor.
# The step names are the prefixes used in the grid-search parameter keys.
pipe = Pipeline(
    steps=[
        ("transformer", transformer),
        ("polynomial_converter", polynomial_converter),
        ("estimator", estimator),
    ]
)

**Train | Test Split**

In [8]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for the final evaluation; the fixed random_state
# makes the split repeatable between runs.
split_parts = train_test_split(X, y, test_size=0.3, random_state=101)
X_train, X_test, y_train, y_test = split_parts

**Grid Search**

In [9]:
# Search space for the pipeline. Keys follow the "<step name>__<parameter>"
# convention so each value list is routed to the matching pipeline step.
# NOTE(review): ElasticNet fits its own intercept by default, so the
# include_bias axis is probably redundant and doubles the number of
# candidates; confirm before trimming it.
param_grid = dict(
    polynomial_converter__degree=[1, 2],
    polynomial_converter__include_bias=[True, False],
    estimator__alpha=[0.01, 0.03, 0.07, 0.1, 1, 5, 10, 50, 100],
    estimator__l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 0.993, 0.997, 1],
    estimator__max_iter=[10000],
)

In [10]:
from sklearn.model_selection import GridSearchCV
# Exhaustive search over param_grid, scored by negated RMSE (higher is better)
# with 20-fold cross-validation.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=20,
)

In [11]:
# Run the search on the training split only; the test split stays unseen.
grid.fit(X=X_train, y=y_train)

In [12]:
# Show the parameter combination the search selected.
grid.best_params_

{'estimator__alpha': 0.03,
 'estimator__l1_ratio': 1,
 'estimator__max_iter': 10000,
 'polynomial_converter__degree': 2,
 'polynomial_converter__include_bias': True}

**Evaluation**

In [13]:
from sklearn.metrics import mean_squared_error
# Predict the held-out rows with the selected pipeline, then take the square
# root of the mean squared error to get RMSE in the target's own units.
y_test_pred = grid.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)
RMSE = np.sqrt(test_mse)


In [14]:
# Show the root-mean-squared error measured on the held-out test split.
RMSE

11.280061894171585

**Saving the model**

In [15]:
# Retrain only the selected pipeline on the full dataset. Calling
# grid.fit(X, y) here would repeat the entire cross-validated search and could
# pick hyperparameters other than the ones whose test RMSE was measured.
# grid.predict delegates to best_estimator_, so the grid object still serves
# the retrained model.
grid.best_estimator_.fit(X, y)

In [16]:
from joblib import dump
# Persist the fitted search object; dump returns the list of files it wrote,
# which becomes the cell output.
# NOTE: joblib files are pickle-based, so only load them from trusted sources.
dump(value=grid, filename='../model_assets/model.joblib')

['../model_assets/model.joblib']