# Regression Model

**Imports**

In [1]:
import numpy as np
import pandas as pd

**Load the dataset**

In [2]:
# Load the audio-features table; the path is relative to the notebook's working directory.
dataset_path = "DATA/spotify-audio-features-final.csv"
df = pd.read_csv(dataset_path)

**Split data into X feature columns and y label column**

In [3]:
# Target: the "popularity" column is the value the model learns to predict.
y = df["popularity"]
# Features: every remaining column of the frame.
X = df.drop(columns="popularity")

**Transformer**

In [4]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# "genre" is the only column that is one-hot encoded; every other column of X is
# standardised. The numeric list is derived from the categorical list so the two
# cannot drift apart.
# NOTE(review): this assumes every non-genre column is numeric — confirm against the CSV.
categorical_features = ["genre"]
numerical_features = X.columns.drop(labels=categorical_features)

transformer = ColumnTransformer([
    ("num", StandardScaler(), numerical_features),
    # handle_unknown="ignore": a genre that was not present when the encoder was fitted
    # is encoded as an all-zero row instead of raising, so predicting on the held-out
    # split or on new data does not fail on an unseen category.
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
])

**Polynomial Converter**

In [5]:
from sklearn.preprocessing import PolynomialFeatures
# Expand the transformed features with all degree-2 terms (squares and pairwise products).
# include_bias=False leaves out the constant column of ones.
polynomial_converter = PolynomialFeatures(
    degree=2,
    include_bias=False,
)

**Estimator**

In [6]:
from sklearn.linear_model import ElasticNet
# Linear model with a combined L1/L2 penalty: alpha sets the overall penalty strength,
# l1_ratio=0.5 weights the L1 and L2 parts equally, and max_iter caps the solver's
# iteration count.
elastic_net_settings = {"alpha": 0.01, "l1_ratio": 0.5, "max_iter": 10000}
estimator = ElasticNet(**elastic_net_settings)

**Pipeline**

In [7]:
from sklearn.pipeline import Pipeline
# Steps run in the listed order: column-wise preprocessing, polynomial expansion,
# then the regressor.
pipeline_steps = [
    ("transformer", transformer),
    ("polynomial_converter", polynomial_converter),
    ("estimator", estimator),
]
pipe = Pipeline(steps=pipeline_steps)

**Train | Test Split and Model Fit**

In [8]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

# Fit every pipeline step on the training rows only.
pipe.fit(X_train, y_train)

**Evaluation**

In [9]:
from sklearn.metrics import mean_squared_error

# Predict on both splits with the fitted pipeline so train and test error can be compared.
train_predictions = pipe.predict(X_train)
test_predictions = pipe.predict(X_test)

# Mean squared error per split, then its square root to get RMSE in the target's units.
train_mse = mean_squared_error(y_train, train_predictions)
test_mse = mean_squared_error(y_test, test_predictions)

RMSE_train = np.sqrt(train_mse)
RMSE_test = np.sqrt(test_mse)

In [10]:
# Root-mean-squared error of the pipeline's predictions on the training split.
RMSE_train

11.325604376403815

In [11]:
# Root-mean-squared error of the pipeline's predictions on the held-out test split.
RMSE_test

11.29922886021701

**Saving the model**

In [12]:
from pathlib import Path

from joblib import dump

# Persist the whole fitted pipeline (preprocessing, polynomial expansion, regressor)
# as a single artifact.
# NOTE(review): `pipe` was fitted on X_train only; refit on the full dataset before
# saving if the artifact is meant to use all available rows.
model_path = '../model_assets/model.joblib'

# Create the target directory first; dump() raises FileNotFoundError if it is missing.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

dump(pipe, model_path)

['../model_assets/model.joblib']