# Lab 2

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split

## Load data

In [None]:
# Read the Spotify song-attributes CSV (path is relative to the working
# directory) into a DataFrame that the rest of the notebook works from.
spotify_song_df = pd.read_csv(filepath_or_buffer="data/Spotify_Song_Attributes.csv")

In [None]:
# Display the dtype of every column in the loaded frame.
spotify_song_df.dtypes

In [None]:
# Print pandas' built-in summary of the frame (columns, non-null counts, dtypes).
spotify_song_df.info()

## Data Wrangling

In [None]:
# Pull the two columns of interest out as plain NumPy arrays.
Z = spotify_song_df["energy"].to_numpy()
Y = spotify_song_df["loudness"].to_numpy()

# NaN-ignoring means, so a missing entry does not turn the mean into NaN.
Z_mean, Y_mean = np.nanmean(Z), np.nanmean(Y)

# Centre each variable on its own mean (NaN entries remain NaN).
Z_shifted, Y_shifted = Z - Z_mean, Y - Y_mean

In [None]:
# Raw energy vs. loudness, with the point of means marked by a red "X".
fig, ax = plt.subplots(figsize=(12, 6), dpi=100)
sns.scatterplot(x=Z, y=Y, ax=ax)
ax.scatter(
    Z_mean, Y_mean, color="red", s=50, marker="X", label="Mean (Energy, Loudness)"
)
ax.set_title("Scatter Plot Energy and Loudness")
ax.set_xlabel("Energy")
ax.set_ylabel("Loudness")
ax.legend()
ax.grid(True)

In [None]:
# Same scatter after centring both variables on their means.
fig, ax = plt.subplots(figsize=(12, 6), dpi=100)
sns.scatterplot(x=Z_shifted, y=Y_shifted, ax=ax)
ax.set_title("Scatter Plot Energy and Loudness (Mean Shifted)")
ax.set_xlabel("Energy (Mean Shifted)")
ax.set_ylabel("Loudness (Mean Shifted)")
ax.grid(True)

## Linear Regression (LR)

### `Loudness ~ Energy` With Shifting

In [None]:
# Keep only the two modelling columns and drop every row in which either
# value is missing, so the fits use complete (energy, loudness) pairs.
energy_loudness_subset_df = spotify_song_df.loc[:, ["energy", "loudness"]].dropna()

In [None]:
# Add mean-centred versions of both columns next to the originals.
energy_loudness_subset_df["energy_shifted"] = energy_loudness_subset_df["energy"].sub(
    energy_loudness_subset_df["energy"].mean()
)
energy_loudness_subset_df["loudness_shifted"] = energy_loudness_subset_df["loudness"].sub(
    energy_loudness_subset_df["loudness"].mean()
)

In [None]:
# Least-squares fit of mean-shifted loudness on mean-shifted energy.
res_shifted = stats.linregress(
    energy_loudness_subset_df["energy_shifted"],
    energy_loudness_subset_df["loudness_shifted"],
)
# Printed in the order: intercept, slope.
print(res_shifted.intercept, res_shifted.slope)

### `Loudness ~ Energy` Without Shifting

In [None]:
# Least-squares fit of loudness on energy using the raw (un-centred) columns.
res_not_shifted = stats.linregress(
    energy_loudness_subset_df["energy"],
    energy_loudness_subset_df["loudness"],
)
# Printed in the order: intercept, slope.
print(res_not_shifted.intercept, res_not_shifted.slope)

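* The slope is the same with and without mean shifting.
* The intercepts differ: the mean-shifted fit has an intercept of essentially zero, because a least-squares line always passes through the point of means, which shifting moves to the origin.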

### `tempo ~ acousticness` With Shifting

In [None]:
# Complete (acousticness, tempo) pairs only: rows missing either value are dropped.
acousticness_tempo_df = spotify_song_df.loc[:, ["acousticness", "tempo"]].dropna()

# Add mean-centred versions of both columns next to the originals.
acousticness_tempo_df["acousticness_shifted"] = acousticness_tempo_df["acousticness"].sub(
    acousticness_tempo_df["acousticness"].mean()
)
acousticness_tempo_df["tempo_shifted"] = acousticness_tempo_df["tempo"].sub(
    acousticness_tempo_df["tempo"].mean()
)

In [None]:
# Least-squares fit of mean-shifted tempo on mean-shifted acousticness.
# Note: this rebinds `res_shifted`, replacing the energy/loudness result.
res_shifted = stats.linregress(
    acousticness_tempo_df["acousticness_shifted"],
    acousticness_tempo_df["tempo_shifted"],
)
# Printed in the order: intercept, slope.
print(res_shifted.intercept, res_shifted.slope)

### `tempo ~ acousticness` Without Shifting

In [None]:
# Least-squares fit of tempo on acousticness using the raw (un-centred) columns.
# The result is bound to `res_not_shifted`, the same name the energy/loudness
# section uses for its un-shifted fit, so the mean-shifted tempo fit held in
# `res_shifted` stays available for a side-by-side comparison.
res_not_shifted = stats.linregress(
    x=acousticness_tempo_df["acousticness"],
    y=acousticness_tempo_df["tempo"],
)
# Printed in the order: intercept, slope.
print(res_not_shifted.intercept, res_not_shifted.slope)

* Here too, the slopes are the same but the intercepts differ.
* The mean-shifted fit has an intercept of essentially zero, since the least-squares line passes through the point of means, which shifting moves to the origin.

## Splitting Data

In [None]:
# Randomly partition the complete (energy, loudness) pairs into a training
# and a test portion; the fixed random_state makes the split repeatable.
# NOTE(review): train_size=0.33 puts one third of the rows in the training
# set and the remaining two thirds in the test set - confirm this is the
# intended proportion (as opposed to test_size=0.33).
Z_train, Z_test, Y_train, Y_test = train_test_split(
    energy_loudness_subset_df.loc[:, "energy"],
    energy_loudness_subset_df.loc[:, "loudness"],
    random_state=49,
    train_size=0.33,
)

In [None]:
# Least-squares fit of loudness on energy using the training split only.
res = stats.linregress(Z_train, Y_train)
# Printed in the order: intercept, slope.
print(res.intercept, res.slope)

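Fitting on the training split alone gives approximately the same intercept and slope as fitting on all rows, because a random split leaves the distribution of the data essentially unchanged.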

In [None]:
# Predict loudness for the held-out energies from the fitted line:
# intercept + slope * energy.
Y_pred = res.intercept + res.slope * Z_test

Mean squared prediction error (MSPE) on the test split:

In [None]:
# Sum of squared test-set residuals divided by the number of predictions.
((Y_test - Y_pred) ** 2).sum() / len(Y_pred)