## Imports

In [None]:
import os, sys
from pathlib import Path
from dotenv import load_dotenv
import pandas as pd

# Populate the process environment from a .env file via python-dotenv.
load_dotenv()

# Fail early with an explicit message: Path(None) would otherwise raise an
# opaque TypeError when DATA_PATH is missing, and an empty value would
# silently resolve to the current directory.
_data_path_value = os.getenv("DATA_PATH")
if not _data_path_value:
    raise RuntimeError(
        "Environment variable DATA_PATH is not set; "
        "define it in the environment or in a .env file."
    )

DATA_PATH = Path(_data_path_value)

# Notebook-only workaround: switch the working directory to the parent of
# DATA_PATH (presumably the project root) so the `src.*` imports that follow
# can be resolved.
root_path = DATA_PATH.parent
os.chdir(root_path)

import src.training.plotting as p
import src.training.pre_training as t
import src.training.postprocessing as pp

from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt


## Prepare Data

In [None]:
# Fetch the music data through the project's pre_training helper module.
# The rest of the notebook treats the result as a DataFrame that has a
# "popularity" column.
df = t.get_music_df()

In [None]:
# Accumulator for plot specifications that are later passed to
# p.plots_from_list. Each entry added in this notebook is a 5-tuple of
# (matplotlib plotting function, kwargs dict, str, str, str); the three
# strings look like x-axis label, y-axis label and title — TODO confirm
# against p.plots_from_list.
plot_list = []

In [None]:
# Feature table: every column of df except the "popularity" target.
X = df.drop(columns="popularity")

# Target in two forms: the raw column, and the same column mapped
# element-wise through t.encode_popularity.
y_original = df["popularity"]
y_classes = y_original.apply(t.encode_popularity)

## Distribution of Popularity

In [None]:
# Randomly under-sample using the encoded classes as the target, with a
# fixed seed; only the resampled labels (y_1) are plotted here.
rus = RandomUnderSampler(random_state=42)
X_1, y_1 = rus.fit_resample(X, y_classes)

# Queue three bar charts: encoded classes, raw values, and encoded classes
# after under-sampling.
# NOTE(review): bar positions come from set iteration order while heights
# come from pp.count_distribution — confirm both use the same ordering.
bar_sources = (
    (y_classes, "Distribution of Popularity classes"),
    (y_original, "Distribution of Popularity raw"),
    (y_1, "Distribution of Popularity classes with RUS"),
)
plot_list.extend([
    (
        plt.bar,
        {"x": list(set(labels)), "height": pp.count_distribution(labels)},
        "popularity",
        "count",
        title,
    )
    for labels, title in bar_sources
])

## Features vs. Popularity

In [None]:
# Add each feature plot to list
# Queue one scatter plot per feature column, each against the raw popularity.
# The equivalent plot against the encoded classes (y_classes) is not queued.
for name, values in pd.DataFrame(X).items():
    scatter_kwargs = {"x": values, "y": y_original}
    plot_list.append(
        (plt.scatter, scatter_kwargs, name, "popularity", f"Distribution of {name}")
    )

## Generate Plots

In [None]:
# Report how many plot specifications were collected, then pass them to the
# project plotting helper with save=True.
num_plots = len(plot_list)
print("Num plots", num_plots)
p.plots_from_list(
    "Analysis of popularity regarding all features.",
    plot_list,
    "Popularity Analysis",
    save=True,
    model_type="all",
)