# hit-song-classifier
## Random Forest Classification

The first thing we'll do is define our imports and declare any constants.

In [1]:
# Define our imports
import numpy as np
import sklearn

from pyspark.sql import DataFrame, SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Define our constants
TARGET_DATA = "../../data/join_datasets1.csv"
RANDOM_SEED = 0
TARGET_FEATURES = [
    "danceability",
    "duration",
    "energy",
    "key",
    "loudness",
    "song_hotttnesss",
    "tempo",
    "time_signature",
]


Next, we shall split our data in a 60-20-20 train_test_val split. Then we shall build our set of inputs X. We don't want to include the features `[song, artist, year]` since they are keys. We also should define our label `class` since that is our classification feature.

In [2]:
def feature_transformer(df: DataFrame) -> DataFrame:
    """Assembles feature vectors in dataframe

    Args:
        df (DataFrame): input

    Returns:
        DataFrame: feature-label dataframe
    """
    va = VectorAssembler(inputCols=TARGET_FEATURES, outputCol="features")
    va_df = va.transform(df)
    return va_df


def load_data(filename: str) -> DataFrame:
    """Load data from file

    Args:
        filename (str): filename

    Returns:
        DataFrame: pyspark dataframe
    """
    spark = SparkSession.builder.appName("hit-song-classifier-rfc").getOrCreate()
    return spark.read.csv(filename, header=True, inferSchema=True)


def split_data(df: DataFrame) -> tuple[DataFrame, DataFrame, DataFrame]:
    """Splits data to train, val, test splits

    Args:
        df (DataFrame): full dataframe

    Returns:
        DataFrame: _description_
    """
    train, test = df.randomSplit([0.8, 0.2], seed=RANDOM_SEED)
    train, val = train.randomSplit([0.8, 0.2], seed=RANDOM_SEED)
    return train, val, test


df = feature_transformer(load_data(TARGET_DATA))
train_df, val_df, test_df = split_data(df)
print(f"Train: {train_df.count()}\nTest: {test_df.count()}\nVal: {val_df.count()}")


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/25 22:38:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Train: 57
Test: 16
Val: 14


## Hyperparameter search: training multiple estimators

Here we will train multiple estimators with different hyperparameter settings. For RandomForestClassifiers, the hyperparameters we've chosen include `maxDepth` and `numTrees`.

In [3]:
def train_estimators(data: DataFrame, estimator_type: any, param_name: str, param_vals: list[int], **kwargs) -> list:
    """Trains estimators

    Args:
        data (DataFrame): dataframe
        estimator_type (RandomForestClassifier): estimator
        param_name (str): parameter name
        param_vals (list[int]): parameter values

    Returns:
        list: list of trained estimators
    """
    estimators = []
    for val in param_vals:
        estimator = estimator_type(**{param_name: val}, **kwargs)
        estimator.fit(data)
        estimators.append(estimator)
        print(f"Training {estimator}")
    return estimators

In [6]:
max_depth = [1, 5, 10, 20, 30]
rfc_multi = train_estimators(train_df, RandomForestClassifier, "maxDepth", max_depth, featuresCol="features", labelCol="class", seed=RANDOM_SEED)
print(rfc_multi)

Training RandomForestClassifier_9ddf2e880a91
Training RandomForestClassifier_8e56f4c54f4c
Training RandomForestClassifier_10f45c4441e1
Training RandomForestClassifier_0fa5a10a5dd8
Training RandomForestClassifier_203161100a23
[RandomForestClassifier_9ddf2e880a91, RandomForestClassifier_8e56f4c54f4c, RandomForestClassifier_10f45c4441e1, RandomForestClassifier_0fa5a10a5dd8, RandomForestClassifier_203161100a23]


We want to plot estimator scores to pick the estimator with the best score.

In [5]:
def score_estimators(estimators: list, data: DataFrame) -> list:
    """Scores estimators

    Args:
        estimators (list): list of estimators
        data (DataFrame): dataframe

    Returns:
        list: list of scores
    """
    evaluator = MulticlassClassificationEvaluator(labelCol="class", predictionCol="prediction", metricName="accuracy")

    predicted = [estimator.transform(data) for estimator in estimators]
    return [evaluator.evaluate(pred) for pred in predicted]

print(f"train: {score_estimators(rfc_multi, train_df)}")
print(f"val: {score_estimators(rfc_multi, val_df)}")
print(f"test: {score_estimators(rfc_multi, test_df)}")

AttributeError: 'RandomForestClassifier' object has no attribute 'transform'