# MOUNT GOOGLE DRIVE + CREATE FOLDERS

In [1]:

import os

from google.colab import drive

# Mount Google Drive at /content/drive.
drive.mount('/content/drive')

# Root folder of the project on Drive; later cells build their paths from it.
BASE_PATH = "/content/drive/MyDrive/7006SCN_project"

# Create the project sub-folders (no error if they already exist).
for subfolder in ("data", "models", "metrics"):
    os.makedirs(f"{BASE_PATH}/{subfolder}", exist_ok=True)

print("BASE_PATH:", BASE_PATH)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
BASE_PATH: /content/drive/MyDrive/7006SCN_project


# INSTALL LIBRARIES

In [2]:
!pip -q install datasets pyspark pyarrow

# LOAD DATA

In [3]:
from datasets import load_dataset
import pandas as pd


# Pull the first 100,000 rows of the "train" split of the Genius lyrics dataset.
ds_k = load_dataset("sebastiandizon/genius-song-lyrics", split="train[:100000]")

df = ds_k.to_pandas()
print("Loaded df shape:", df.shape)
print(df.columns)

# Columns kept for the rest of the notebook; everything else is discarded.
keep_cols = ["title", "tag", "artist", "year", "views", "lyrics"]

# Single chained pass: project the columns, drop rows with a missing tag or
# lyrics, then drop rows whose lyrics are the empty string.
df = (
    df.loc[:, keep_cols]
    .dropna(subset=["tag", "lyrics"])
    .loc[lambda frame: frame["lyrics"].astype(str).str.len() > 0]
)

print("After cleaning:", df.shape)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loaded df shape: (100000, 11)
Index(['title', 'tag', 'artist', 'year', 'views', 'features', 'lyrics', 'id',
       'language_cld3', 'language_ft', 'language'],
      dtype='object')
After cleaning: (100000, 6)


# SAVE SAMPLE DATA TO DRIVE (PARQUET + CSV)

In [4]:
# Parquet copy of the cleaned sample (the file Spark reads back later).
raw_parquet = f"{BASE_PATH}/data/raw_sample_100k.parquet"
df.to_parquet(raw_parquet, index=False)

# CSV copy of the same frame.
raw_csv = f"{BASE_PATH}/data/raw_sample_100k.csv"
df.to_csv(raw_csv, index=False)

print("Saved Parquet:", raw_parquet)
print("Saved CSV:", raw_csv)

Saved Parquet: /content/drive/MyDrive/7006SCN_project/data/raw_sample_100k.parquet
Saved CSV: /content/drive/MyDrive/7006SCN_project/data/raw_sample_100k.csv


# START SPARK (SAFE CONFIG FOR COLAB)

In [5]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session with explicit memory and shuffle settings.
spark = (
    SparkSession.builder
    .appName("7006SCN_GeniusLyrics_100k")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "4g")
    .config("spark.sql.shuffle.partitions", "200")
    .getOrCreate()
)

print("Spark started")

# Read the Parquet sample written earlier, then report row count and schema.
sdf = spark.read.parquet(raw_parquet)
print("Spark rows:", sdf.count())
sdf.printSchema()


Spark started
Spark rows: 100000
root
 |-- title: string (nullable = true)
 |-- tag: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- year: long (nullable = true)
 |-- views: long (nullable = true)
 |-- lyrics: string (nullable = true)



# TRAIN/TEST SPLIT

In [6]:
from pyspark.sql.functions import col

# Keep only rows where both the lyrics text and the tag are present.
has_lyrics_and_tag = col("lyrics").isNotNull() & col("tag").isNotNull()
sdf = sdf.where(has_lyrics_and_tag)

# 80/20 random split with a fixed seed.
train_df, test_df = sdf.randomSplit(weights=[0.8, 0.2], seed=42)

print("Train rows:", train_df.count())
print("Test rows:", test_df.count())

Train rows: 79901
Test rows: 20099


# FEATURE PIPELINE: TAG -> label, LYRICS -> TFIDF FEATURES

In [7]:
from pyspark.ml.feature import (
    StringIndexer, RegexTokenizer, StopWordsRemover,
    HashingTF, IDF
)
from pyspark.ml import Pipeline

# Map the string "tag" column to a numeric "label" column. With
# handleInvalid="skip", Spark's StringIndexer drops rows it cannot index
# (e.g. labels not seen during fit) instead of raising.
label_indexer = StringIndexer(
    inputCol="tag",
    outputCol="label",
    handleInvalid="skip",
)

# Split lyrics on runs of non-word characters, then remove stop words.
tokenizer = RegexTokenizer(
    inputCol="lyrics",
    outputCol="tokens",
    pattern="\\W+",
)
remover = StopWordsRemover(
    inputCol="tokens",
    outputCol="filtered_tokens",
)

# Hashed term frequencies (2**18 = 262,144 buckets) re-weighted by IDF; the
# final "features" column is what every classifier consumes.
tf = HashingTF(
    inputCol="filtered_tokens",
    outputCol="rawFeatures",
    numFeatures=2 ** 18,
)
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)


# DEFINE 4 REAL ML MODELS (MLlib)

In [11]:
from pyspark.ml.classification import (LogisticRegression, NaiveBayes, RandomForestClassifier)
from pyspark.ml.classification import LinearSVC, OneVsRest

# Candidate classifiers, keyed by the name used for printing, metrics rows and
# the saved-model folder. All of them read the TF-IDF "features" column and the
# indexed "label" column.
models = {
    "LogisticRegression": LogisticRegression(featuresCol="features", labelCol="label", maxIter=20),
    "NaiveBayes": NaiveBayes(featuresCol="features", labelCol="label", smoothing=1.0),
    # Random forests are stochastic: pin the seed (same value as the train/test
    # split) so the trained forest and its metrics are repeatable across runs.
    "RandomForest": RandomForestClassifier(
        featuresCol="features", labelCol="label", numTrees=100, maxDepth=10, seed=42
    ),

    # LinearSVC is a binary classifier; OneVsRest wraps it for the multi-class tag label.
    "OneVsRest_LinearSVC": OneVsRest(
        classifier=LinearSVC(featuresCol="features", labelCol="label", maxIter=30, regParam=0.1),
        labelCol="label",
        featuresCol="features"
    )
}

# TRAIN + EVALUATE + SAVE MODELS + SAVE METRICS CSV

In [12]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import time

# One evaluator per reported metric; all compare "label" against "prediction".
evaluator_acc, evaluator_f1, evaluator_wpr, evaluator_wre = (
    MulticlassClassificationEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName=metric,
    )
    for metric in ("accuracy", "f1", "weightedPrecision", "weightedRecall")
)

# Accumulates one metrics dict per trained model.
results = []

In [13]:
# Cache train/test for repeated reading across models
train_df.cache()
test_df.cache()

# Row counts are the same for every model, so compute them once here instead
# of launching two extra Spark jobs per loop iteration. Counting also fills the
# caches before any training timer starts, so the first model's train_seconds
# is not inflated by the initial Parquet read and split.
train_rows = int(train_df.count())
test_rows = int(test_df.count())

# Start from an empty list so re-running this cell does not append duplicates.
results = []

for name, clf in models.items():
    print("\n==============================")
    print(f"Training: {name}")
    print("==============================")

    # Full pipeline per model: label indexing + text features + classifier.
    pipeline = Pipeline(stages=[label_indexer, tokenizer, remover, tf, idf, clf])

    # Time only the fit step.
    start = time.time()
    fitted = pipeline.fit(train_df)
    train_time = time.time() - start

    preds = fitted.transform(test_df)

    # The four metrics below only need the label and prediction columns. Cache
    # that narrow projection so the pipeline is applied to the test set once,
    # not once per metric; release it as soon as the metrics are computed.
    scored = preds.select("label", "prediction").cache()
    try:
        acc = evaluator_acc.evaluate(scored)
        f1  = evaluator_f1.evaluate(scored)
        wpr = evaluator_wpr.evaluate(scored)
        wre = evaluator_wre.evaluate(scored)
    finally:
        scored.unpersist()

    # Save Spark model to Drive
    model_path = f"{BASE_PATH}/models/{name}"
    fitted.write().overwrite().save(model_path)

    # Collect metrics
    results.append({
        "model": name,
        "accuracy": float(acc),
        "f1": float(f1),
        "weighted_precision": float(wpr),
        "weighted_recall": float(wre),
        "train_seconds": float(train_time),
        "train_rows": train_rows,
        "test_rows": test_rows
    })

    print(f"{name} metrics:")
    print(f"  accuracy={acc:.4f}")
    print(f"  f1={f1:.4f}")
    print(f"  weighted_precision={wpr:.4f}")
    print(f"  weighted_recall={wre:.4f}")
    print(f"  train_seconds={train_time:.1f}")
    print("Saved model:", model_path)



Training: LogisticRegression
LogisticRegression metrics:
  accuracy=0.8075
  f1=0.8385
  weighted_precision=0.8857
  weighted_recall=0.8075
  train_seconds=307.5
Saved model: /content/drive/MyDrive/7006SCN_project/models/LogisticRegression

Training: NaiveBayes
NaiveBayes metrics:
  accuracy=0.8624
  f1=0.8832
  weighted_precision=0.9162
  weighted_recall=0.8624
  train_seconds=114.4
Saved model: /content/drive/MyDrive/7006SCN_project/models/NaiveBayes

Training: RandomForest
RandomForest metrics:
  accuracy=0.8871
  f1=0.8368
  weighted_precision=0.8201
  weighted_recall=0.8871
  train_seconds=4377.8
Saved model: /content/drive/MyDrive/7006SCN_project/models/RandomForest

Training: OneVsRest_LinearSVC
OneVsRest_LinearSVC metrics:
  accuracy=0.9059
  f1=0.8938
  weighted_precision=0.8886
  weighted_recall=0.9059
  train_seconds=352.0
Saved model: /content/drive/MyDrive/7006SCN_project/models/OneVsRest_LinearSVC
