We begin by loading the dataset. Make sure the CSV file is available at `../data/Spotify_Youtube.csv`, relative to the notebook's location; otherwise edit the path in the cell below.

In [7]:
# Machine Learning - Assignment 2
# Section B: Data Pre-processing

import pandas as pd
import numpy as np

# Location of the raw dataset, relative to the notebook (edit as needed)
DATA_PATH = "../data/Spotify_Youtube.csv"

# Read the CSV into the working DataFrame used by every later cell
df = pd.read_csv(DATA_PATH)

We create several new features based on relevant aggregations and domain knowledge.

In [9]:
# --- Feature Engineering ---

# 1. Album Song Count
# For every row, count the non-null Track entries that share the row's Album
# value, broadcast back to the original row order via transform.
# NOTE(review): grouping is by album name only, so identically named albums
# from different artists share one count -- confirm this is intended.
album_song_count = df['Track'].groupby(df['Album']).transform('count')
df['Album_Song_Count'] = album_song_count
# ✅ Covered in class: grouping & aggregation (04-Data-Preprocessing)


In [10]:
# Artist Average Views
# Per-artist sum of Views, broadcast to every row of that artist.
# 'sum' skips NaN, so tracks without view data contribute nothing here.
artist_total_views = df.groupby('Artist')['Views'].transform('sum')

# Number of tracks per artist (kept under its original name for any later cell).
artist_song_count = df.groupby('Artist')['Track'].transform('count')

# Number of tracks per artist that actually have a Views value.
# This is the correct denominator: dividing by the full track count would treat
# every missing Views entry as 0 views and bias the average downwards.
artist_viewed_song_count = df.groupby('Artist')['Views'].transform('count')

# Mean of the known view counts. Artists with no view data at all yield 0 / 0,
# i.e. NaN, instead of a misleading average of 0.
df['Artist_Avg_Views'] = artist_total_views / artist_viewed_song_count

In [11]:
# Song name length, measured as the number of whitespace-separated tokens in Track
track_tokens = df['Track'].str.split()
df['Song_Name_Length'] = track_tokens.str.len()

# Total album length: sum of Duration_ms over all rows sharing the same Album
# value, broadcast back to each row of that album.
album_total_duration = df['Duration_ms'].groupby(df['Album']).transform('sum')
df['Total_Album_Length'] = album_total_duration

In [12]:
def _has_digit(value):
    """Return True if the string form of ``value`` contains at least one digit character."""
    return any(map(str.isdigit, str(value)))


# Flag tracks whose title contains a digit anywhere
df['Name_Contains_Number'] = df['Track'].apply(_has_digit)

In [8]:
# Number of missing values in each column (displayed as the cell output)
df.isnull().sum()

Unnamed: 0            0
Artist                0
Url_spotify           0
Track                 0
Album                 0
Album_type            0
Uri                   0
Danceability          2
Energy                2
Key                   2
Loudness              2
Speechiness           2
Acousticness          2
Instrumentalness      2
Liveness              2
Valence               2
Tempo                 2
Duration_ms           2
Url_youtube         470
Title               470
Channel             470
Views               470
Likes               541
Comments            569
Description         876
Licensed            470
official_video      470
Stream              576
dtype: int64

In [13]:

# 5. Fitness for Clubs
# Loudness is on a different scale from the other inputs, so rescale it to
# [0, 1] before averaging it with Danceability, Energy and Valence.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df['Loudness_Scaled'] = scaler.fit_transform(df[['Loudness']])
# Row-wise mean of the four components (NaN components are skipped by mean).
df['Fitness_for_Clubs'] = df[['Danceability', 'Energy', 'Valence', 'Loudness_Scaled']].mean(axis=1)
# ✅ Learned: Normalization, feature combination (04-Data-Preprocessing)

# 6. Log Views (Handling outliers)
# log1p = log(1 + x): compresses the long right tail and is defined at 0 views.
df['Log_Views'] = np.log1p(df['Views'])
# ✅ Transformation technique taught in class

# 8. Has Featured Artist
# Look for a "feat." / "ft." marker in either the artist name or the track title.
# The marker is matched case-insensitively and must start at a word boundary,
# so words that merely end in "ft." or "feat." (e.g. "Daft.", "Defeat.") are
# not flagged as featuring someone.
import re
featured_marker = r"\b(?:feat|ft)\."
df['Has_Featured_Artist'] = (
    df[['Artist', 'Track']]
    .astype(str)  # same NaN handling as str(x): missing values become 'nan'
    .apply(lambda col: col.str.contains(featured_marker, case=False, regex=True))
    .any(axis=1)
)

# 9. Tempo Binned
# Slow: [0, 90], Medium: (90, 130], Fast: (130, inf)
bins = [0, 90, 130, np.inf]
labels = ['Slow', 'Medium', 'Fast']
# include_lowest=True closes the first interval on the left, so a Tempo of
# exactly 0 is labelled 'Slow' instead of silently becoming NaN.
df['Tempo_Bin'] = pd.cut(df['Tempo'], bins=bins, labels=labels, include_lowest=True)

In [14]:
# --- Imputation ---
# Select every numeric column, compute its median once, and use that median
# to fill the column's missing entries. Non-numeric columns are not touched.
num_cols = df.select_dtypes(include=np.number).columns
column_medians = df[num_cols].median()
df[num_cols] = df[num_cols].fillna(column_medians)
# ✅ Statistical imputation: taught in class (mean/median) (04-Data-Preprocessing)


Missing values in numeric columns are filled with each column's median; non-numeric columns are left unchanged.

In [15]:
# --- Transformation ---
# Nothing further to do here: a log1p transform of Views was already applied.
# StandardScaler could additionally be applied later, during modeling.

# --- Exclusion ---
# Remove the link and free-text description columns. errors='ignore' skips any
# column that is not present, so re-running the cell does not fail.
columns_to_drop = ['Url_spotify', 'Url_youtube', 'Description']
df.drop(columns=columns_to_drop, errors='ignore', inplace=True)
# ✅ Covered in class: dropping uninformative features

import os
# Make sure the output folder exists before writing into it
os.makedirs("output", exist_ok=True)
# Save the processed data for later use; index=False omits the row index
df.to_csv("output/Processed_Spotify_Youtube.csv", index=False)