In [7]:
# --- Imports and data load ---
import pandas as pd
import numpy as np

# Load the dataset from its relative CSV path (adjust the path if needed).
df = pd.read_csv('../data/Spotify_Youtube.csv')

# Summaries for a quick sanity check: how many columns there are per dtype,
# and the ten columns with the highest number of missing values.
dtype_counts = df.dtypes.value_counts()
top_missing = df.isna().sum().sort_values(ascending=False).head(10)

print("Dataset shape:", df.shape)
print("\nColumn types:\n", dtype_counts)
print("\nMissing values (top):\n", top_missing)
print("\nTotal rows:", len(df))

Dataset shape: (20718, 28)

Column types:
 float64    15
object     12
int64       1
Name: count, dtype: int64

Missing values (top):
 Description       876
Stream            576
Comments          569
Likes             541
official_video    470
Licensed          470
Views             470
Channel           470
Title             470
Url_youtube       470
dtype: int64

Total rows: 20718


In [8]:
# --- Drop rows missing Youtube or Spotify info ---
# A row is kept only when both URL columns are populated.
required_url_cols = ['Url_spotify', 'Url_youtube']
before = len(df)
df = df.dropna(subset=required_url_cols)
removed = before - len(df)
print("Removed", removed, "rows with missing Youtube or Spotify")

Removed 470 rows with missing Youtube or Spotify


In [9]:
# --- Target creation ---
# Binary label derived from Album_type: 'single' -> 1, 'album' / 'compilation' -> 0.
# Series.map yields NaN for any Album_type value that is not a key of this dict.
album_type_to_target = {'single': 1, 'album': 0, 'compilation': 0}
df['Target'] = df['Album_type'].map(album_type_to_target)

# Class balance expressed as proportions rather than raw counts.
target_share = df['Target'].value_counts(normalize=True)
print("\nTarget distribution:\n", target_share)


Target distribution:
 Target
0    0.759976
1    0.240024
Name: proportion, dtype: float64


In [10]:
# --- Feature engineering (before dropping columns) ---
# Runs before the text columns are dropped because Song_Name_Length needs 'Track'.

# Normalize Loudness: min-max scaling onto [0, 1]
loudness_min = df['Loudness'].min()
loudness_max = df['Loudness'].max()
df['Loudness_norm'] = (df['Loudness'] - loudness_min) / (loudness_max - loudness_min)

# Fitness for Clubs: row-wise mean of Danceability, Energy and Valence,
# plus the normalized loudness computed above
df['Fitness_for_Clubs'] = df[['Danceability', 'Energy', 'Valence']].mean(axis=1) + df['Loudness_norm']

# Song Name Length: number of whitespace-separated tokens in the track name.
# astype(str) turns a missing Track into the literal string 'nan', which counts as one token.
df['Song_Name_Length'] = df['Track'].astype(str).str.split().str.len()

# Key Code: integer category code of Key (pandas assigns -1 to missing values).
# Guarded so that re-running this cell after 'Key' has already been removed does not
# raise KeyError; the drop rebinds df instead of mutating it in place.
if 'Key' in df.columns:
    df['Key_Code'] = df['Key'].astype('category').cat.codes
    df = df.drop(columns=['Key'])

In [11]:
# --- Drop non-numeric columns except 'Album_type' which was mapped ---
# 'Album_type' is not in this list, so it stays in df next to the derived Target column.
drop_cols = [
    'Artist', 'Url_spotify', 'Track', 'Album', 'Uri', 'Url_youtube',
    'Title', 'Channel', 'Description', 'Licensed', 'official_video',
]
# errors='ignore' skips labels that are not present in the frame.
df = df.drop(columns=drop_cols, errors='ignore')

In [13]:
# --- Final structure check ---
# Separate the label from the remaining columns and report shapes and the dtype mix.
# NOTE(review): 'Album_type' (the column Target is mapped from) is never dropped, so it is
# presumably the remaining object-typed column in X -- confirm it is excluded before
# modeling, otherwise it leaks the label.
y = df['Target']
X = df.drop(columns=['Target'])
print(X.shape, y.shape)

feature_dtype_counts = X.dtypes.value_counts()
print("\nNumeric types:\n", feature_dtype_counts)

(20248, 20) (20248,)

Numeric types:
 float64    16
int64       2
object      1
int8        1
Name: count, dtype: int64


In [14]:
# Write the cleaned frame to CSV; index=False keeps the row index out of the file.
output_path = "../output/B_cleaned_ready_for_modeling.csv"
df.to_csv(output_path, index=False)