Import Libraries

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder


In [16]:
# Load the training data; the relative path is resolved against the current
# working directory of the kernel.
df = pd.read_csv(filepath_or_buffer='train.csv')

In [17]:
# Remove the first batch of columns that are excluded from the feature set.
df = df.drop(
    columns=[
        'id',
        'composition_label_0',
        'composition_label_1',
        'track_identifier',
        'creator_collective',
        'composition_label_2',
        'publication_timestamp',
        'weekday_of_release',
    ]
)

In [18]:
# Remove the second batch of columns that are excluded from the feature set.
# NOTE(review): 'groove_efficiency_2' was listed twice in this drop list; the
# repeat has been removed (the set of dropped columns is unchanged). Confirm
# the second occurrence was not meant to be a different column -- for example
# 'groove_efficiency_0', which is still present in the frame after this cell.
df = df.drop(
    columns=[
        'vocal_presence_0',
        'album_name_length',
        'harmonic_scale_0',
        'tonal_mode_0',
        'groove_efficiency_2',
        'groove_efficiency_1',
        'time_signature_0',
        'beat_frequency_0',
        'beat_frequency_2',
    ]
)

In [19]:
# Numeric imputation: every float64/int64 column gets its own column mean
# wherever a value is missing.
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
numeric_fill_values = df[numeric_cols].mean()
df[numeric_cols] = df[numeric_cols].fillna(value=numeric_fill_values)

# Categorical imputation: every object/category column gets its most frequent
# value. mode() may return several rows when values tie; the first row is used.
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
categorical_fill_values = df[categorical_cols].mode().iloc[0]
df[categorical_cols] = df[categorical_cols].fillna(value=categorical_fill_values)

# Report the remaining per-column missing-value counts.
print("Number of missing values after filling:\n", df.isna().sum())

Number of missing values after filling:
 emotional_charge_2            0
beat_frequency_1              0
organic_texture_2             0
harmonic_scale_1              0
intensity_index_0             0
duration_ms_0                 0
artist_count                  0
album_component_count         0
emotional_charge_1            0
emotional_charge_0            0
tonal_mode_2                  0
key_variety                   0
performance_authenticity_2    0
performance_authenticity_0    0
season_of_release             0
time_signature_1              0
duration_ms_2                 0
lunar_phase                   0
instrumental_density_2        0
organic_texture_0             0
vocal_presence_2              0
tonal_mode_1                  0
vocal_presence_1              0
intensity_index_1             0
organic_immersion_0           0
instrumental_density_1        0
organic_immersion_2           0
duration_consistency          0
organic_texture_1             0
rhythmic_cohesion_0           0

In [20]:
# Preview the first five rows of the frame after dropping and imputing.
df.head(n=5)

Unnamed: 0,emotional_charge_2,beat_frequency_1,organic_texture_2,harmonic_scale_1,intensity_index_0,duration_ms_0,artist_count,album_component_count,emotional_charge_1,emotional_charge_0,...,groove_efficiency_0,emotional_resonance_2,duration_ms_1,time_signature_2,rhythmic_cohesion_2,emotional_resonance_0,harmonic_scale_2,intensity_index_2,instrumental_density_0,target
0,0.48285,80.018,0.0201,1.0,0.789,154586.0,2.10703,6.0,0.64068,0.478923,...,1.150146,0.666,161853.0,4.0,0.612252,0.607,7.0,0.725,0.0,74
1,0.267862,147.966,0.334,6.0,0.715,46874.0,2.0,3.0,0.2552,0.559845,...,2.718967,0.361,155619.0,4.0,0.843,0.783,4.0,0.616045,0.0432,2
2,0.242606,142.98,0.111,4.0,0.604426,264665.0,2.0,14.0,0.456576,0.148544,...,1.270758,0.403,209378.0,4.0,0.612252,0.211,10.0,0.602,0.0,35
3,0.4264,123.063,0.196,5.0,0.685,209208.0,2.0,15.0,0.51834,0.252765,...,1.145485,0.52,219043.0,4.0,0.702,0.369,5.288894,0.82,0.000335,70
4,0.0,132.722,0.0811,6.0,0.856,215346.0,2.0,17.0,0.611499,0.540136,...,1.124836,0.0,258893.0,0.0,0.0,0.631,1.0,0.0221,0.0,78


In [21]:
# Integer-encode each categorical column. One LabelEncoder per column is kept
# in `label_encoders`, keyed by column name, so the fitted mapping stays
# available after this cell.
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col])

# Separate the regression target from the predictor columns.
y = df['target']
X = df.drop(columns=['target'])

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training portion (fit() returns the
# estimator itself, so the fitted model is bound to rf_model).
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out portion.
y_pred = rf_model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("\nModel Performance:")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R² Score: {r2:.4f}")




Model Performance:
Mean Squared Error (MSE): 129.3348
Root Mean Squared Error (RMSE): 11.3725
R² Score: 0.7223


Predicting with the Model

In [22]:
# Load the test data and drop the same columns that were dropped from the
# training frame.
# NOTE: `df` is rebound here, so the training frame is no longer reachable
# under that name after this cell runs.
df = pd.read_csv('test.csv')
df = df.drop(
    columns=[
        'id',
        'composition_label_0',
        'composition_label_1',
        'track_identifier',
        'creator_collective',
        'composition_label_2',
        'publication_timestamp',
        'weekday_of_release',
    ]
)
# NOTE(review): 'groove_efficiency_2' was listed twice in this drop list; the
# repeat has been removed (the set of dropped columns is unchanged). Confirm
# the second occurrence was not meant to be a different column.
df = df.drop(
    columns=[
        'vocal_presence_0',
        'album_name_length',
        'harmonic_scale_0',
        'tonal_mode_0',
        'groove_efficiency_2',
        'groove_efficiency_1',
        'time_signature_0',
        'beat_frequency_0',
        'beat_frequency_2',
    ]
)

# Fill numeric columns with mean
# NOTE(review): the fill values in this cell are computed from test.csv
# itself, not from the training data the model was fitted on -- confirm that
# this is intended.
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Fill categorical columns with mode (first row of mode() is used on ties)
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

# Verify missing values
print("Number of missing values after filling:\n", df.isna().sum())

Number of missing values after filling:
 emotional_charge_2            0
beat_frequency_1              0
organic_texture_2             0
harmonic_scale_1              0
intensity_index_0             0
duration_ms_0                 0
artist_count                  0
album_component_count         0
emotional_charge_1            0
emotional_charge_0            0
tonal_mode_2                  0
key_variety                   0
performance_authenticity_2    0
performance_authenticity_0    0
season_of_release             0
time_signature_1              0
duration_ms_2                 0
lunar_phase                   0
instrumental_density_2        0
organic_texture_0             0
vocal_presence_2              0
tonal_mode_1                  0
vocal_presence_1              0
intensity_index_1             0
organic_immersion_0           0
instrumental_density_1        0
organic_immersion_2           0
duration_consistency          0
organic_texture_1             0
rhythmic_cohesion_0           0

In [None]:
# Encode the test set's categorical columns with the encoders that were fitted
# on the training data (the `label_encoders` dict built when the training
# frame was encoded), instead of fitting fresh encoders here.
#
# Why: a LabelEncoder assigns integer codes from the sorted categories it sees
# during fit. Fitting new encoders on the test data would therefore produce
# codes that differ from the ones the model was trained on whenever the two
# files do not contain exactly the same category values. Re-creating
# `label_encoders` here would also discard the training-time mappings.
#
# LabelEncoder.transform raises ValueError for a value that was not seen during
# fit, so an unseen test category fails loudly rather than being mis-coded.
# A column with no training-time encoder raises KeyError for the same reason.
#
# Run this cell once per fresh load of the test data: the columns are replaced
# by their integer codes, so encoding them a second time will fail.
for col in categorical_cols:
    le = label_encoders[col]
    df[col] = le.transform(df[col])