In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Read the two CSV files into DataFrames. `train` contains the `popularity`
# column that is split off as the target further down; `test` is kept as the
# separate test table.
train = pd.read_csv(filepath_or_buffer='/kaggle/input/spotify-song-details/train.csv')
test = pd.read_csv(filepath_or_buffer='/kaggle/input/spotify-song-details/test.csv')

In [None]:
# Split the training table into the prediction target (`popularity`) and
# the remaining columns, which are used as features.
y = train['popularity']
X = train.drop(columns=['popularity'])

In [None]:
# Define categorical columns for label encoding
categorical_columns = ['track_genre', 'key', 'mode']

# Function to label encode categorical columns
def label_encode(X, encoders=None):
    """Replace every column listed in ``categorical_columns`` by integer codes.

    ``X`` is modified in place and is also returned, so both
    ``label_encode(df)`` and ``df = label_encode(df)`` work.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame that contains all columns named in ``categorical_columns``.
    encoders : dict, optional
        Maps column name -> fitted ``LabelEncoder``. For a column that is
        already present in the dict, the stored encoder is reused, so the
        frame receives the same label -> code mapping as the frame the
        encoder was fitted on. For a column that is absent, a new encoder
        is fitted on ``X`` and stored in the dict. With the default
        ``None`` a fresh encoder is fitted for every column on every call
        (the historical behaviour), which means two frames encoded by
        separate calls may get different codes for the same label.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in as ``X``.

    Raises
    ------
    KeyError
        If ``X`` lacks one of the categorical columns.
    ValueError
        Raised by ``LabelEncoder.transform`` when a reused encoder meets a
        label it was not fitted on.
    """
    for column in categorical_columns:
        if encoders is not None and column in encoders:
            # Reuse the vocabulary learned earlier (e.g. on the training set).
            X[column] = encoders[column].transform(X[column])
            continue

        encoder = LabelEncoder()
        X[column] = encoder.fit_transform(X[column])
        if encoders is not None:
            # Remember the fitted encoder so later frames can share it.
            encoders[column] = encoder
    return X

In [None]:
# Create a custom transformer for label encoding
class LabelEncoderTransformer:
    """Label-encode ``categorical_columns`` with a vocabulary learned in ``fit``.

    ``fit`` learns one ``LabelEncoder`` per categorical column and
    ``transform`` applies those encoders to a copy of its input, so every
    frame passed through the same fitted instance receives the same
    label -> code mapping.

    Calling ``transform`` on an instance that was never fitted keeps the
    earlier behaviour: the frame is encoded by ``label_encode`` with
    encoders fitted on that frame alone.
    """

    def fit(self, X, y=None):
        """Learn the label vocabulary of every categorical column of ``X``.

        ``y`` is accepted for API compatibility and ignored. Returns
        ``self``. Raises ``KeyError`` if ``X`` lacks a categorical column.
        """
        self.encoders_ = {
            column: LabelEncoder().fit(X[column])
            for column in categorical_columns
        }
        return self

    def transform(self, X):
        """Return an encoded copy of ``X``; ``X`` itself is left unchanged.

        Labels that were not present when ``fit`` was called are encoded
        as ``-1`` instead of raising.
        """
        encoders = getattr(self, 'encoders_', None)
        if encoders is None:
            # Not fitted: fall back to per-frame encoding as before.
            return label_encode(X.copy())

        encoded = X.copy()
        for column, encoder in encoders.items():
            # classes_ is sorted, so the position of a label is its code.
            code_of = {label: code for code, label in enumerate(encoder.classes_)}
            encoded[column] = (
                encoded[column].map(code_of).fillna(-1).astype('int64')
            )
        return encoded

In [None]:
# Split the data into training and validation sets.
# NOTE: `X` and `y` are rebound to the training portion (80%) here, so
# re-running this cell on its own splits the already reduced data again;
# re-run the cell that defines `X` and `y` first.
X, X_val, y, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Work on a copy so that later column assignments on `X_test` do not write
# through to the raw `test` frame.
X_test = test.copy()

In [None]:
# Preparing the training and validation sets.
# At this point `X`/`y` hold the training portion and `X_val`/`y_val` the
# 20% hold-out produced by train_test_split. Drawing a second validation
# sample out of `X` would overwrite that hold-out, leaving those rows unused
# for both training and validation, so the existing split is used as is.
# Copies are taken so later column assignments do not touch the split results.
X_train = X.copy()
y_train = y.copy()
X_val = X_val.copy()
y_val = y_val.copy()

In [None]:
# Identifier / free-text columns that are not used as model features.
unused_columns = ['track_id', 'artists', 'album_name', 'track_name']

# Drop unnecessary columns from training, validation and test data.
# Dropping first yields new frames, so the column assignments below do not
# write into the frames these three were derived from.
X_train = X_train.drop(columns=unused_columns)
X_val = X_val.drop(columns=unused_columns)
X_test = X_test.drop(columns=unused_columns)

# 1. Encode boolean column 'explicit'
X_train['explicit'] = X_train['explicit'].astype(int)
X_val['explicit'] = X_val['explicit'].astype(int)
X_test['explicit'] = X_test['explicit'].astype(int)

# 2. Label-encode the categorical columns.
# The encoder is fitted on the training data only and the validation data is
# mapped through the same vocabulary. Fitting a separate encoder per frame
# would assign different codes to the same label whenever the two frames do
# not contain exactly the same set of labels.
label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder().fit(X_train[column])
    # classes_ is sorted, so the position of a label is its code.
    code_of = {label: code for code, label in enumerate(encoder.classes_)}
    X_train[column] = encoder.transform(X_train[column])
    # Labels never seen in training become -1 instead of raising.
    X_val[column] = X_val[column].map(code_of).fillna(-1).astype('int64')
    label_encoders[column] = encoder

# The categorical columns of X_test are left untouched in this cell, as
# before; encode them with `label_encoders` before predicting on X_test.
X_val.head()

In [None]:
# Give the validation frame exactly the training frame's columns, in the
# same order; a column the validation frame lacks is created and filled with 0.
X_val = X_val.reindex(X_train.columns, axis='columns', fill_value=0)

In [None]:
# Sanity check before training: print the column names, dtypes and non-null
# counts of the prepared training features.
X_train.info()

In [None]:
# Training the model
# n_jobs=-1 builds the trees on all available cores instead of a single one;
# random_state still fixes the seed used to build the forest.
model = RandomForestRegressor(random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

In [None]:
# Predicting on validation set: one predicted popularity value per row of X_val.
predictions = model.predict(X_val)

In [None]:
# Validation MSE: mean of the squared differences between the true
# popularity values and the model's predictions (lower is better).
mse = mean_squared_error(y_true=y_val, y_pred=predictions)
print(f"Validation MSE: {mse}")