In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('train.csv')
columns_to_fill = [
    'harmonic_scale_1', 
    'artist_count', 
    'key_variety', 
    'tonal_mode_1', 
    'tonal_mode_0', 
    'time_signature_0'
]

# Fill null values with the median for each specified column
for column in columns_to_fill:
    median_value = df[column].median()
    df[column].fillna(median_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values a

In [3]:

df = df.drop(['id', 'composition_label_0','composition_label_1','track_identifier','creator_collective','composition_label_2','publication_timestamp','weekday_of_release'], axis=1)
df = df.drop(['vocal_presence_0', 'album_name_length','harmonic_scale_0','tonal_mode_0','groove_efficiency_2','groove_efficiency_1','time_signature_0','groove_efficiency_2','beat_frequency_0','beat_frequency_2'], axis=1)

In [4]:
# Fill numeric columns with mean
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Fill categorical columns with mode
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

# Verify missing values
print("Number of missing values after filling:\n", df.isna().sum())

Number of missing values after filling:
 emotional_charge_2            0
beat_frequency_1              0
organic_texture_2             0
harmonic_scale_1              0
intensity_index_0             0
duration_ms_0                 0
artist_count                  0
album_component_count         0
emotional_charge_1            0
emotional_charge_0            0
tonal_mode_2                  0
key_variety                   0
performance_authenticity_2    0
performance_authenticity_0    0
season_of_release             0
time_signature_1              0
duration_ms_2                 0
lunar_phase                   0
instrumental_density_2        0
organic_texture_0             0
vocal_presence_2              0
tonal_mode_1                  0
vocal_presence_1              0
intensity_index_1             0
organic_immersion_0           0
instrumental_density_1        0
organic_immersion_2           0
duration_consistency          0
organic_texture_1             0
rhythmic_cohesion_0           0

In [5]:
# Encode categorical columns
label_encoders = {}
for col in categorical_cols:
	le = LabelEncoder()
	df[col] = le.fit_transform(df[col])
	label_encoders[col] = le

X = df.drop('target', axis=1)
y = df['target']


In [None]:
# from sklearn.svm import SVR
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# # Step 1: Initialize and train the SVR model
# svr_model = SVR(kernel='rbf')  # 'rbf' is a common kernel for non-linear data
# svr_model.fit(X_train, y_train)

# # Step 2: Make predictions
# y_pred = svr_model.predict(X_test)

# # Step 3: Evaluate the model
# mse = mean_squared_error(y_test, y_pred)
# rmse = np.sqrt(mse)
# r2 = r2_score(y_test, y_pred)

# # Print performance metrics
# print("\nSVR Model Performance:")
# print(f"Mean Squared Error (MSE): {mse:.4f}")
# print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
# print(f"R² Score: {r2:.4f}")

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Step 1: Scale the features (SVR is sensitive to feature scales)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 2: Define the SVR model and hyperparameter grid
svr = SVR()
# param_grid = {
#     'kernel': ['rbf', 'linear'],  # Kernel types
#     'C': [1, 10],           # Regularization parameter
#     'epsilon': [0.1, 0.5]  # Margin of tolerance
# }
param_grid = {
    'kernel': ['rbf'],  # Different kernel types
    'C': [10],  # Regularization parameter
    'epsilon': [0.5]  # Degree of the polynomial kernel function (only for 'poly')
}
# Step 3: Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=svr,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='neg_mean_squared_error',  # Optimize for MSE
    n_jobs=-1  # Use all available CPU cores
)

# Step 4: Fit GridSearchCV to find the best model
grid_search.fit(X_train_scaled, y_train)

# Step 5: Get the best model and its parameters
best_svr = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score (Negative MSE):", grid_search.best_score_)

# Step 6: Make predictions with the best model
y_pred = best_svr.predict(X_test_scaled)

# Step 7: Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("\nBest SVR Model Performance on Test Set:")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R² Score: {r2:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")

Best Parameters: {'C': 10, 'epsilon': 0.5, 'kernel': 'rbf'}
Best Cross-Validation Score (Negative MSE): -269.8554148042603

Best SVR Model Performance on Test Set:
Mean Squared Error (MSE): 262.4842
Root Mean Squared Error (RMSE): 16.2014
R² Score: 0.4364


### Test the test data set

Import set data set and pre process it

In [None]:
df = pd.read_csv('test.csv')
columns_to_fill = [
    'harmonic_scale_1', 
    'artist_count', 
    'key_variety', 
    'tonal_mode_1', 
    'tonal_mode_0', 
    'time_signature_0'
]

# Fill null values with the median for each specified column
for column in columns_to_fill:
    median_value = df[column].median()
    df[column].fillna(median_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values a

Remove Uncorellated columns and unnecessary columns

In [None]:
ids = df['id']
df = df.drop(['id', 'composition_label_0','composition_label_1','track_identifier','creator_collective','composition_label_2','publication_timestamp','weekday_of_release'], axis=1)
df = df.drop(['vocal_presence_0', 'album_name_length','harmonic_scale_0','tonal_mode_0','groove_efficiency_2','groove_efficiency_1','time_signature_0','groove_efficiency_2','beat_frequency_0','beat_frequency_2'], axis=1)

In [None]:
# Fill numeric columns with mean
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Fill categorical columns with mode
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

# Verify missing values
print("Number of missing values after filling:\n", df.isna().sum())

Number of missing values after filling:
 emotional_charge_2            0
beat_frequency_1              0
organic_texture_2             0
harmonic_scale_1              0
intensity_index_0             0
duration_ms_0                 0
artist_count                  0
album_component_count         0
emotional_charge_1            0
emotional_charge_0            0
tonal_mode_2                  0
key_variety                   0
performance_authenticity_2    0
performance_authenticity_0    0
season_of_release             0
time_signature_1              0
duration_ms_2                 0
lunar_phase                   0
instrumental_density_2        0
organic_texture_0             0
vocal_presence_2              0
tonal_mode_1                  0
vocal_presence_1              0
intensity_index_1             0
organic_immersion_0           0
instrumental_density_1        0
organic_immersion_2           0
duration_consistency          0
organic_texture_1             0
rhythmic_cohesion_0           0

In [None]:
# Encode categorical columns
label_encoders = {}
for col in categorical_cols:
	le = LabelEncoder()
	df[col] = le.fit_transform(df[col])
	label_encoders[col] = le
	


In [None]:
# Ensure test data columns match training data columns
df_aligned = df[X_train.columns]

y_pred = best_svr.predict(df_aligned)
submission = pd.DataFrame({'id': ids, 'target': y_pred})
submission.to_csv('submission_svr.csv', index=False)
print("Submission file created: submission_svr.csv")



Submission file created: submission_svr.csv
