<a href="https://colab.research.google.com/github/kritikaamohan/music_popularity_prediction/blob/main/music_popularity_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

In [None]:
# Load the Spotify track data from the Colab working directory and preview the first rows as text.
csv_path = '/content/Spotify_data.csv'
df = pd.read_csv(csv_path)
print(df.head())

In [None]:
# Rich (HTML) preview of the first five rows of the freshly loaded frame.
display(df.head())

In [4]:
# Remove the 'Unnamed: 0' column (presumably the row index that was saved into the CSV -- confirm).
# Reassigning instead of using inplace=True, together with errors='ignore', keeps this cell
# idempotent: re-running it after the column is already gone no longer raises a KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Preview the first five rows again to check the frame after the 'Unnamed: 0' column was dropped.
display(df.head())

In [6]:
# Print the column names, non-null counts and dtypes of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227 entries, 0 to 226
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Track Name        227 non-null    object 
 1   Artists           227 non-null    object 
 2   Album Name        227 non-null    object 
 3   Album ID          227 non-null    object 
 4   Track ID          227 non-null    object 
 5   Popularity        227 non-null    int64  
 6   Release Date      227 non-null    object 
 7   Duration (ms)     227 non-null    int64  
 8   Explicit          227 non-null    bool   
 9   External URLs     227 non-null    object 
 10  Danceability      227 non-null    float64
 11  Energy            227 non-null    float64
 12  Key               227 non-null    int64  
 13  Loudness          227 non-null    float64
 14  Mode              227 non-null    int64  
 15  Speechiness       227 non-null    float64
 16  Acousticness      227 non-null    float64
 1

In [7]:
# List the column labels that remain in the frame.
print(df.columns)

Index(['Track Name', 'Artists', 'Album Name', 'Album ID', 'Track ID',
       'Popularity', 'Release Date', 'Duration (ms)', 'Explicit',
       'External URLs', 'Danceability', 'Energy', 'Key', 'Loudness', 'Mode',
       'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness',
       'Valence', 'Tempo'],
      dtype='object')


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Popularity,Duration (ms),Danceability,Energy,Key,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo
count,227.0,227.0,227.0,227.0,227.0,227.0,227.0,227.0,227.0,227.0,227.0,227.0,227.0
mean,71.85022,219254.881057,0.635639,0.646665,5.45815,-6.51667,0.678414,0.079576,0.37506,0.02889,0.177797,0.472441,119.466361
std,10.2411,60483.492317,0.155123,0.15915,3.760738,2.099543,0.468117,0.0851,0.300084,0.137225,0.121366,0.193902,26.154889
min,13.0,96947.0,0.271,0.236,0.0,-15.073,0.0,0.0246,0.000307,0.0,0.0297,0.0385,61.311
25%,68.0,170554.5,0.552,0.5395,2.0,-7.83,0.0,0.0338,0.065,0.0,0.101,0.3245,95.4575
50%,72.0,222462.0,0.634,0.655,6.0,-6.346,1.0,0.0421,0.393,2e-06,0.127,0.462,122.925
75%,78.0,265611.0,0.746,0.7635,9.0,-5.0805,1.0,0.0812,0.6305,0.00017,0.219,0.595,137.952
max,96.0,383639.0,0.948,0.972,11.0,-0.424,1.0,0.491,0.94,0.901,0.79,0.972,187.629


In [None]:
# Number of missing values in each column.
df.isna().sum()

In [10]:
# Count rows that are exact copies (all columns equal) of an earlier row.
# NOTE(review): this cell only counts the duplicates; it does not remove them.
df.duplicated().sum()

np.int64(15)

In [11]:
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
# Audio features explored against popularity in the plots below.
features = [
    'Energy',
    'Valence',
    'Danceability',
    'Loudness',
    'Acousticness',
]

In [None]:
# One scatter plot per feature: feature value on the x-axis, track popularity on the y-axis.
for feature in features:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.scatterplot(data=df, x=feature, y='Popularity', ax=ax)
    ax.set_title(f'Popularity Vs {feature}')
    plt.show()

In [14]:
# Keep only the numeric columns -- int64 (whole numbers) and float64 (decimals) --
# so that correlations can be computed on them.
numeric_data = df.select_dtypes(include=['int64', 'float64'])
numeric_columns = numeric_data.columns

In [15]:
# Pairwise Pearson correlation between all numeric columns.
corr_matrix = numeric_data.corr(method='pearson')

In [None]:
# Annotated heatmap of the correlation matrix (values shown with two decimals).
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [17]:
#Popularity has a moderate positive correlation with loudness (0.31) and danceability (0.25), indicating that louder and more danceable tracks tend to be more popular.
#There is a moderate negative correlation between popularity and acousticness (-0.43), suggesting that tracks with higher acousticness are generally less popular.
#Energy also has a positive correlation with popularity (0.25).

In [None]:
# Distribution of each explored feature: histogram with a KDE curve overlaid.
for feature in features:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.histplot(df[feature], kde=True, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    plt.show()

In [19]:
#Feature selection: Energy, Valence, Danceability, Loudness, Acousticness, Tempo, Speechiness, Liveness
#These features capture various audio characteristics that influence the popularity of music tracks.

In [20]:
#Model Training
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [21]:
# Select the model inputs (audio features) and the target variable (Popularity).
features = ['Energy', 'Valence', 'Danceability', 'Loudness', 'Acousticness', 'Tempo', 'Speechiness', 'Liveness']

# The duplicate check earlier in the notebook finds fully identical rows. Drop them before
# building X and y so the same track cannot end up in both the training and the test split,
# which would leak test data into training and make the evaluation look better than it is.
# `df` itself is left untouched so the exploratory cells keep working on the original frame.
model_df = df.drop_duplicates()
X = model_df[features]
y = model_df['Popularity']

In [22]:
# Hold out 20% of the rows for testing; the remaining 80% are used to train the model.
# A fixed random_state makes the shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [23]:
# Standardise the features: the scaler is fitted on the training split only,
# then the same fitted transform is applied to both the training and the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [24]:
# Hyper-parameter grid searched for the Random Forest regressor.
# max_features: 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, where every
# candidate using it fails to fit. For RandomForestRegressor 'auto' meant "use all features",
# which is spelled 1.0 (100% of the features) -- so 1.0 keeps the intended candidate.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_features': [1.0, 'sqrt', 'log2'],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [None]:
# Exhaustive grid search over param_grid with 5-fold cross-validation.
# - n_jobs=-1 runs the many independent fits in parallel on all available cores; the fixed
#   random_state on the estimator keeps the results reproducible.
# - verbose=1 prints a short summary instead of one log line per fit, which would
#   otherwise flood the cell output for a grid of this size.
# - refit=True retrains the best parameter combination on the whole training set.
grid_search_rf = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
    refit=True,
    n_jobs=-1,
    verbose=1,
)
grid_search_rf.fit(X_train_scaled, y_train)

# Take the refitted best model and predict popularity for the held-out test rows.
best_rf_model = grid_search_rf.best_estimator_
y_pred_best_rf = best_rf_model.predict(X_test_scaled)

In [None]:
# Predict popularity for the test rows with the best model found by the grid search.
y_pred_best_rf = best_rf_model.predict(X_test_scaled)

# Scatter of actual vs. predicted popularity; the red diagonal is the y = x line,
# i.e. where a perfect prediction would fall, drawn across the range of actual values.
diag_lo, diag_hi = min(y_test), max(y_test)
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred_best_rf, alpha=0.7)
ax.plot([diag_lo, diag_hi], [diag_lo, diag_hi], color='red', linewidth=2)
ax.set_xlabel('Actual Popularity')
ax.set_ylabel('Predicted Popularity')
ax.set_title('Actual Vs Predicted Popularity (Best Random Forest Model)')
plt.show()