In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from src.classes.EagleEye import EagleEye

---

## Importing and Exploring the Data

In [2]:
# Read the raw track dataset from disk into a DataFrame.
# Point `file_path` at another location if the CSV lives elsewhere.
file_path = 'Resources/dataset.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows.
data.head(5)


Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [3]:
# (rows, columns) of the raw frame as loaded from the CSV.
data.shape

(114000, 21)

In [4]:
# List every column label of the raw frame, one per line.
print(*data.columns, sep='\n')

Unnamed: 0
track_id
artists
album_name
track_name
popularity
duration_ms
explicit
danceability
energy
key
loudness
mode
speechiness
acousticness
instrumentalness
liveness
valence
tempo
time_signature
track_genre


In [5]:
# Show the dtype pandas assigned to each raw column.
print(data.dtypes)

Unnamed: 0            int64
track_id             object
artists              object
album_name           object
track_name           object
popularity            int64
duration_ms           int64
explicit               bool
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
time_signature        int64
track_genre          object
dtype: object



---

## Cleaning the Data


In [6]:
# The 'Unnamed: 0' column appears to be unused (in the preview above it just
# repeats the row index), so the cleaned frame is built without it.
data_cleaned = data.drop('Unnamed: 0', axis='columns')

In [7]:
# Flag each row that has at least one missing value, then pull those rows out
# so they can be inspected before deciding how to handle them.
has_missing = data_cleaned.isna().any(axis='columns')
null_rows = data_cleaned.loc[has_missing]

# Display the rows that contain nulls.
null_rows

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
65900,1kR4gIb7nGxHPI3D2ifs59,,,,0,0,False,0.501,0.583,7,-9.46,0,0.0605,0.69,0.00396,0.0747,0.734,138.391,4,k-pop


In [8]:
# Remove every row that contains at least one missing value
# (the check above found a single such row).
data_cleaned = data_cleaned.dropna()

In [9]:
# Per-column tally of missing values; every entry should now be zero.
missing_values_count = data_cleaned.isna().sum(axis=0)
missing_values_count

track_id            0
artists             0
album_name          0
track_name          0
popularity          0
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
track_genre         0
dtype: int64

In [10]:
# A song must not be represented more than once in the data, so rows that
# repeat a track_id are removed; the first occurrence of each id is the one kept
# (pandas' default for `keep`).
data_cleaned.drop_duplicates(subset=['track_id'], inplace=True)

In [11]:
# (rows, columns) remaining after dropping nulls and duplicate track_ids.
data_cleaned.shape

(89740, 20)


---

## Scaling the Data


In [12]:
# Survey the numeric columns before scaling: for every int64/float64 column
# record its (minimum, maximum, dtype).
ranges = {
    column: (
        data_cleaned[column].min(),
        data_cleaned[column].max(),
        data_cleaned[column].dtype,
    )
    for column in data_cleaned.columns
    if data_cleaned[column].dtype in ['float64', 'int64']
}

# Report one line per numeric column.
for column, (min_val, max_val, data_type) in ranges.items():
    print(f"Data Type: {data_type}, Column: {column}, Range: ({min_val}, {max_val})")

Data Type: int64, Column: popularity, Range: (0, 100)
Data Type: int64, Column: duration_ms, Range: (8586, 5237295)
Data Type: float64, Column: danceability, Range: (0.0, 0.985)
Data Type: float64, Column: energy, Range: (0.0, 1.0)
Data Type: int64, Column: key, Range: (0, 11)
Data Type: float64, Column: loudness, Range: (-49.531, 4.532)
Data Type: int64, Column: mode, Range: (0, 1)
Data Type: float64, Column: speechiness, Range: (0.0, 0.965)
Data Type: float64, Column: acousticness, Range: (0.0, 0.996)
Data Type: float64, Column: instrumentalness, Range: (0.0, 1.0)
Data Type: float64, Column: liveness, Range: (0.0, 1.0)
Data Type: float64, Column: valence, Range: (0.0, 0.995)
Data Type: float64, Column: tempo, Range: (0.0, 243.372)
Data Type: int64, Column: time_signature, Range: (0, 5)


In [13]:
# Features whose observed ranges (see the survey above) extend beyond 0-1
# and therefore get min-max scaled.
columns_to_scale = ['duration_ms', 'key', 'loudness', 'tempo', 'time_signature']

# Learn each column's min/max first, then apply the rescaling and write the
# scaled values back over the original columns.
scaler = MinMaxScaler()
scaler.fit(data_cleaned[columns_to_scale])
data_cleaned[columns_to_scale] = scaler.transform(data_cleaned[columns_to_scale])

# Preview the scaled frame.
data_cleaned.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,0.042473,False,0.676,0.461,0.090909,0.791392,0,0.143,0.0322,1e-06,0.358,0.715,0.361245,0.8,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,0.026971,False,0.42,0.166,0.090909,0.597377,1,0.0763,0.924,6e-06,0.101,0.267,0.318397,0.8,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,0.038679,False,0.438,0.359,0.0,0.736123,1,0.0557,0.21,0.0,0.117,0.12,0.313643,0.8,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,0.036978,False,0.266,0.0596,0.0,0.573701,1,0.0363,0.905,7.1e-05,0.132,0.143,0.746758,0.6,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,0.036389,False,0.618,0.443,0.181818,0.737103,1,0.0526,0.469,0.0,0.0829,0.167,0.492863,0.8,acoustic


In [14]:
# Re-run the range survey to confirm the scaling took effect.
ranges = {}

# Walk the frame column by column, skipping anything that is not int64/float64.
for column, series in data_cleaned.items():
    if series.dtype not in ['float64', 'int64']:
        continue
    # Keep (minimum, maximum, dtype) for the numeric column.
    ranges[column] = (series.min(), series.max(), series.dtype)

# Report one line per numeric column.
for column, (min_val, max_val, data_type) in ranges.items():
    print(f"Data Type: {data_type}, Column: {column}, Range: ({min_val}, {max_val})")

Data Type: int64, Column: popularity, Range: (0, 100)
Data Type: float64, Column: duration_ms, Range: (0.0, 1.0)
Data Type: float64, Column: danceability, Range: (0.0, 0.985)
Data Type: float64, Column: energy, Range: (0.0, 1.0)
Data Type: float64, Column: key, Range: (0.0, 1.0)
Data Type: float64, Column: loudness, Range: (0.0, 0.9999999999999999)
Data Type: int64, Column: mode, Range: (0, 1)
Data Type: float64, Column: speechiness, Range: (0.0, 0.965)
Data Type: float64, Column: acousticness, Range: (0.0, 0.996)
Data Type: float64, Column: instrumentalness, Range: (0.0, 1.0)
Data Type: float64, Column: liveness, Range: (0.0, 1.0)
Data Type: float64, Column: valence, Range: (0.0, 0.995)
Data Type: float64, Column: tempo, Range: (0.0, 1.0)
Data Type: float64, Column: time_signature, Range: (0.0, 1.0)



---

## Adding Dummy Variables


In [15]:
# Show the column dtypes after cleaning and scaling (the scaled columns are now float64).
print(data_cleaned.dtypes)

track_id             object
artists              object
album_name           object
track_name           object
popularity            int64
duration_ms         float64
explicit               bool
danceability        float64
energy              float64
key                 float64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
time_signature      float64
track_genre          object
dtype: object


In [16]:
# Encode the boolean 'explicit' column as integers: False -> 0, True -> 1.
explicit_as_int = data_cleaned['explicit'].astype(int)
data_cleaned['explicit'] = explicit_as_int

In [17]:
# Distinct genre labels present in the cleaned data.
unique_values = data_cleaned['track_genre'].unique()

# Report how many distinct genres there are, then list them.
genre_total = len(unique_values)
print(f"Unique values in genre: {genre_total}")
print(unique_values)

# Observation: there are too many genres that are equally represented.

Unique values in genre: 113
['acoustic' 'afrobeat' 'alt-rock' 'alternative' 'ambient' 'anime'
 'black-metal' 'bluegrass' 'blues' 'brazil' 'breakbeat' 'british'
 'cantopop' 'chicago-house' 'children' 'chill' 'classical' 'club' 'comedy'
 'country' 'dance' 'dancehall' 'death-metal' 'deep-house' 'detroit-techno'
 'disco' 'disney' 'drum-and-bass' 'dub' 'dubstep' 'edm' 'electro'
 'electronic' 'emo' 'folk' 'forro' 'french' 'funk' 'garage' 'german'
 'gospel' 'goth' 'grindcore' 'groove' 'grunge' 'guitar' 'happy'
 'hard-rock' 'hardcore' 'hardstyle' 'heavy-metal' 'hip-hop' 'honky-tonk'
 'house' 'idm' 'indian' 'indie-pop' 'indie' 'industrial' 'iranian'
 'j-dance' 'j-idol' 'j-pop' 'j-rock' 'jazz' 'k-pop' 'kids' 'latin'
 'latino' 'malay' 'mandopop' 'metal' 'metalcore' 'minimal-techno' 'mpb'
 'new-age' 'opera' 'pagode' 'party' 'piano' 'pop-film' 'pop' 'power-pop'
 'progressive-house' 'psych-rock' 'punk-rock' 'punk' 'r-n-b' 'reggae'
 'reggaeton' 'rock-n-roll' 'rock' 'rockabilly' 'romance' 'sad' 'salsa

In [18]:
# Number of remaining tracks per genre, most frequent genre first.
genre_counts = data_cleaned['track_genre'].value_counts()

# Print a header line followed by the counts.
print("Value counts for track_genre column:", genre_counts, sep="\n")

Value counts for track_genre column:
track_genre
acoustic     1000
alt-rock      999
tango         999
ambient       999
afrobeat      999
             ... 
metal         232
punk          226
house         210
indie         134
reggaeton      74
Name: count, Length: 113, dtype: int64



---

## Saving the Cleaned Data


In [19]:
# Output locations: the first file keeps the text/object columns, the second
# is written later for the model.
file_path, file_path_2 = (
    'Resources/cleaned_data_with_objects.csv',
    'Resources/data_for_model.csv',
)

# Write the cleaned frame (object columns still included) without the row index.
data_cleaned.to_csv(path_or_buf=file_path, index=False)

In [20]:
# Identifier and free-text columns that are removed so only the numeric
# columns remain in the frame written for the model.
columns_to_drop = ['track_genre', 'track_id', 'artists', 'album_name', 'track_name']

# Reassign instead of dropping in place, and ignore labels that are already
# gone, so re-running this cell leaves the frame unchanged rather than raising
# a KeyError for the missing columns.
data_cleaned = data_cleaned.drop(columns=columns_to_drop, errors='ignore')

# Preview the model-ready frame.
data_cleaned.head()

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,73,0.042473,0,0.676,0.461,0.090909,0.791392,0,0.143,0.0322,1e-06,0.358,0.715,0.361245,0.8
1,55,0.026971,0,0.42,0.166,0.090909,0.597377,1,0.0763,0.924,6e-06,0.101,0.267,0.318397,0.8
2,57,0.038679,0,0.438,0.359,0.0,0.736123,1,0.0557,0.21,0.0,0.117,0.12,0.313643,0.8
3,71,0.036978,0,0.266,0.0596,0.0,0.573701,1,0.0363,0.905,7.1e-05,0.132,0.143,0.746758,0.6
4,82,0.036389,0,0.618,0.443,0.181818,0.737103,1,0.0526,0.469,0.0,0.0829,0.167,0.492863,0.8


In [21]:
# Write the numeric-only frame to the model input CSV, without the row index.
data_cleaned.to_csv(file_path_2, index=False)

In [22]:
# Training the Model
# Build an EagleEye instance from the model-ready CSV; 'popularity' is passed as the
# second argument (presumably the target column — TODO confirm against EagleEye).
eagle_eye = EagleEye(file_path_2, 'popularity')
eagle_eye.train_model()
# NOTE(review): in the recorded run this call fails with
# "ValueError: multi_class must be in ('ovo', 'ovr')" right after the
# "Calculating roc_auc_score..." log line, so `evaluation` is never printed.
# 'popularity' is an integer column spanning 0-100, so a ROC AUC metric likely
# does not apply to this target; the fix presumably belongs in
# EagleEye.evaluate_model — TODO confirm.
evaluation = eagle_eye.evaluate_model()
print(evaluation)

Splitting data...
Training model...
Evaluating model...
Making predictions...
Calculating mean_squared_error...
Calculating mean_absolute_error...
Calculating r2_score...
Calculating roc_auc_score...


ValueError: multi_class must be in ('ovo', 'ovr')