In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from src.classes.EagleEye import EagleEye
from sklearn.metrics import classification_report



---

## Importing and Exploring the data

In [2]:
# Read the raw track dataset from disk.
# Replace the path below if your copy of the CSV lives somewhere else.
file_path = 'Resources/dataset.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
data.head(n=5)


Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [3]:
# (rows, columns) of the raw frame as loaded from the CSV
data.shape

(114000, 21)

In [4]:
# List every column name of the raw frame, one per line
print(*data.columns, sep='\n')

Unnamed: 0
track_id
artists
album_name
track_name
popularity
duration_ms
explicit
danceability
energy
key
loudness
mode
speechiness
acousticness
instrumentalness
liveness
valence
tempo
time_signature
track_genre


In [5]:
# Show the dtype pandas inferred for each raw column
print(data.dtypes)

Unnamed: 0            int64
track_id             object
artists              object
album_name           object
track_name           object
popularity            int64
duration_ms           int64
explicit               bool
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
time_signature        int64
track_genre          object
dtype: object



---

## Cleaning the Data


In [6]:
# The 'Unnamed: 0' column appears to be unused, so it is dropped.
#
# Based on some testing, 'explicit' and 'time_signature' add very little value,
# so they are dropped in the same call. Feature importances from that test run:
#
#   --- Feature Importances ---
#                Feature  Importance
#   0         popularity    0.134362
#   7        speechiness    0.096100
#   8       acousticness    0.089943
#   3       danceability    0.084025
#   1        duration_ms    0.081759
#   9   instrumentalness    0.078997
#   11           valence    0.076097
#   4             energy    0.074002
#   6           loudness    0.073301
#   10          liveness    0.068000
#   12             tempo    0.066104
#   5                key    0.051305
#   13    time_signature    0.018006
#   2           explicit    0.007998
data_cleaned = data.drop(columns=['Unnamed: 0', 'explicit', 'time_signature'])

In [7]:
# Collect every row that has a null in at least one column
has_null = data_cleaned.isnull().any(axis='columns')
null_rows = data_cleaned.loc[has_null]

# Display the affected rows
null_rows

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_genre
65900,1kR4gIb7nGxHPI3D2ifs59,,,,0,0,0.501,0.583,7,-9.46,0,0.0605,0.69,0.00396,0.0747,0.734,138.391,k-pop


In [8]:
# Remove every row that contains at least one null value
data_cleaned = data_cleaned.dropna()

In [9]:
# Per-column count of remaining nulls; every entry should now be zero
null_flags = data_cleaned.isnull()
missing_values_count = null_flags.sum(axis=0)
missing_values_count

track_id            0
artists             0
album_name          0
track_name          0
popularity          0
duration_ms         0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
track_genre         0
dtype: int64

In [10]:
# Get the distinct genre labels present in the data
genre_column = data_cleaned['track_genre']
unique_values = genre_column.unique()

print(unique_values)

['acoustic' 'afrobeat' 'alt-rock' 'alternative' 'ambient' 'anime'
 'black-metal' 'bluegrass' 'blues' 'brazil' 'breakbeat' 'british'
 'cantopop' 'chicago-house' 'children' 'chill' 'classical' 'club' 'comedy'
 'country' 'dance' 'dancehall' 'death-metal' 'deep-house' 'detroit-techno'
 'disco' 'disney' 'drum-and-bass' 'dub' 'dubstep' 'edm' 'electro'
 'electronic' 'emo' 'folk' 'forro' 'french' 'funk' 'garage' 'german'
 'gospel' 'goth' 'grindcore' 'groove' 'grunge' 'guitar' 'happy'
 'hard-rock' 'hardcore' 'hardstyle' 'heavy-metal' 'hip-hop' 'honky-tonk'
 'house' 'idm' 'indian' 'indie-pop' 'indie' 'industrial' 'iranian'
 'j-dance' 'j-idol' 'j-pop' 'j-rock' 'jazz' 'k-pop' 'kids' 'latin'
 'latino' 'malay' 'mandopop' 'metal' 'metalcore' 'minimal-techno' 'mpb'
 'new-age' 'opera' 'pagode' 'party' 'piano' 'pop-film' 'pop' 'power-pop'
 'progressive-house' 'psych-rock' 'punk-rock' 'punk' 'r-n-b' 'reggae'
 'reggaeton' 'rock-n-roll' 'rock' 'rockabilly' 'romance' 'sad' 'salsa'
 'samba' 'sertanejo' 'show

In [11]:
# Number of distinct genres before any merging
len(unique_values)

114

In [12]:
# We need to reduce the number of genres. The table below groups the original
# fine-grained genres under a smaller set of coarse genres.
genre_groups = {
    'acoustic': ['acoustic', 'singer-songwriter', 'songwriter'],
    'alternative': ['alt-rock', 'alternative', 'emo', 'goth', 'grunge', 'indie-pop', 'indie'],
    'blues': ['blues'],
    'children': ['children', 'kids'],
    'classical': ['classical', 'opera', 'piano'],
    'country': ['country', 'honky-tonk'],
    'electronic': [
        'ambient', 'breakbeat', 'chicago-house', 'chill', 'club', 'dance',
        'deep-house', 'detroit-techno', 'disco', 'drum-and-bass', 'dubstep',
        'edm', 'electro', 'electronic', 'garage', 'hardstyle', 'house', 'idm',
        'industrial', 'minimal-techno', 'progressive-house', 'techno',
        'trance', 'trip-hop',
    ],
    'folk': ['bluegrass', 'folk'],
    'funk': ['funk', 'groove', 'soul'],
    'hip-hop': ['hip-hop'],
    'jazz': ['jazz'],
    'latin': [
        'brazil', 'forro', 'latin', 'latino', 'mpb', 'pagode', 'reggaeton',
        'salsa', 'samba', 'sertanejo', 'tango',
    ],
    'metal': ['black-metal', 'death-metal', 'grindcore', 'heavy-metal', 'metal', 'metalcore'],
    'miscellaneous': [
        'anime', 'comedy', 'disney', 'guitar', 'happy', 'new-age', 'party',
        'romance', 'sad', 'show-tunes', 'sleep', 'study',
    ],
    'pop': ['cantopop', 'j-pop', 'k-pop', 'mandopop', 'pop-film', 'pop', 'power-pop', 'synth-pop'],
    'punk': ['hardcore', 'punk-rock', 'punk'],
    'r&b': ['r-n-b'],
    'reggae': ['dancehall', 'dub', 'reggae', 'ska'],
    'regional': ['british', 'french', 'german', 'swedish'],
    'religious': ['gospel'],
    'rock': ['hard-rock', 'j-rock', 'psych-rock', 'rock-n-roll', 'rock', 'rockabilly'],
    'world': [
        'afrobeat', 'indian', 'iranian', 'j-dance', 'j-idol', 'malay',
        'spanish', 'turkish', 'world-music',
    ],
}

# Invert the grouping into the lookup used for mapping: original genre -> coarse genre
genre_mapping = {
    fine_genre: coarse_genre
    for coarse_genre, fine_genres in genre_groups.items()
    for fine_genre in fine_genres
}



In [13]:
# Apply the genre mapping
mapped_genres = data_cleaned['track_genre'].map(genre_mapping)

# Series.map() returns NaN for any value that is not a key of the dict, so a
# genre missing from genre_mapping would silently lose its label. Fail loudly
# instead. This check also trips if the cell is re-run after the column has
# already been mapped, because coarse labels such as 'world' are not keys.
unmapped_genres = data_cleaned.loc[mapped_genres.isnull(), 'track_genre'].unique()
if len(unmapped_genres) > 0:
    raise ValueError(
        f"No genre_mapping entry for track_genre value(s): {list(unmapped_genres)}"
    )

data_cleaned['track_genre'] = mapped_genres

In [14]:
# Show the coarse genre labels now present in track_genre after the mapping.
# For reference, each coarse genre and the original genres folded into it:
# acoustic: acoustic, singer-songwriter, songwriter
# alternative: alt-rock, alternative, emo, goth, grunge, indie, indie-pop
# blues: blues
# children: children, kids
# classical: classical, opera, piano
# country: country, honky-tonk
# electronic: ambient, breakbeat, chicago-house, chill, club, dance, deep-house, detroit-techno,
#            disco, drum-and-bass, dubstep, edm, electro, electronic, garage, hardstyle, house,
#            idm, industrial, minimal-techno, progressive-house, techno, trance, trip-hop
# folk: bluegrass, folk
# funk: funk, groove, soul
# hip-hop: hip-hop
# jazz: jazz
# latin: brazil, forro, latin, latino, mpb, pagode, reggaeton, salsa, samba, sertanejo, tango
# metal: black-metal, death-metal, grindcore, heavy-metal, metal, metalcore
# miscellaneous: anime, comedy, disney, guitar, happy, new-age, party, romance, sad, show-tunes,
#                sleep, study
# pop: cantopop, j-pop, k-pop, mandopop, pop, pop-film, power-pop, synth-pop
# punk: hardcore, punk, punk-rock
# r&b: r-n-b
# reggae: dancehall, dub, reggae, ska
# regional: british, french, german, swedish
# religious: gospel
# rock: hard-rock, j-rock, psych-rock, rock, rock-n-roll, rockabilly
# world: afrobeat, indian, iranian, j-dance, j-idol, malay, spanish, turkish, world-music
unique_values = data_cleaned['track_genre'].unique()
print(unique_values)

['acoustic' 'world' 'alternative' 'electronic' 'miscellaneous' 'metal'
 'folk' 'blues' 'latin' 'regional' 'pop' 'children' 'classical' 'country'
 'reggae' 'funk' 'religious' 'rock' 'punk' 'hip-hop' 'jazz' 'r&b']


In [15]:
# Number of genre categories after the mapping (compare with the count taken before it)
len(unique_values)

22

In [16]:
# Count how often each (track_id, track_genre) pair occurs
pair_sizes = data_cleaned.groupby(['track_id', 'track_genre']).size()
grouped_df = pair_sizes.reset_index(name='count')

# Sum those pair counts per genre to get the number of rows in each genre
genre_count = grouped_df.groupby('track_genre', as_index=False)['count'].sum()

print(genre_count)


      track_genre  count
0        acoustic   3000
1     alternative   7000
2           blues   1000
3        children   2000
4       classical   3000
5         country   2000
6      electronic  24000
7            folk   2000
8            funk   3000
9         hip-hop   1000
10           jazz   1000
11          latin  11000
12          metal   6000
13  miscellaneous  12000
14            pop   7999
15           punk   3000
16            r&b   1000
17         reggae   4000
18       regional   4000
19      religious   1000
20           rock   6000
21          world   9000


In [17]:
# (rows, columns) before duplicate (track_id, track_genre) rows are removed
data_cleaned.shape

(113999, 18)

In [18]:
# Since the genres were combined, many songs are now listed more than once
# under the same genre. Locate that redundant data so it can be deleted.
pair_keys = ['track_id', 'track_genre']
grouped_df = data_cleaned.groupby(pair_keys).size().reset_index(name='count')

# Keep only the (track_id, track_genre) pairs that occur more than once
is_repeated = grouped_df['count'] > 1
filtered_df = grouped_df.loc[is_repeated]

# Inner-join the repeated pairs back onto the full table to get the duplicate rows
duplicate_rows = pd.merge(
    data_cleaned,
    filtered_df[pair_keys],
    how='inner',
    on=pair_keys,
)

duplicate_rows.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,acoustic
1,62EwQLDw0dprDzuLwQ0sH1,KT Tunstall,Chill Christmas Dinner,Lonely This Christmas,0,257493,0.409,0.153,6,-10.74,0,0.0306,0.939,2.6e-05,0.108,0.18,85.262,acoustic
2,0adVktNVJXPZfhQe6B1NO5,KT Tunstall,sadsadchristmas,Lonely This Christmas,0,257493,0.409,0.153,6,-10.74,0,0.0306,0.939,2.6e-05,0.108,0.18,85.262,acoustic
3,3ILmwMefYZoQh5Cf5jeuUQ,Motohiro Hata,Documentary,透明だった世界,61,232360,0.373,0.914,0,-4.185,1,0.0565,0.076,0.0,0.669,0.56,168.21,acoustic
4,2kMrCPZ0o5gErBPLCRgoli,Gabrielle Aplin,Mellow Adult Pop,Heavy Heart,0,235173,0.451,0.7,10,-6.597,0,0.0431,0.455,0.00233,0.405,0.37,150.055,acoustic


In [19]:
# Keep only the first row of every (track_id, track_genre) pair
pair_keys = ['track_id', 'track_genre']
data_cleaned = data_cleaned.drop_duplicates(subset=pair_keys, keep='first')
# NOTE(review): a track_id may still appear under several *different* genres
# after this step — confirm whether that is intended for the classifier.

# Recount the occurrences of each (track_id, track_genre) pair
pair_sizes = data_cleaned.groupby(pair_keys).size()
grouped_df = pair_sizes.reset_index(name='count')

# Sum the pair counts per genre to see how many rows each genre has left
genre_count = grouped_df.groupby('track_genre', as_index=False)['count'].sum()

print(genre_count)


      track_genre  count
0        acoustic   1984
1     alternative   5558
2           blues    998
3        children   1984
4       classical   2874
5         country   1981
6      electronic  21006
7            folk   1992
8            funk   2780
9         hip-hop    991
10           jazz    999
11          latin   8836
12          metal   5624
13  miscellaneous  11783
14            pop   7492
15           punk   2307
16            r&b   1000
17         reggae   3954
18       regional   3960
19      religious    999
20           rock   5398
21          world   8971


In [20]:
# (rows, columns) after duplicate (track_id, track_genre) rows were removed
data_cleaned.shape

(103471, 18)


---

## Scaling the Data


In [21]:
# Inspect the value range of every float64/int64 column before scaling.
numeric_columns = data_cleaned.select_dtypes(include=['float64', 'int64'])

# column name -> (minimum, maximum, dtype)
ranges = {
    name: (values.min(), values.max(), values.dtype)
    for name, values in numeric_columns.items()
}

# Report one line per numeric column
for name, (lowest, highest, kind) in ranges.items():
    print(f"Data Type: {kind}, Column: {name}, Range: ({lowest}, {highest})")

Data Type: int64, Column: popularity, Range: (0, 100)
Data Type: int64, Column: duration_ms, Range: (8586, 5237295)
Data Type: float64, Column: danceability, Range: (0.0, 0.985)
Data Type: float64, Column: energy, Range: (0.0, 1.0)
Data Type: int64, Column: key, Range: (0, 11)
Data Type: float64, Column: loudness, Range: (-49.531, 4.532)
Data Type: int64, Column: mode, Range: (0, 1)
Data Type: float64, Column: speechiness, Range: (0.0, 0.965)
Data Type: float64, Column: acousticness, Range: (0.0, 0.996)
Data Type: float64, Column: instrumentalness, Range: (0.0, 1.0)
Data Type: float64, Column: liveness, Range: (0.0, 1.0)
Data Type: float64, Column: valence, Range: (0.0, 0.995)
Data Type: float64, Column: tempo, Range: (0.0, 243.372)


In [22]:
# DISABLED EXPERIMENT: everything in this cell is commented out and has no effect.
# Create a new column for the interaction between danceability and speechiness
#data_cleaned['danceability_speechiness_interaction'] = data_cleaned['danceability'] * data_cleaned['speechiness']

# Display the first few rows to verify
#data_cleaned[['danceability', 'speechiness', 'danceability_speechiness_interaction']].head()

In [23]:
# Numeric feature columns to rescale
columns_to_scale = [
    'popularity', 'duration_ms', 'danceability', 'energy', 'key', 'loudness',
    'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
    'valence', 'tempo',
]

# Fit a MinMaxScaler on those columns and write the scaled values back in place.
# NOTE(review): the scaler is fitted on the whole table, before EagleEye splits
# the data into train/test, so test rows influence the min/max — consider
# fitting on the training split only.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data_cleaned[columns_to_scale])
data_cleaned[columns_to_scale] = scaled_values

data_cleaned.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,0.73,0.042473,0.686294,0.461,0.090909,0.791392,0.0,0.148187,0.032329,1e-06,0.358,0.718593,0.361245,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,0.55,0.026971,0.426396,0.166,0.090909,0.597377,1.0,0.079067,0.927711,6e-06,0.101,0.268342,0.318397,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,0.57,0.038679,0.44467,0.359,0.0,0.736123,1.0,0.05772,0.210843,0.0,0.117,0.120603,0.313643,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,0.71,0.036978,0.270051,0.0596,0.0,0.573701,1.0,0.037617,0.908635,7.1e-05,0.132,0.143719,0.746758,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,0.82,0.036389,0.627411,0.443,0.181818,0.737103,1.0,0.054508,0.470884,0.0,0.0829,0.167839,0.492863,acoustic


In [24]:
# Check the value ranges again to confirm the columns are now scaled.
numeric_columns = data_cleaned.select_dtypes(include=['float64', 'int64'])

# column name -> (minimum, maximum, dtype)
ranges = {
    name: (values.min(), values.max(), values.dtype)
    for name, values in numeric_columns.items()
}

# Report one line per numeric column
for name, (lowest, highest, kind) in ranges.items():
    print(f"Data Type: {kind}, Column: {name}, Range: ({lowest}, {highest})")

Data Type: float64, Column: popularity, Range: (0.0, 1.0)
Data Type: float64, Column: duration_ms, Range: (0.0, 1.0)
Data Type: float64, Column: danceability, Range: (0.0, 0.9999999999999999)
Data Type: float64, Column: energy, Range: (0.0, 1.0)
Data Type: float64, Column: key, Range: (0.0, 1.0)
Data Type: float64, Column: loudness, Range: (0.0, 0.9999999999999999)
Data Type: float64, Column: mode, Range: (0.0, 1.0)
Data Type: float64, Column: speechiness, Range: (0.0, 1.0)
Data Type: float64, Column: acousticness, Range: (0.0, 1.0)
Data Type: float64, Column: instrumentalness, Range: (0.0, 1.0)
Data Type: float64, Column: liveness, Range: (0.0, 1.0)
Data Type: float64, Column: valence, Range: (0.0, 0.9999999999999999)
Data Type: float64, Column: tempo, Range: (0.0, 1.0)



---

## Grooming Data


In [25]:
# Show the column dtypes of the cleaned, scaled frame
print(data_cleaned.dtypes)

track_id             object
artists              object
album_name           object
track_name           object
popularity          float64
duration_ms         float64
danceability        float64
energy              float64
key                 float64
loudness            float64
mode                float64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
track_genre          object
dtype: object


In [26]:
# Distinct genre labels remaining in the cleaned data
unique_values = data_cleaned['track_genre'].unique()
genre_total = len(unique_values)

# Report how many there are, then list them
print(f"Unique values in genre: {genre_total}")
print(unique_values)

# Author's note: there are too many genres that are equally represented

Unique values in genre: 22
['acoustic' 'world' 'alternative' 'electronic' 'miscellaneous' 'metal'
 'folk' 'blues' 'latin' 'regional' 'pop' 'children' 'classical' 'country'
 'reggae' 'funk' 'religious' 'rock' 'punk' 'hip-hop' 'jazz' 'r&b']


In [27]:
# Number of rows per genre, most frequent first
genre_counts = data_cleaned['track_genre'].value_counts()

# Print a heading followed by the counts
print("Value counts for track_genre column:", genre_counts, sep="\n")

Value counts for track_genre column:
track_genre
electronic       21006
miscellaneous    11783
world             8971
latin             8836
pop               7492
metal             5624
alternative       5558
rock              5398
regional          3960
reggae            3954
classical         2874
funk              2780
punk              2307
folk              1992
acoustic          1984
children          1984
country           1981
r&b               1000
jazz               999
religious          999
blues              998
hip-hop            991
Name: count, dtype: int64



---

## Saving the cleaned Data


In [28]:
# Output locations: file_path_2 is used further down for the model-ready table,
# file_path receives the cleaned table that still has its text columns.
file_path_2 = 'Resources/data_for_model.csv'
file_path = 'Resources/cleaned_data_with_objects.csv'

# Write the cleaned frame without the index column
data_cleaned.to_csv(path_or_buf=file_path, index=False)

In [29]:
# Columns to drop: track_id, artists, album_name, track_name and mode.
# 'track_genre' is kept on purpose: it is the target column passed to EagleEye.
# NOTE(review): 'mode' is dropped without a recorded reason — confirm it is intended.
columns_to_drop = ['track_id', 'artists', 'album_name', 'track_name', 'mode']

# Reassign instead of using inplace=True, and ignore labels that are already
# gone, so re-running this cell is a no-op instead of raising KeyError.
data_cleaned = data_cleaned.drop(columns=columns_to_drop, errors='ignore')
data_cleaned.head()


Unnamed: 0,popularity,duration_ms,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_genre
0,0.73,0.042473,0.686294,0.461,0.090909,0.791392,0.148187,0.032329,1e-06,0.358,0.718593,0.361245,acoustic
1,0.55,0.026971,0.426396,0.166,0.090909,0.597377,0.079067,0.927711,6e-06,0.101,0.268342,0.318397,acoustic
2,0.57,0.038679,0.44467,0.359,0.0,0.736123,0.05772,0.210843,0.0,0.117,0.120603,0.313643,acoustic
3,0.71,0.036978,0.270051,0.0596,0.0,0.573701,0.037617,0.908635,7.1e-05,0.132,0.143719,0.746758,acoustic
4,0.82,0.036389,0.627411,0.443,0.181818,0.737103,0.054508,0.470884,0.0,0.0829,0.167839,0.492863,acoustic


In [30]:
# Write the model-ready table (numeric features plus track_genre) without the index column
data_cleaned.to_csv(file_path_2, index=False)

In [31]:
# Splitting the data, removing outliers, and sampling the data.
# EagleEye is a project class (src.classes.EagleEye). It is given the path of the
# model-ready CSV and the name of the target column. Per its log output it splits
# the data, removes z-score outliers from the training set and applies SMOTE.
eagle_eye = EagleEye(file_path_2, 'track_genre')
# Preview the feature matrix held by the helper (the target column is not part of it)
eagle_eye.X.head()


Splitting data...
Removing outliers from the training set based on a zscore tolerance of 3...
Synthetic Minority Oversampling Technique (SMOTE)


Unnamed: 0,popularity,duration_ms,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0.73,0.042473,0.686294,0.461,0.090909,0.791392,0.148187,0.032329,1e-06,0.358,0.718593,0.361245
1,0.55,0.026971,0.426396,0.166,0.090909,0.597377,0.079067,0.927711,6e-06,0.101,0.268342,0.318397
2,0.57,0.038679,0.44467,0.359,0.0,0.736123,0.05772,0.210843,0.0,0.117,0.120603,0.313643
3,0.71,0.036978,0.270051,0.0596,0.0,0.573701,0.037617,0.908635,7.1e-05,0.132,0.143719,0.746758
4,0.82,0.036389,0.627411,0.443,0.181818,0.737103,0.054508,0.470884,0.0,0.0829,0.167839,0.492863



---

## Training and Testing Models

---

## Random Forest Model

In [32]:
#May take a long time to run if perform_random_search=True.
# NOTE(review): the recorded output shows a 300-fit parameter search running and
# an existing model.pkl being deleted and rewritten — confirm the default of
# perform_random_search in EagleEye.
eagle_eye.train_randomforest_model()

Training model...
Model loaded from model.pkl
Deleted existing model.pkl
Finding best parameters...
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best Params: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None, 'bootstrap': False}
Model saved to model.pkl


In [33]:
# Print feature importances and test-data scores; presumably for the Random
# Forest trained in the previous cell — confirm in EagleEye.
eagle_eye.evaluate_model()


Evaluating model...
Making predictions...
Getting accuracy metrics...
--- Feature Importances ---
             Feature  Importance
0         popularity    0.143214
7       acousticness    0.097516
2       danceability    0.087101
10           valence    0.085664
1        duration_ms    0.082802
3             energy    0.080923
11             tempo    0.074589
6        speechiness    0.072585
8   instrumentalness    0.070909
5           loudness    0.070269
9           liveness    0.067874
4                key    0.066555
---

--- Test Data Scores ---
accuracy: 0.4337018710375754
precision: 0.4480358974592282
recall: 0.4337018710375754
f1: 0.4370354192305087
classification_report:
               precision    recall  f1-score   support

     acoustic       0.21      0.30      0.25       505
  alternative       0.22      0.19      0.21      1477
        blues       0.19      0.25      0.22       251
     children       0.46      0.69      0.55       484
    classical       0.43      0.50 

---

## KNN Model

In [34]:
# Train the KNN model via the EagleEye helper
eagle_eye.train_knn_model()


In [35]:
# Print test-data scores; presumably for the KNN model trained in the previous
# cell — confirm in EagleEye.
eagle_eye.evaluate_model()


Evaluating model...
Making predictions...
Getting accuracy metrics...
Feature importance is only available for RandomForestClassifier.
--- Test Data Scores ---
accuracy: 0.2721895778568115
precision: 0.34634663588152287
recall: 0.2721895778568115
f1: 0.2874326074434115
classification_report:
               precision    recall  f1-score   support

     acoustic       0.11      0.20      0.14       505
  alternative       0.16      0.19      0.18      1477
        blues       0.08      0.22      0.12       251
     children       0.23      0.37      0.28       484
    classical       0.28      0.46      0.35       735
      country       0.34      0.50      0.40       490
   electronic       0.66      0.29      0.40      5176
         folk       0.13      0.25      0.17       523
         funk       0.14      0.23      0.18       700
      hip-hop       0.12      0.27      0.16       250
         jazz       0.25      0.44      0.32       254
        latin       0.41      0.34      0.37  

---

## Logistic Regression

In [36]:
# Train the logistic regression model via the EagleEye helper.
# NOTE(review): the recorded output shows the solver stopping at its iteration
# limit (convergence warning) — consider raising max_iter inside EagleEye.
eagle_eye.train_logistic_regression_model()


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [37]:
# Print test-data scores; presumably for the logistic regression model trained
# in the previous cell — confirm in EagleEye.
eagle_eye.evaluate_model()


Evaluating model...
Making predictions...
Getting accuracy metrics...
Feature importance is only available for RandomForestClassifier.
--- Test Data Scores ---
accuracy: 0.23561929797433123
precision: 0.3349637853445963
recall: 0.23561929797433123
f1: 0.23874594852847084
classification_report:
               precision    recall  f1-score   support

     acoustic       0.13      0.32      0.18       505
  alternative       0.09      0.03      0.04      1477
        blues       0.03      0.04      0.03       251
     children       0.15      0.47      0.23       484
    classical       0.22      0.55      0.31       735
      country       0.15      0.30      0.20       490
   electronic       0.70      0.33      0.45      5176
         folk       0.13      0.13      0.13       523
         funk       0.08      0.02      0.03       700
      hip-hop       0.05      0.48      0.10       250
         jazz       0.13      0.51      0.21       254
        latin       0.40      0.27      0.33

---

## SVM Model

In [38]:
# Train the SVM model via the EagleEye helper
eagle_eye.train_svm_model()


In [39]:
# Print test-data scores; presumably for the SVM model trained in the previous
# cell — confirm in EagleEye.
eagle_eye.evaluate_model()


Evaluating model...
Making predictions...
Getting accuracy metrics...
Feature importance is only available for RandomForestClassifier.
--- Test Data Scores ---
accuracy: 0.3051260244317303
precision: 0.41024716904252034
recall: 0.3051260244317303
f1: 0.31561463355130104
classification_report:
               precision    recall  f1-score   support

     acoustic       0.15      0.37      0.21       505
  alternative       0.21      0.13      0.16      1477
        blues       0.09      0.26      0.14       251
     children       0.26      0.56      0.35       484
    classical       0.31      0.62      0.41       735
      country       0.28      0.54      0.37       490
   electronic       0.67      0.34      0.45      5176
         folk       0.17      0.34      0.23       523
         funk       0.17      0.18      0.18       700
      hip-hop       0.10      0.70      0.17       250
         jazz       0.23      0.56      0.33       254
        latin       0.56      0.37      0.45 

---

## XGBoost Model

In [40]:
# Train the XGBoost model via the EagleEye helper
eagle_eye.train_xgboost_model()


In [41]:
# Print test-data scores; presumably for the XGBoost model trained in the
# previous cell — confirm in EagleEye.
eagle_eye.evaluate_model()

Evaluating model...
Making predictions...
Getting accuracy metrics...
Feature importance is only available for RandomForestClassifier.
--- Test Data Scores ---
accuracy: 0.42910159270140713
precision: 0.45842659812822173
recall: 0.42910159270140713
f1: 0.4359232503971481
classification_report:
               precision    recall  f1-score   support

     acoustic       0.20      0.32      0.24       505
  alternative       0.25      0.18      0.21      1477
        blues       0.15      0.27      0.19       251
     children       0.44      0.67      0.53       484
    classical       0.40      0.54      0.46       735
      country       0.50      0.64      0.56       490
   electronic       0.65      0.52      0.58      5176
         folk       0.29      0.36      0.32       523
         funk       0.26      0.28      0.27       700
      hip-hop       0.19      0.52      0.27       250
         jazz       0.32      0.47      0.38       254
        latin       0.58      0.53      0.56