In [1]:
# Import dependencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import tensorflow as tf

In [2]:
# Import the input dataset
artists_df = pd.read_csv('Resources/data_by_artist_o.csv')
artists_df.head()

Unnamed: 0,genres,artists,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count
0,['show tunes'],"""Cats"" 1981 Original London Cast",0.590111,0.467222,250318.555556,0.394003,0.0114,0.290833,-14.448,0.210389,117.518111,0.3895,38.333333,5,1,9
1,[],"""Cats"" 1983 Broadway Cast",0.862538,0.441731,287280.0,0.406808,0.081158,0.315215,-10.69,0.176212,103.044154,0.268865,30.576923,5,1,26
2,[],"""Fiddler On The Roof” Motion Picture Chorus",0.856571,0.348286,328920.0,0.286571,0.024593,0.325786,-15.230714,0.118514,77.375857,0.354857,34.857143,0,1,7
3,[],"""Fiddler On The Roof” Motion Picture Orchestra",0.884926,0.425074,262890.962963,0.24577,0.073587,0.275481,-15.63937,0.1232,88.66763,0.37203,34.851852,0,1,27
4,[],"""Joseph And The Amazing Technicolor Dreamcoat""...",0.510714,0.467143,270436.142857,0.488286,0.0094,0.195,-10.236714,0.098543,122.835857,0.482286,43.0,5,1,7


In [3]:
# Check the statistics
artists_df.describe()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count
count,28680.0,28680.0,28680.0,28680.0,28680.0,28680.0,28680.0,28680.0,28680.0,28680.0,28680.0,28680.0,28680.0,28680.0
mean,0.498373,0.54649,238878.0,0.497488,0.174756,0.202441,-11.140498,0.094014,115.84483,0.512723,34.060945,5.412901,0.75917,13.847211
std,0.370614,0.176474,121131.8,0.254885,0.298406,0.140884,5.771749,0.111986,25.003834,0.244421,22.376438,3.480552,0.427595,53.372544
min,0.0,0.0,18795.5,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,0.122296,0.431,182330.4,0.283568,4e-06,0.110363,-13.972292,0.0392,99.3665,0.329,12.0,2.0,1.0,2.0
50%,0.478458,0.557,218640.0,0.504,0.00188,0.161,-10.088938,0.0522,115.3574,0.523243,39.0,6.0,1.0,3.0
75%,0.896,0.675,268467.0,0.702783,0.215291,0.247,-6.889,0.0953,129.84875,0.703,51.0,8.0,1.0,8.0
max,0.996,0.986,5403500.0,1.0,1.0,0.991,1.342,0.964,217.743,0.991,93.0,11.0,1.0,3169.0


In [4]:
# Check the correlation of dataframe
artists_df.corr()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count
acousticness,1.0,-0.422022,-0.059221,-0.801428,0.289239,0.033747,-0.628415,-0.035136,-0.26303,-0.205834,-0.55679,-0.037106,0.12363,0.022607
danceability,-0.422022,1.0,-0.113327,0.400499,-0.314969,-0.102579,0.474819,0.264533,0.119614,0.598748,0.246283,0.031761,-0.106127,-0.026207
duration_ms,-0.059221,-0.113327,1.0,-0.001462,0.117652,-0.020755,-0.083073,0.013852,-0.045021,-0.200101,0.010137,-0.011508,-0.033217,-0.008008
energy,-0.801428,0.400499,-0.001462,1.0,-0.289875,0.101859,0.794341,0.070858,0.306829,0.378717,0.415092,0.043071,-0.112821,-0.034319
instrumentalness,0.289239,-0.314969,0.117652,-0.289875,1.0,-0.059168,-0.446148,-0.150405,-0.128423,-0.257362,-0.235548,-0.018866,-0.024547,0.00376
liveness,0.033747,-0.102579,-0.020755,0.101859,-0.059168,1.0,0.05342,0.176047,-0.032469,0.013261,-0.120099,-0.002136,0.023442,0.009247
loudness,-0.628415,0.474819,-0.083073,0.794341,-0.446148,0.05342,1.0,0.04714,0.271227,0.389259,0.332941,0.031648,-0.074299,-0.032757
speechiness,-0.035136,0.264533,0.013852,0.070858,-0.150405,0.176047,0.04714,1.0,-0.016335,0.105061,-0.025825,0.014144,-0.049192,0.015201
tempo,-0.26303,0.119614,-0.045021,0.306829,-0.128423,-0.032469,0.271227,-0.016335,1.0,0.196186,0.132922,0.00467,-0.012196,0.000523
valence,-0.205834,0.598748,-0.200101,0.378717,-0.257362,0.013261,0.389259,0.105061,0.196186,1.0,0.002005,0.03778,-0.011891,-0.000422


In [5]:
artists_df.shape

(28680, 16)

In [6]:
# Check the dataframe for any null values
artists_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28680 entries, 0 to 28679
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   genres            28680 non-null  object 
 1   artists           28680 non-null  object 
 2   acousticness      28680 non-null  float64
 3   danceability      28680 non-null  float64
 4   duration_ms       28680 non-null  float64
 5   energy            28680 non-null  float64
 6   instrumentalness  28680 non-null  float64
 7   liveness          28680 non-null  float64
 8   loudness          28680 non-null  float64
 9   speechiness       28680 non-null  float64
 10  tempo             28680 non-null  float64
 11  valence           28680 non-null  float64
 12  popularity        28680 non-null  float64
 13  key               28680 non-null  int64  
 14  mode              28680 non-null  int64  
 15  count             28680 non-null  int64  
dtypes: float64(11), int64(3), object(2)
memo

In [7]:
# Generate our categorical variable lists
artists_cat = artists_df.dtypes[artists_df.dtypes == "object"].index.tolist()

In [8]:
# Check the number of unique values in each column
artists_df[artists_cat].nunique()

genres     10743
artists    28680
dtype: int64

## Transform the Range of loudness to [0,1]

In [9]:
# Transform loudness range to [0,1]
def loudness_range(loudness):
     if loudness < 0:
        return 0
     elif loudness == 0:
        return 1
    
artists_df['loudness'] = artists_df['loudness'].apply(loudness_range)
artists_df['loudness'].unique()

array([ 0., nan])

In [10]:
# display the dataframe
display(artists_df)

Unnamed: 0,genres,artists,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count
0,['show tunes'],"""Cats"" 1981 Original London Cast",0.590111,0.467222,250318.555556,0.394003,0.011400,0.290833,0.0,0.210389,117.518111,0.389500,38.333333,5,1,9
1,[],"""Cats"" 1983 Broadway Cast",0.862538,0.441731,287280.000000,0.406808,0.081158,0.315215,0.0,0.176212,103.044154,0.268865,30.576923,5,1,26
2,[],"""Fiddler On The Roof” Motion Picture Chorus",0.856571,0.348286,328920.000000,0.286571,0.024593,0.325786,0.0,0.118514,77.375857,0.354857,34.857143,0,1,7
3,[],"""Fiddler On The Roof” Motion Picture Orchestra",0.884926,0.425074,262890.962963,0.245770,0.073587,0.275481,0.0,0.123200,88.667630,0.372030,34.851852,0,1,27
4,[],"""Joseph And The Amazing Technicolor Dreamcoat""...",0.510714,0.467143,270436.142857,0.488286,0.009400,0.195000,0.0,0.098543,122.835857,0.482286,43.000000,5,1,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28675,[],麥志誠,0.512000,0.356000,198773.000000,0.306000,0.008970,0.108000,0.0,0.027700,150.049000,0.328000,35.000000,10,1,2
28676,"['c-pop', 'classic cantopop', 'classic mandopo...",黃品源,0.541000,0.578000,293840.000000,0.334000,0.000006,0.067500,0.0,0.026700,135.934000,0.243000,48.000000,9,0,2
28677,[],黃國隆,0.785455,0.570818,174582.727273,0.148400,0.000083,0.142191,0.0,0.054355,119.586273,0.741273,23.000000,5,1,11
28678,"['chinese indie', 'chinese indie rock']",黑豹,0.381000,0.353000,316160.000000,0.686000,0.000000,0.056800,0.0,0.039500,200.341000,0.352000,35.000000,11,1,2


In [11]:
# Check for any Null values in the loudness variable
artists_df["loudness"].isnull().sum

<bound method Series.sum of 0        False
1        False
2        False
3        False
4        False
         ...  
28675    False
28676    False
28677    False
28678    False
28679    False
Name: loudness, Length: 28680, dtype: bool>

## Make a new column for duration in seconds 

In [12]:
# Devide the duration_ms by 1000
artists_df["duration_ms"] = artists_df["duration_ms"]/1000
artists_df.head()

Unnamed: 0,genres,artists,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count
0,['show tunes'],"""Cats"" 1981 Original London Cast",0.590111,0.467222,250.318556,0.394003,0.0114,0.290833,0.0,0.210389,117.518111,0.3895,38.333333,5,1,9
1,[],"""Cats"" 1983 Broadway Cast",0.862538,0.441731,287.28,0.406808,0.081158,0.315215,0.0,0.176212,103.044154,0.268865,30.576923,5,1,26
2,[],"""Fiddler On The Roof” Motion Picture Chorus",0.856571,0.348286,328.92,0.286571,0.024593,0.325786,0.0,0.118514,77.375857,0.354857,34.857143,0,1,7
3,[],"""Fiddler On The Roof” Motion Picture Orchestra",0.884926,0.425074,262.890963,0.24577,0.073587,0.275481,0.0,0.1232,88.66763,0.37203,34.851852,0,1,27
4,[],"""Joseph And The Amazing Technicolor Dreamcoat""...",0.510714,0.467143,270.436143,0.488286,0.0094,0.195,0.0,0.098543,122.835857,0.482286,43.0,5,1,7


## Transform the popularity to a binary feature

In [13]:
# Selecting rows based on condition 
def popularity_binary(popularity):
     if popularity >= 50:
        return 1
     elif popularity < 50:
        return 0
    
artists_df['popularity'] = artists_df['popularity'].apply(popularity_binary)
artists_df.head()

Unnamed: 0,genres,artists,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count
0,['show tunes'],"""Cats"" 1981 Original London Cast",0.590111,0.467222,250.318556,0.394003,0.0114,0.290833,0.0,0.210389,117.518111,0.3895,0,5,1,9
1,[],"""Cats"" 1983 Broadway Cast",0.862538,0.441731,287.28,0.406808,0.081158,0.315215,0.0,0.176212,103.044154,0.268865,0,5,1,26
2,[],"""Fiddler On The Roof” Motion Picture Chorus",0.856571,0.348286,328.92,0.286571,0.024593,0.325786,0.0,0.118514,77.375857,0.354857,0,0,1,7
3,[],"""Fiddler On The Roof” Motion Picture Orchestra",0.884926,0.425074,262.890963,0.24577,0.073587,0.275481,0.0,0.1232,88.66763,0.37203,0,0,1,27
4,[],"""Joseph And The Amazing Technicolor Dreamcoat""...",0.510714,0.467143,270.436143,0.488286,0.0094,0.195,0.0,0.098543,122.835857,0.482286,0,5,1,7


## Encoding the artists variable 

In [14]:
# Count the number of unique artists
artist_names = np.unique(artists_df["artists"])
print("Artist Aames: ", artist_names)
print("No of unique artists in the list are:", len(artist_names))

Artist Aames:  ['"Cats" 1981 Original London Cast' '"Cats" 1983 Broadway Cast'
 '"Fiddler On The Roof” Motion Picture Chorus' ... '黃國隆' '黑豹' '조정현']
No of unique artists in the list are: 28680


In [15]:
# Check the value counts of the artists
artist_count = artists_df['artists'].value_counts()
print(artist_count)

SCHALA                        1
Péter Pálinkás                1
GoldLink                      1
Kavita Seth                   1
Einar Steen-Nøkleberg         1
                             ..
The Incredible String Band    1
Burl Ives                     1
Neil Diamond                  1
Robert Pagan                  1
Tommy Boyce                   1
Name: artists, Length: 28680, dtype: int64


In [16]:
# Visualize the value counts of artists
# artist_count.plot.density()

In [17]:
# Create a variable to count the number of songs of each artist
# artists_df['song_counts'] = [sum(artists_df['artists'] == artists_df['artists'][i]) for i in range(len(artists_df))]
# artists_df

In [18]:
# Check the nummber of null values
artists_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28680 entries, 0 to 28679
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   genres            28680 non-null  object 
 1   artists           28680 non-null  object 
 2   acousticness      28680 non-null  float64
 3   danceability      28680 non-null  float64
 4   duration_ms       28680 non-null  float64
 5   energy            28680 non-null  float64
 6   instrumentalness  28680 non-null  float64
 7   liveness          28680 non-null  float64
 8   loudness          28670 non-null  float64
 9   speechiness       28680 non-null  float64
 10  tempo             28680 non-null  float64
 11  valence           28680 non-null  float64
 12  popularity        28680 non-null  int64  
 13  key               28680 non-null  int64  
 14  mode              28680 non-null  int64  
 15  count             28680 non-null  int64  
dtypes: float64(10), int64(4), object(2)
memo

In [19]:
# Drop the artists column and replace that with song_counts 
artists_df = artists_df.drop(['artists'], axis='columns')
artists_df.head()

Unnamed: 0,genres,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count
0,['show tunes'],0.590111,0.467222,250.318556,0.394003,0.0114,0.290833,0.0,0.210389,117.518111,0.3895,0,5,1,9
1,[],0.862538,0.441731,287.28,0.406808,0.081158,0.315215,0.0,0.176212,103.044154,0.268865,0,5,1,26
2,[],0.856571,0.348286,328.92,0.286571,0.024593,0.325786,0.0,0.118514,77.375857,0.354857,0,0,1,7
3,[],0.884926,0.425074,262.890963,0.24577,0.073587,0.275481,0.0,0.1232,88.66763,0.37203,0,0,1,27
4,[],0.510714,0.467143,270.436143,0.488286,0.0094,0.195,0.0,0.098543,122.835857,0.482286,0,5,1,7


## Encoding the Genres

In [20]:
# Count the number of unique genres
genres = np.unique(artists_df["genres"])
print("Gendres list : ", genres)
print("No of unique gendres in the list are:", len(genres))

Gendres list :  ['["australian children\'s music", "children\'s folk", "children\'s music", \'kindie rock\', "preschool children\'s music"]'
 '["australian children\'s music", "children\'s music", "preschool children\'s music"]'
 '["australian children\'s music", "children\'s music"]' ...
 "['zurich indie']" "['zydeco']" '[]']
No of unique gendres in the list are: 10743


In [21]:
# Check the value counts of the variable genres
gendre_count = artists_df['genres'].value_counts()
print(gendre_count)

[]                                                                                                                 9857
['movie tunes']                                                                                                      69
['show tunes']                                                                                                       63
['hollywood']                                                                                                        56
['orchestral performance']                                                                                           50
                                                                                                                   ... 
['electropop', 'indie pop', 'indie poptimism', 'modern rock', 'nyc pop', 'pop', 'pop rock', 'stomp and holler']       1
['chillwave', 'indie poptimism', 'shimmer pop']                                                                       1
['french romanticism', 'post-romantic er

In [22]:
# Check for any null value in the genres column
artists_df['genres'].isnull().sum

<bound method Series.sum of 0        False
1        False
2        False
3        False
4        False
         ...  
28675    False
28676    False
28677    False
28678    False
28679    False
Name: genres, Length: 28680, dtype: bool>

In [23]:
# Classify the first populated country in one group and the rest in other 
artists_df['genres'] = artists_df['genres'].apply(lambda genre: genre if genre in 
                                ("['movie tunes']", "['show tunes']", 
                                 "['hollywood']", "['orchestral performance']") else 'Other')
artists_df.head()

Unnamed: 0,genres,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count
0,['show tunes'],0.590111,0.467222,250.318556,0.394003,0.0114,0.290833,0.0,0.210389,117.518111,0.3895,0,5,1,9
1,Other,0.862538,0.441731,287.28,0.406808,0.081158,0.315215,0.0,0.176212,103.044154,0.268865,0,5,1,26
2,Other,0.856571,0.348286,328.92,0.286571,0.024593,0.325786,0.0,0.118514,77.375857,0.354857,0,0,1,7
3,Other,0.884926,0.425074,262.890963,0.24577,0.073587,0.275481,0.0,0.1232,88.66763,0.37203,0,0,1,27
4,Other,0.510714,0.467143,270.436143,0.488286,0.0094,0.195,0.0,0.098543,122.835857,0.482286,0,5,1,7


In [24]:
# Check the genres value counts in a dataframe
artists_df.genres.value_counts(dropna=False).to_frame()

Unnamed: 0,genres
Other,28442
['movie tunes'],69
['show tunes'],63
['hollywood'],56
['orchestral performance'],50


## Use OneHotEncoder and merge to the original dataframe

In [25]:
# Generate our categorical variable lists
artists_cat = artists_df.dtypes[artists_df.dtypes == "object"].index.tolist()
print(artists_cat)

['genres']


In [26]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
artists_encode_df = pd.DataFrame(enc.fit_transform(artists_df[artists_cat]))

# Add the encoded variable names to the dataframe
artists_encode_df.columns = enc.get_feature_names(artists_cat)
artists_encode_df.head()

Unnamed: 0,genres_Other,genres_['hollywood'],genres_['movie tunes'],genres_['orchestral performance'],genres_['show tunes']
0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0


In [27]:
# Rename the columns of artists_encode_df
for column in artists_encode_df.columns:
    new_column = column
    if '[' in column:
        new_column = new_column.replace('[\'', '')
    if ']' in column:
        new_column = new_column.replace('\']', '')
    artists_encode_df = artists_encode_df.rename(columns={column: new_column})
artists_encode_df.head()

Unnamed: 0,genres_Other,genres_hollywood,genres_movie tunes,genres_orchestral performance,genres_show tunes
0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0


In [28]:
# Merge one-hot encoded features and drop the originals
artists_df = artists_df.merge(artists_encode_df,left_index=True, right_index=True)
artists_df = artists_df.drop(artists_cat,1)
artists_df.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count,genres_Other,genres_hollywood,genres_movie tunes,genres_orchestral performance,genres_show tunes
0,0.590111,0.467222,250.318556,0.394003,0.0114,0.290833,0.0,0.210389,117.518111,0.3895,0,5,1,9,0.0,0.0,0.0,0.0,1.0
1,0.862538,0.441731,287.28,0.406808,0.081158,0.315215,0.0,0.176212,103.044154,0.268865,0,5,1,26,1.0,0.0,0.0,0.0,0.0
2,0.856571,0.348286,328.92,0.286571,0.024593,0.325786,0.0,0.118514,77.375857,0.354857,0,0,1,7,1.0,0.0,0.0,0.0,0.0
3,0.884926,0.425074,262.890963,0.24577,0.073587,0.275481,0.0,0.1232,88.66763,0.37203,0,0,1,27,1.0,0.0,0.0,0.0,0.0
4,0.510714,0.467143,270.436143,0.488286,0.0094,0.195,0.0,0.098543,122.835857,0.482286,0,5,1,7,1.0,0.0,0.0,0.0,0.0


In [29]:
# Check the uniques in each column
artists_df.nunique()

acousticness                     14263
danceability                     10752
duration_ms                      23960
energy                           12290
instrumentalness                 15599
liveness                         12624
loudness                             1
speechiness                      11699
tempo                            24860
valence                          12024
popularity                           2
key                                 12
mode                                 2
count                              379
genres_Other                         2
genres_hollywood                     2
genres_movie tunes                   2
genres_orchestral performance        2
genres_show tunes                    2
dtype: int64

In [30]:
# Check the size of dataset
artists_df.shape

(28680, 19)

In [31]:
# Check for any Nans
artists_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28680 entries, 0 to 28679
Data columns (total 19 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   acousticness                   28680 non-null  float64
 1   danceability                   28680 non-null  float64
 2   duration_ms                    28680 non-null  float64
 3   energy                         28680 non-null  float64
 4   instrumentalness               28680 non-null  float64
 5   liveness                       28680 non-null  float64
 6   loudness                       28670 non-null  float64
 7   speechiness                    28680 non-null  float64
 8   tempo                          28680 non-null  float64
 9   valence                        28680 non-null  float64
 10  popularity                     28680 non-null  int64  
 11  key                            28680 non-null  int64  
 12  mode                           28680 non-null 

## Standardize the DataSet

In [32]:
# use MinMax Scaler for the dataframe and dropna all the Nan values
for column in list(artists_df):
    x = MinMaxScaler().fit_transform(np.array(artists_df[column]).reshape(-1, 1)).tolist()
    artists_df[column] = [i[0] for i in x]
artists_df = artists_df.dropna()

In [33]:
# Check the statistics
artists_df.describe()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key,mode,count,genres_Other,genres_hollywood,genres_movie tunes,genres_orchestral performance,genres_show tunes
count,28670.0,28670.0,28670.0,28670.0,28670.0,28670.0,28670.0,28670.0,28670.0,28670.0,28670.0,28670.0,28670.0,28670.0,28670.0,28670.0,28670.0,28670.0,28670.0
mean,0.500432,0.554261,0.040876,0.497341,0.174798,0.204245,0.0,0.097491,0.532042,0.517364,0.280537,0.492082,0.759261,0.004057,0.991699,0.001953,0.002407,0.001744,0.002197
std,0.372089,0.178992,0.022497,0.254803,0.298442,0.142142,0.0,0.116147,0.114785,0.246625,0.44927,0.316382,0.42754,0.01685,0.090734,0.044153,0.049,0.041725,0.046826
min,0.0,0.0,0.000278,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.122858,0.43712,0.030374,0.2835,4e-06,0.111352,0.0,0.040664,0.456375,0.331988,0.0,0.181818,1.0,0.000316,1.0,0.0,0.0,0.0,0.0
50%,0.480505,0.564909,0.037113,0.504,0.00188,0.162462,0.0,0.054149,0.529787,0.528007,0.0,0.545455,1.0,0.000631,1.0,0.0,0.0,0.0,0.0
75%,0.899598,0.684584,0.046367,0.702625,0.215339,0.249243,0.0,0.098803,0.596331,0.709384,1.0,0.727273,1.0,0.00221,1.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [34]:
# Remove loan status target from features data
y = artists_df.popularity
X = artists_df.drop(columns=["popularity"])

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Logistic Regression

In [35]:
# Define the logistic regression model
log_classifier = LogisticRegression(solver="lbfgs",max_iter=200)

# Train the model
log_classifier.fit(X_train,y_train)

# Evaluate the model
y_pred = log_classifier.predict(X_test)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Logistic regression model accuracy: 0.743


## Nueral Network

In [37]:
# Define the basic neural network model
nn_model = tf.keras.models.Sequential()
nn_model.add(tf.keras.layers.Dense(units=16, activation="relu", input_dim=18))
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn_model.fit(X_train_scaled, y_train, epochs=50)

# Evaluate the model using the test data
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
224/224 - 0s - loss: 0.4442 - accuracy: 0.7836
Loss: 0.4442385733127594, Accuracy: 0.7836216688156128


## Random Forest Model

In [38]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.817


## Deep Learning Model

In [42]:
# Define the model - deep neural net
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 =  24
hidden_nodes_layer2 = 12

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="tanh")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))


# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=100)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
224/224 - 0s - loss: 0.4342 - accuracy: 0.7879
Loss: 0.4341844618320465, Accuracy: 0.7879464030265808


In [None]:
# file_path = Path("Resources/new_artist_data.csv")
# new_artist_df.to_csv(file_path, index=False)
