# *This is the starting point of the project*

> ### **Importing libraries and data**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [17]:
# Parse the CSV once, then keep an independent deep copy of the raw data:
# `df` is the working frame that gets cleaned, `same_df` is used for the charts.
df = pd.read_csv('SpotifyFeatures.csv')
same_df = df.copy()  # Duplicating the dataframe to use it properly for charts

> ### *Exploring data*

In [None]:
# Print the (rows, columns) shape, then display the first rows of the frame
print(df.shape)
df.head()

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
df.info()
# Summary statistics, displayed as the cell output
df.describe()

In [None]:
# Which genres exist in the dataset? unique() lists each distinct genre label once
df['genre'].unique()

In [None]:
# Distinct time-signature values present in the dataset
df['time_signature'].unique()

> ### *Cleaning Data*

In [18]:
# Let's convert some columns to a human-readable (duration / percentage) format
def clean_data(df):
    """Return a copy of `df` with readable duration and percentage columns.

    Adds two string columns derived from `duration_ms`:
    `duration_sec` (e.g. "99.37 s") and `duration_min` (e.g. "1.66 min").
    The track-feature columns listed in `percent_cols` are multiplied by 100
    and rendered as strings such as "61.10 %".

    The input frame is NOT modified: the work is done on a copy, so calling
    this function never changes the caller's data behind its back.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `duration_ms` and every column in `percent_cols`
        as numeric values.

    Returns
    -------
    pandas.DataFrame
        The formatted copy.
    """
    # Track parameters to format as percent values
    percent_cols = ['instrumentalness', 'speechiness', 'acousticness', 'liveness',
                    'danceability', 'energy', 'valence']

    cleaned = df.copy()

    # Making duration clearer: ms -> s and ms -> min, rendered with their unit
    cleaned['duration_sec'] = (cleaned['duration_ms'] / 1000).apply(lambda x: f"{x:.2f} s")
    cleaned['duration_min'] = (cleaned['duration_ms'] / (1000 * 60)).apply(lambda x: f"{x:.2f} min")

    # Format tracks parameters columns to percent values
    for col in percent_cols:
        cleaned[col] = (cleaned[col] * 100).apply(lambda x: f"{x:.2f} %")

    return cleaned

# Clean invalid time signatures ( 0/4 and 1/4)
def clean_time_signature(ts):
    """Return `ts` unchanged, or None when it is one of the invalid values '0/4' / '1/4'."""
    invalid_signatures = ('0/4', '1/4')
    return None if ts in invalid_signatures else ts
            

# Let's clean loudness column
def categorized_loudness(ld):
    """Map a numeric loudness value to a textual category.

    The value is first clamped to the range [-60, 0], then bucketed:

        [-12,   0]  -> "Very loud"
        [-24, -12)  -> "Loud"
        [-36, -24)  -> "Moderatly quiet"
        [-45, -36)  -> "Quiet"
        [-60, -45)  -> "Very quiet"

    Every value in the clamped range gets a category, including the clamped
    extremes themselves (values above 0 or below -60) and the exact value 0.

    Parameters
    ----------
    ld : float
        Loudness value. None / NaN is treated as missing.

    Returns
    -------
    str or None
        The category label, or None for a missing value.
    """
    # `ld != ld` is only true for NaN
    if ld is None or ld != ld:
        return None

    # Clamp out-of-range values instead of dropping them
    ld = min(max(ld, -60), 0)

    # Thresholds are checked from loudest to quietest, so each test only
    # needs the lower bound of its bucket.
    if ld >= -12:
        return "Very loud"
    if ld >= -24:
        return "Loud"
    if ld >= -36:
        return "Moderatly quiet"
    if ld >= -45:
        return "Quiet"
    return "Very quiet"

# Apply the cleaning steps to the working dataframe
df['loudness'] = df['loudness'].apply(categorized_loudness)
df['time_signature'] = df['time_signature'].apply(clean_time_signature)
df = clean_data(df)

# cleaning invalid time signatures is also important for the duplicated dataframe;
# it is derived from same_df's own column so it does not depend on `df`
same_df['time_signature'] = same_df['time_signature'].apply(clean_time_signature)

df.head()


Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,duration_sec,duration_min
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,61.10 %,38.90 %,99373,91.00 %,0.00 %,C#,34.60 %,Very loud,Major,5.25 %,166.969,4/4,81.40 %,99.37 s,1.66 min
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,24.60 %,59.00 %,137373,73.70 %,0.00 %,F#,15.10 %,Very loud,Minor,8.68 %,174.003,4/4,81.60 %,137.37 s,2.29 min
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,95.20 %,66.30 %,170267,13.10 %,0.00 %,C,10.30 %,Loud,Minor,3.62 %,99.488,5/4,36.80 %,170.27 s,2.84 min
3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,70.30 %,24.00 %,152427,32.60 %,0.00 %,C#,9.85 %,Loud,Major,3.95 %,171.758,4/4,22.70 %,152.43 s,2.54 min
4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,95.00 %,33.10 %,82625,22.50 %,12.30 %,F,20.20 %,Loud,Major,4.56 %,140.576,4/4,39.00 %,82.62 s,1.38 min


In [None]:
# Only the last expression of a cell is displayed, so print both results
# explicitly to check the cleaned time signatures of each dataframe.
print(same_df['time_signature'].unique())
print(df['time_signature'].unique())

In [19]:
# duration_ms is dropped from the working frame (duration_sec / duration_min remain).
# errors='ignore' keeps this cell re-runnable once the column is already gone.
df = df.drop(columns='duration_ms', errors='ignore')
df

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,duration_sec,duration_min
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,61.10 %,38.90 %,91.00 %,0.00 %,C#,34.60 %,Very loud,Major,5.25 %,166.969,4/4,81.40 %,99.37 s,1.66 min
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,24.60 %,59.00 %,73.70 %,0.00 %,F#,15.10 %,Very loud,Minor,8.68 %,174.003,4/4,81.60 %,137.37 s,2.29 min
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,95.20 %,66.30 %,13.10 %,0.00 %,C,10.30 %,Loud,Minor,3.62 %,99.488,5/4,36.80 %,170.27 s,2.84 min
3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,70.30 %,24.00 %,32.60 %,0.00 %,C#,9.85 %,Loud,Major,3.95 %,171.758,4/4,22.70 %,152.43 s,2.54 min
4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,95.00 %,33.10 %,22.50 %,12.30 %,F,20.20 %,Loud,Major,4.56 %,140.576,4/4,39.00 %,82.62 s,1.38 min
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232720,Soul,Slave,Son Of Slide,2XGLdVl7lGeq8ksM6Al7jT,39,0.38 %,68.70 %,71.40 %,54.40 %,D,8.45 %,Very loud,Major,3.16 %,115.542,4/4,96.20 %,326.24 s,5.44 min
232721,Soul,Jr Thomas & The Volcanos,Burning Fire,1qWZdkBl4UVPj9lK6HuuFM,38,3.29 %,78.50 %,68.30 %,0.09 %,E,23.70 %,Very loud,Minor,3.37 %,113.830,4/4,96.90 %,282.45 s,4.71 min
232722,Soul,Muddy Waters,(I'm Your) Hoochie Coochie Man,2ziWXUmQLrXTiYjCg2fZ2t,47,90.10 %,51.70 %,41.90 %,0.00 %,D,9.45 %,Very loud,Major,14.80 %,84.135,4/4,81.30 %,166.96 s,2.78 min
232723,Soul,R.LUM.R,With My Words,6EFsue2YbIG4Qkq8Zr9Rir,44,26.20 %,74.50 %,70.40 %,0.00 %,A,33.30 %,Very loud,Major,14.60 %,100.031,4/4,48.90 %,222.44 s,3.71 min


> ### *Analyzing data*

In [None]:
# After exploring data, it's time to use it to answer questions

In [None]:
# What are the 5 most popular tracks? What are their names, genres and artists?

# track_name is selected together with the other columns so that it stays attached
# to its own row while sorting. Passing the separate, unsorted df['track_name']
# Series to set_index() after the sort would label the rows by position, i.e.
# with the wrong track names.
popular_grs = (
    df[['track_name', 'artist_name', 'genre', 'popularity']]
    .sort_values(by='popularity', ascending=False)
    .set_index('track_name')
)
print(popular_grs.head(5))

In [None]:
# What are the top 5 common keys and their most frequent mode?

def _most_common(values):
    """Return the most frequent value of a Series (the first one on ties), or None if there is none."""
    modes = values.mode()  # .mode() returns every value tied for most frequent
    return modes.iat[0] if not modes.empty else None

results = df.groupby('key')['mode'].agg(_most_common)
most_common_keys = df['key'].value_counts().head(5).index  # the 5 most frequent keys

df_common_keys = results.loc[most_common_keys]

print(df_common_keys)

> ### *Visualizing data*

In [None]:
# This is a heatmap to represent correlation between multiple parameters

# temporary dataframe (from the initial data, i.e. the copy kept for charts)
temp_df = same_df[['popularity','acousticness','danceability','duration_ms','energy','instrumentalness',
            'liveness','loudness','speechiness','tempo','time_signature','valence']]
# numeric_only=True leaves non-numeric columns (such as the 'x/4' time signatures) out
corr_matrix = temp_df.corr(numeric_only=True)
# Boolean mask hiding the upper triangle and the diagonal
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

fig, ax = plt.subplots()
# Draw on the axes created above rather than on the implicit "current" axes
sns.heatmap(data=corr_matrix, cmap='YlGnBu', annot=True,
            fmt=".1f", mask=mask, ax=ax)
ax.set_title('Correlation between parameters of tracks');

In [None]:
# Bar chart of track popularity per genre
sns.set_style('darkgrid')
fig, ax = plt.subplots(figsize=(28,20))
# Draw on the axes created above rather than on the implicit "current" axes
sns.barplot(data=same_df, x='popularity', y='genre', palette='viridis', ax=ax)

ax.set_title('Tracks popularity by genre',fontsize=28)
ax.set_xlabel('Popularity',fontsize=20)
ax.set_ylabel('Genre',fontsize=20)
ax.tick_params(axis='y',labelsize=20,labelcolor='midnightblue')

In [None]:
# The full dataframe contains all the tracks (232725 tracks), so the next charts
# work with random samples instead. A fixed seed makes the samples, and therefore
# the charts, identical on every run.
SAMPLE_SEED = 42
same_df_sample = same_df.sample(5000, random_state=SAMPLE_SEED)  # 5000 random tracks
same_df_low_sample = same_df.sample(500, random_state=SAMPLE_SEED)  # 500 random tracks

In [None]:
# Strip plot of popularity per key, coloured by mode (500-track sample)
sns.set_style('darkgrid')
fig, ax = plt.subplots()
# Draw on the axes created above rather than on the implicit "current" axes
sns.stripplot(data=same_df_low_sample, x='key', y='popularity', palette='icefire',
              hue='mode', ax=ax)
ax.set_title('tracks popularity');

In [None]:
# 2D density of popularity against tempo (5000-track sample)
sns.set_style('whitegrid')
fig, ax = plt.subplots()
# Draw on the axes created above rather than on the implicit "current" axes
sns.kdeplot(data=same_df_sample, x='tempo', y='popularity', fill=True, cmap="viridis", ax=ax)
ax.set_title('Popularity according to tempo');

> #### **Extracting the cleaned data**

In [20]:
# index=False: the row index is not written, so the exported file does not gain
# an extra unnamed first column.
df.to_csv('clean_spotify_features_data.csv', index=False)