# 2020 Top 50 Spotify Tracks Processing with Numpy and Pandas
## About the Dataset
### context
Top 50 most streamed tracks on Spotify in 2020. This dataset has various variables regarding these songs.
### content
50 songs and 16 features

In [67]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

In [2]:
def load_data(path="Data/spotifytoptracks.csv"):
    '''
        Loads spotify top 50 tracks

        Parameters
        ----------
        path : str or path-like, optional
            Location of the CSV file. Defaults to the dataset path used by
            this notebook, so calling load_data() without arguments behaves
            exactly as before.

        Returns
        -------
        pandas.DataFrame
            The contents of the file, with the first CSV column used as the
            row index.
    '''
    return pd.read_csv(path, index_col=0)

In [4]:
# Load the dataset into the DataFrame that the rest of the notebook reads from
spotify_top_tracks = load_data()

### Cleaning Data
<ul>
    <li>Handling Missing Values</li>
    <li>Removing duplicate features and samples</li>
    <li>Treating outliers</li>
</ul>
    



In [52]:
# Check for missing values: info() reports the non-null count and dtype of every column
spotify_top_tracks.info()

The above output shows that there are 50 observations of 16 variables. <br>
Every variable has 50 non-null records, indicating that there are no missing values in the data.

In [8]:
# Count fully duplicated rows (samples); nothing is removed here, the cell only reports the count.
# Duplicate features (columns) are not checked by this call.
spotify_top_tracks.duplicated().sum()

The above output indicates that the data does not contain duplicated records.

### Treating outliers
Outliers are measurements that differ significantly from the other data points. <br>
In this section, the following techniques are used to detect outliers:

<ol>
    <li>Interquartile Range(IQR)</li>
    <li>Measure of Skewness</li>
    <li>visualization with Box Plot</li>
    <li>visualization with Histogram</li>
</ol>


#### Interquartile Range (IQR)
IQR is a measure of statistical dispersion. <br>
$$ IQR = Q3-Q1 $$ 
where Q1 and Q3 represent the 25th and 75th percentiles, respectively. <br>
A data point <b><i>X</i></b> is an outlier if $$ X < Q1 - 1.5 \times IQR \quad \text{or} \quad X > Q3 + 1.5 \times IQR $$


In [9]:
# Determine the first quartile, third quartile and IQR of every numeric column.
# numeric_only=True is required because the frame also holds text columns
# (artist, album, ...); pandas >= 2.0 no longer skips them by default and
# raises a TypeError when asked for their quantiles.
Q1 = spotify_top_tracks.quantile(0.25, numeric_only=True)
Q3 = spotify_top_tracks.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1

In [28]:
# Flag every value that lies outside the IQR fences (flagged values are True),
# then count how many values were flagged in each column
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
IQR_result = (spotify_top_tracks < lower_fence) | (spotify_top_tracks > upper_fence)
IQR_result[IQR_result].count()

The IQR outlier analysis shows that the columns acousticness, danceability, duration_ms, instrumentalness, liveness, loudness, and speechiness contain data points that are outliers.

#### Measure of Skewness
Skewness measures how asymmetric a distribution is, and therefore how far the data departs from a symmetric, normal-like shape. <br>
Ideally, the skewness value should be between -1 and +1; any major deviation from this range indicates the presence of extreme values.

In [30]:
# Skewness of every int64/float64 column; report the columns whose skewness
# falls below -1 (left skewed) or above +1 (right skewed)
res = spotify_top_tracks.select_dtypes(include=['int64', 'float64']).skew()
left_skewed = res[res < -1].index
right_skewed = res[res > 1].index
print(left_skewed, right_skewed, sep="\n")

#### Visualizing outliers using Boxplot

In [47]:
# One box plot per int64/float64 column, each preceded by the column name
numeric_columns = spotify_top_tracks.select_dtypes(include=['int64', 'float64']).columns
for column in numeric_columns:
    print(column)
    plt.boxplot(spotify_top_tracks[column])
    plt.show()


#### Visualizing outliers using Histogram

In [51]:
# One histogram per int64/float64 column, each preceded by the column name.
# Series.hist() draws on the current figure, so plt.show() is called after
# every column; without it all histograms are overlaid on one set of axes.
for cols in spotify_top_tracks.select_dtypes(include=['int64', 'float64']).columns:
    print(cols)
    spotify_top_tracks[cols].hist()
    plt.title(cols)
    plt.show()

### Exploratory Data Analysis
#### Number of Observations and features

In [59]:
# shape is (rows, columns): rows are the observations, columns are the features
frame_shape = spotify_top_tracks.shape
observation_no = frame_shape[0]
features_no = frame_shape[1]
print(f'Number of observation is {observation_no}')
print(f'Number of features is {features_no}')

#### Identifying Categorical and Numerical Features

In [65]:
# Numerical features are the columns with a numeric dtype.
# select_dtypes is the public pandas API for this selection; the previously
# used _get_numeric_data() is a private method and not part of that API.
numerical_features = spotify_top_tracks.select_dtypes(include='number').columns
print("Numerical Features")
numerical_features

In [66]:
# Categorical features are the columns that are not numerical.
# sorted() gives a stable, alphabetical listing; a bare set is displayed in
# arbitrary order.
print("Categorical Features")
sorted(set(spotify_top_tracks) - set(numerical_features))

#### Artists with more than one popular track and their tracks

In [69]:
# Count the tracks of each artist, keep the artists that appear more than once,
# and list their tracks grouped by artist
counter = Counter(spotify_top_tracks['artist'])
famous_artist = [name for name, track_count in counter.items() if track_count > 1]
famous_artist_tracks = spotify_top_tracks.query("artist in @famous_artist")
famous_artist_tracks[['artist', 'track_name']].sort_values('artist')

#### Most popular artist

In [77]:
# Most popular artist = the most frequent value in the artist column.
# mode() returns a Series, so artists tied for first place would all be listed.
spotify_top_tracks['artist'].mode()

#### Total Number of artists who have their songs in the top 50

In [79]:
# Number of distinct artists; dropna=False counts a missing value as its own entry
spotify_top_tracks['artist'].nunique(dropna=False)

#### Albums with more than one popular track

In [82]:
# Count the tracks of each album, keep the albums that appear more than once,
# and list their tracks grouped by album
counter = Counter(spotify_top_tracks['album'])
famous_album = [album for album, track_count in counter.items() if track_count > 1]
famous_album_tracks = spotify_top_tracks.query("album in @famous_album")
famous_album_tracks[['album', 'track_name']].sort_values('album')

#### Total Number of Albums whose songs are in the top 50

In [84]:
# Number of distinct albums; dropna=False counts a missing value as its own entry
spotify_top_tracks['album'].nunique(dropna=False)

#### Tracks whose danceability score is above 0.7

In [86]:
# Tracks with a danceability score strictly greater than 0.7
is_highly_danceable = spotify_top_tracks['danceability'] > 0.7
spotify_top_tracks.loc[is_highly_danceable, ['track_name', 'danceability']]

#### Tracks whose danceability score is below 0.4

In [88]:
# Tracks with a danceability score strictly less than 0.4
is_barely_danceable = spotify_top_tracks['danceability'] < 0.4
spotify_top_tracks.loc[is_barely_danceable, ['track_name', 'danceability']]

Unnamed: 0,track_name,danceability
44,lovely (with Khalid),0.351


#### Tracks whose loudness score is above -5

In [89]:
# Tracks with a loudness score strictly greater than -5
is_loud = spotify_top_tracks['loudness'] > -5
spotify_top_tracks.loc[is_loud, ['track_name', 'loudness']]

Unnamed: 0,track_name,loudness
4,Don't Start Now,-4.521
6,Watermelon Sugar,-4.209
10,Tusa,-3.28
12,Circles,-3.497
16,Before You Go,-4.858
17,Say So,-4.577
21,Adore You,-3.675
23,Mood (feat. iann dior),-3.558
31,Break My Heart,-3.434
32,Dynamite,-4.41


#### Tracks whose loudness score is below -8

In [90]:
# Tracks with a loudness score strictly less than -8
is_quiet = spotify_top_tracks['loudness'] < -8
spotify_top_tracks.loc[is_quiet, ['track_name', 'loudness']]

Unnamed: 0,track_name,loudness
7,death bed (coffee for your head),-8.765
8,Falling,-8.756
15,Toosie Slide,-8.82
20,Savage Love (Laxed - Siren Beat),-8.52
24,everything i wanted,-14.454
26,bad guy,-10.965
36,HIGHEST IN THE ROOM,-8.764
44,lovely (with Khalid),-10.109
47,If the World Was Ending - feat. Julia Michaels,-10.086


#### Longest Track

In [None]:
# Row of the track with the largest duration_ms value
longest_track_index = spotify_top_tracks['duration_ms'].idxmax()
spotify_top_tracks.loc[longest_track_index]

#### Shortest Track

In [94]:
# Row of the track with the smallest duration_ms value
shortest_track_index = spotify_top_tracks['duration_ms'].idxmin()
spotify_top_tracks.loc[shortest_track_index]

artist                            24kGoldn
album               Mood (feat. iann dior)
track_name          Mood (feat. iann dior)
track_id            3tjFYV6RSFtuktYl3ZtYcq
energy                               0.722
danceability                           0.7
key                                      7
loudness                            -3.558
acousticness                         0.221
speechiness                         0.0369
instrumentalness                       0.0
liveness                             0.272
valence                              0.756
tempo                               90.989
duration_ms                         140526
genre                              Pop rap
Name: 23, dtype: object

#### Popular genre

In [95]:
# Most popular genre = the most frequent value in the genre column.
# mode() returns a Series, so genres tied for first place would all be listed.
spotify_top_tracks['genre'].mode()

0    Pop
Name: genre, dtype: object

#### Total Number of genres represented in top 50

In [96]:
# Number of distinct genres; dropna=False counts a missing value as its own entry
spotify_top_tracks['genre'].nunique(dropna=False)

16

#### Strongly positively correlated features

#### Strongly negatively correlated features

#### Features that are not correlated