# Music Recommendation System

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# Import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity

## Load data

* **Position** - Spotify Ranking
* **Artist Name** - Artist Name
* **Song Name** - Song Name
* **Days** - Number of days since the release of the song (Updated)
* **Top 10 (xTimes)** - Number of times inside the top 10
* **Peak Position** - Peak position attained
* **Peak Position (xTimes)** - Number of times the peak position was attained
* **Peak Streams** - Total number of streams during the peak position
* **Total Streams** - Total song streams

In [2]:
# Data dictionary: maps each column name of the dataset to a short
# description of what that column holds.
about_data = dict(
    [
        ("Position", "Spotify Ranking"),
        ("Artist Name", "Artist Name"),
        ("Song Name", "Song Name"),
        ("Days", "No of days since the release of the song (Updated)"),
        ("Top 10 (xTimes)", "No of times inside top 10"),
        ("Peak Position", "Peak position attained"),
        ("Peak Position (xTimes)", "No of times Peak position attained"),
        ("Peak Streams", "Total no of streams during Peak position "),
        ("Total Streams", "Total song streams"),
    ]
)

In [3]:
# Number of leading rows of the CSV used in this notebook.
N_ROWS = 1000

# `nrows` makes pandas stop parsing after N_ROWS rows instead of loading the
# whole file and slicing it afterwards.
# NOTE: column dtypes are therefore inferred from these rows only.
df = pd.read_csv("data/song-dataset.csv", nrows=N_ROWS, low_memory=False)

df.head()

Unnamed: 0,Position,Artist Name,Song Name,Days,Top 10 (xTimes),Peak Position,Peak Position (xTimes),Peak Streams,Total Streams
0,1,Post Malone,Sunflower SpiderMan: Into the SpiderVerse,1506,302.0,1,(x29),2118242,883369738
1,2,Juice WRLD,Lucid Dreams,1673,178.0,1,(x20),2127668,864832399
2,3,Lil Uzi Vert,XO TOUR Llif3,1853,212.0,1,(x4),1660502,781153024
3,4,J. Cole,No Role Modelz,2547,6.0,7,0,659366,734857487
4,5,Post Malone,rockstar,1223,186.0,1,(x124),2905678,718865961


In [5]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Position,Artist Name,Song Name,Days,Top 10 (xTimes),Peak Position,Peak Position (xTimes),Peak Streams,Total Streams
0,1,Post Malone,Sunflower SpiderMan: Into the SpiderVerse,1506,302.0,1,(x29),2118242,883369738
1,2,Juice WRLD,Lucid Dreams,1673,178.0,1,(x20),2127668,864832399
2,3,Lil Uzi Vert,XO TOUR Llif3,1853,212.0,1,(x4),1660502,781153024
3,4,J. Cole,No Role Modelz,2547,6.0,7,0,659366,734857487
4,5,Post Malone,rockstar,1223,186.0,1,(x124),2905678,718865961


## Clean the data

In [6]:
# Summarise column dtypes and non-null counts to see what needs cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Position                1000 non-null   int64  
 1   Artist Name             1000 non-null   object 
 2   Song Name               1000 non-null   object 
 3   Days                    1000 non-null   int64  
 4   Top 10 (xTimes)         1000 non-null   float64
 5   Peak Position           1000 non-null   int64  
 6   Peak Position (xTimes)  1000 non-null   object 
 7   Peak Streams            1000 non-null   int64  
 8   Total Streams           1000 non-null   int64  
dtypes: float64(1), int64(5), object(3)
memory usage: 70.4+ KB


In [7]:
# Show the column descriptions again for reference while cleaning.
about_data

{'Position': 'Spotify Ranking',
 'Artist Name': 'Artist Name',
 'Song Name': 'Song Name',
 'Days': 'No of days since the release of the song (Updated)',
 'Top 10 (xTimes)': 'No of times inside top 10',
 'Peak Position': 'Peak position attained',
 'Peak Position (xTimes)': 'No of times Peak position attained',
 'Peak Streams': 'Total no of streams during Peak position ',
 'Total Streams': 'Total song streams'}

In [8]:
# List the column labels of the frame.
df.columns

Index(['Position', 'Artist Name', 'Song Name', 'Days', 'Top 10 (xTimes)',
       'Peak Position', 'Peak Position (xTimes)', 'Peak Streams',
       'Total Streams'],
      dtype='object')

In [9]:
# Count the rows whose song title repeats the title of an earlier row.
df["Song Name"].duplicated().sum()

23

In [10]:
# Keep only the first row for each song title. Titles have to be unique
# because they are used further down as the row/column labels of the
# similarity table.
df = df.drop_duplicates(subset=["Song Name"], keep="first")

# Sanity check: no repeated titles should remain.
df["Song Name"].duplicated().sum()

0

In [11]:
# Count missing values per column.
df.isna().sum()

Position                  0
Artist Name               0
Song Name                 0
Days                      0
Top 10 (xTimes)           0
Peak Position             0
Peak Position (xTimes)    0
Peak Streams              0
Total Streams             0
dtype: int64

In [12]:
# Per-column missing-value count via isnull(), which pandas documents as an
# alias of isna().
df.isnull().sum()

Position                  0
Artist Name               0
Song Name                 0
Days                      0
Top 10 (xTimes)           0
Peak Position             0
Peak Position (xTimes)    0
Peak Streams              0
Total Streams             0
dtype: int64

In [13]:
# Drop every row that has at least one missing value.
df = df.dropna(axis=0, how="any")

# Confirm that no missing values are left in any column.
df.isna().sum(axis=0)

Position                  0
Artist Name               0
Song Name                 0
Days                      0
Top 10 (xTimes)           0
Peak Position             0
Peak Position (xTimes)    0
Peak Streams              0
Total Streams             0
dtype: int64

## Processing the Data

In [14]:
# Inspect the first ten rows before transforming the text columns.
df.head(10)

Unnamed: 0,Position,Artist Name,Song Name,Days,Top 10 (xTimes),Peak Position,Peak Position (xTimes),Peak Streams,Total Streams
0,1,Post Malone,Sunflower SpiderMan: Into the SpiderVerse,1506,302.0,1,(x29),2118242,883369738
1,2,Juice WRLD,Lucid Dreams,1673,178.0,1,(x20),2127668,864832399
2,3,Lil Uzi Vert,XO TOUR Llif3,1853,212.0,1,(x4),1660502,781153024
3,4,J. Cole,No Role Modelz,2547,6.0,7,0,659366,734857487
4,5,Post Malone,rockstar,1223,186.0,1,(x124),2905678,718865961
5,6,Travis Scott,goosebumps,1995,4.0,8,0,977275,672972704
6,7,The Weeknd,Blinding Lights,1100,233.0,1,(x11),2355059,644287953
7,8,XXXTENTACION,Jocelyn Flores,1673,44.0,2,(x1),3175206,624457164
8,9,XXXTENTACION,SAD!,1217,133.0,1,(x6),4437612,619879245
9,10,Juice WRLD,All Girls Are The Same,1681,2.0,5,0,1239152,613872384


### Remove spaces

Let's remove the spaces from every artist name, so that e.g. `Post Malone` becomes `PostMalone`.

In [15]:
def remove_spaces(df_column):
    """
    Return a single value as text with all whitespace removed.

    Despite the parameter name, this works on one cell value at a time (it is
    meant to be passed to ``Series.apply``), not on a whole column.

    Args:
        df_column: a single value taken from a dataframe column. It is
            converted with ``str()`` first, so non-string values are accepted.

    Return:
        str: the text of the value without spaces, tabs, newlines or other
            whitespace characters.
    """
    # str.split() without a separator splits on runs of any whitespace, so
    # joining the pieces also drops tabs and newlines, not only plain " ".
    return "".join(str(df_column).split())
                  

In [16]:
# Run the helper on each artist name, one value at a time.
df["Artist Name"] = df["Artist Name"].map(remove_spaces)

In [17]:
# Confirm that the artist names no longer contain spaces.
df.head(10)

Unnamed: 0,Position,Artist Name,Song Name,Days,Top 10 (xTimes),Peak Position,Peak Position (xTimes),Peak Streams,Total Streams
0,1,PostMalone,Sunflower SpiderMan: Into the SpiderVerse,1506,302.0,1,(x29),2118242,883369738
1,2,JuiceWRLD,Lucid Dreams,1673,178.0,1,(x20),2127668,864832399
2,3,LilUziVert,XO TOUR Llif3,1853,212.0,1,(x4),1660502,781153024
3,4,J.Cole,No Role Modelz,2547,6.0,7,0,659366,734857487
4,5,PostMalone,rockstar,1223,186.0,1,(x124),2905678,718865961
5,6,TravisScott,goosebumps,1995,4.0,8,0,977275,672972704
6,7,TheWeeknd,Blinding Lights,1100,233.0,1,(x11),2355059,644287953
7,8,XXXTENTACION,Jocelyn Flores,1673,44.0,2,(x1),3175206,624457164
8,9,XXXTENTACION,SAD!,1217,133.0,1,(x6),4437612,619879245
9,10,JuiceWRLD,All Girls Are The Same,1681,2.0,5,0,1239152,613872384


### Combine required columns and drop the others

In [18]:
# Columns that feed the recommender; everything else is listed for removal.
required_columns = ["Position", "Artist Name", "Song Name"]

# Select by name rather than by position so the result does not depend on
# the column order of the CSV.
nonrequired_column = [
    column for column in df.columns if column not in required_columns
]

nonrequired_column

['Days',
 'Top 10 (xTimes)',
 'Peak Position',
 'Peak Position (xTimes)',
 'Peak Streams',
 'Total Streams']

In [19]:
# Remove every column that was listed as non-required.
df = df.drop(columns=nonrequired_column)

df.head()

Unnamed: 0,Position,Artist Name,Song Name
0,1,PostMalone,Sunflower SpiderMan: Into the SpiderVerse
1,2,JuiceWRLD,Lucid Dreams
2,3,LilUziVert,XO TOUR Llif3
3,4,J.Cole,No Role Modelz
4,5,PostMalone,rockstar


Finally, let's combine those three columns into a single string and assign it to a new `data` column.

In [20]:
# Name the source columns explicitly: `df[df.columns]` would also pick up the
# "data" column itself when this cell is run a second time.
text_columns = ["Position", "Artist Name", "Song Name"]

# Cast to str first (Position is numeric), then join each row with spaces.
# NOTE(review): Position is a rank rather than a property of the song; with
# CountVectorizer's default settings multi-digit ranks presumably become
# extra tokens - confirm that including it is intended.
df["data"] = df[text_columns].astype(str).apply(" ".join, axis=1)

df.head()

Unnamed: 0,Position,Artist Name,Song Name,data
0,1,PostMalone,Sunflower SpiderMan: Into the SpiderVerse,1 PostMalone Sunflower SpiderMan: Into the Sp...
1,2,JuiceWRLD,Lucid Dreams,2 JuiceWRLD Lucid Dreams
2,3,LilUziVert,XO TOUR Llif3,3 LilUziVert XO TOUR Llif3
3,4,J.Cole,No Role Modelz,4 J.Cole No Role Modelz
4,5,PostMalone,rockstar,5 PostMalone rockstar


## Building recommendation model

In [21]:
# Build a bag-of-words count matrix from the combined "data" strings:
# one row per song, one column per token in the fitted vocabulary.
vectorizer = CountVectorizer()
vectorized = vectorizer.fit_transform(df["data"])

In [22]:
# Peek at the sparse entries of the first five rows: (row, token index) and count.
print(vectorized[:5])

  (0, 1965)	1
  (0, 2206)	1
  (0, 2161)	1
  (0, 1569)	1
  (0, 2257)	1
  (0, 2162)	1
  (1, 1611)	1
  (1, 1726)	1
  (1, 1316)	1
  (2, 1688)	1
  (2, 2432)	1
  (2, 2314)	1
  (2, 1700)	1
  (3, 1234)	1
  (3, 1866)	1
  (3, 2047)	1
  (3, 1799)	1
  (4, 1965)	1
  (4, 2043)	1


In [23]:
# Pairwise cosine similarity between all song vectors; the result is a
# square matrix whose diagonal compares each song with itself.
similarities = cosine_similarity(vectorized)

print(similarities)

[[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


In [24]:
# Label both axes of the similarity matrix with the song titles, then move
# the row labels into a regular "Song Name" column so they can be selected.
song_names = df["Song Name"]
df_tmp = pd.DataFrame(similarities, index=song_names, columns=song_names)
df_tmp = df_tmp.reset_index()

df_tmp.head()

Song Name,Song Name.1,Sunflower SpiderMan: Into the SpiderVerse,Lucid Dreams,XO TOUR Llif3,No Role Modelz,rockstar,goosebumps,Blinding Lights,Jocelyn Flores,SAD!,...,Signs,Everyday We Lit,We're Good,Cry Baby,Close Friends (Lil Baby & Gunna),AOK,Rodeo,Outside Today,Question...?,Safaera
0,Sunflower SpiderMan: Into the SpiderVerse,1.0,0.0,0.0,0.0,0.288675,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Lucid Dreams,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,XO TOUR Llif3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,No Role Modelz,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,rockstar,0.288675,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
input_song = "Signs"
# How many similar songs to return.
n_recommendations = 10

# Fail early with a readable message when the title is not in the table.
if input_song not in set(df_tmp["Song Name"]):
    raise KeyError(f"{input_song!r} is not in the similarity table")

# Exclude the query song first, then take the top N by similarity. Taking
# N + 1 rows and filtering afterwards only yields N results when the query
# song itself is among those rows.
recommendations = (
    df_tmp[df_tmp["Song Name"] != input_song]
    .nlargest(n_recommendations, input_song)["Song Name"]
    .to_frame()
)

print(recommendations)

                                    Song Name
87                                    Nonstop
107  Money In The Grave (Drake ft. Rick Ross)
126                              Passionfruit
147                                   Jumpman
222                                 Controlla
329                                  Portland
451                                 I'm Upset
568                                    Energy
594                               Gyalchester
698                                    Legend
