In [None]:
!pip install kagglehub --upgrade

In [None]:
import kagglehub

In [None]:
# download the dataset from kaggle

kagglehub.dataset_download("undefinenull/million-song-dataset-spotify-lastfm")

In [None]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# kagglehub.dataset_download returns the local directory that holds the
# dataset files, so take the location from it instead of hard-coding
# "/root/.cache/.../versions/1" (which breaks for non-root users, a custom
# cache directory, or a newer dataset version).
data_path = Path(kagglehub.dataset_download("undefinenull/million-song-dataset-spotify-lastfm"))

songs_data_path = data_path / 'Music Info.csv'
users_data_path = data_path / 'User Listening History.csv'

# Songs Dataset

In [None]:
# load the songs data

df_songs = pd.read_csv(songs_data_path)
df_songs.head()

## Preliminary Analysis

In [None]:
# size of dataset

df_songs.shape

In [None]:
# data info

df_songs.info()

In [None]:
# drop columns that are not needed for the analysis

columns_to_drop = ["spotify_preview_url"]

# Rebind instead of mutating in place and ignore columns that are already
# gone, so re-running this cell does not raise a KeyError.
df_songs = df_songs.drop(columns=columns_to_drop, errors="ignore")

df_songs.head()

### Missing Values

In [None]:
# missing values

df_songs.isna().sum()

In [None]:
import missingno as msno

In [None]:
msno.matrix(df_songs)

In [None]:
# ratio of missing values in data

(
    df_songs
    .isna()
    .mean()
    .sort_values(ascending=False)
    .head(2)
    .mul(100)
)

### Duplicates in Data

In [None]:
# check for duplicates based on name of the song

(
    df_songs
    .assign(name=df_songs['name'].str.lower())
    .duplicated(subset='name')
    .sum()
)

**There are 815 duplicate rows based on the name of the song**

In [None]:
# rows whose lower-cased song name occurs more than once, sorted by name

(
    df_songs
    # lower-case the names once, then filter on the frame's own "name" column
    .assign(name=df_songs['name'].str.lower())
    .loc[lambda frame: frame.duplicated(subset='name', keep=False)]
    .sort_values("name")
)

In [None]:
# duplicates in the data based on spotify_id

(
    df_songs
    .duplicated(subset="spotify_id")
    .sum()
)

In [None]:
# duplicate songs in the data

(
    df_songs
    .duplicated(subset=["spotify_id","year","duration_ms"])
    .sum()
)

**9 Duplicate songs in the data**

In [None]:
# rows that have duplicate data

(
    df_songs
    .loc[
        df_songs
        .duplicated(subset=["spotify_id","year","duration_ms"],keep=False)
    ]
    .sort_values(["spotify_id","year","duration_ms"])
)

In [None]:
# drop duplicate songs (the first occurrence is kept; index labels are not reset)

df_songs = df_songs.drop_duplicates(subset=["spotify_id","year","duration_ms"])

In [None]:
# check for duplicates

(
    df_songs
    .duplicated(subset=["spotify_id","year","duration_ms"])
    .sum()
)

**Duplicate songs have been removed from the data**

## Column Wise Analysis

### Categorical Columns

In [None]:
# list of columns in data

df_songs.columns

In [None]:
df_songs.dtypes

In [None]:
# shape of data

df_songs.shape

In [None]:
# Categorical Columns

categorical_features = df_songs.select_dtypes(include="object").columns
categorical_features

In [None]:
def categorical_analysis(df,feature_names,k_artists=15):
    """Print a short textual summary for each categorical column.

    For every column in ``feature_names`` the number of distinct values
    (compared case-insensitively) is printed. For the "artist" and "genre"
    columns the ``k_artists`` most frequent values are printed as well, and
    for "genre" the distinct non-missing values are listed. Each column's
    report ends with a line of 75 ``#`` characters.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the columns to summarise.
    feature_names : iterable of str
        Labels of string-valued columns in ``df``.
    k_artists : int, default 15
        Number of most frequent values shown for "artist" and "genre".

    Returns
    -------
    None
        Everything is written to standard output.
    """
    separator = "#" * 75
    for feature in feature_names:
        column = df[feature]
        n_categories = column.str.lower().nunique()
        print(f"Number of categories in column {feature} are ", n_categories)

        if feature == "artist" or feature == "genre":
            top_counts = column.value_counts().head(k_artists)
            print(top_counts)

        if feature == "genre":
            distinct_values = column.dropna().unique()
            print(f"The unique categories in {feature} column are: ", distinct_values)
        print(separator)

In [None]:
# perform categorical analysis

categorical_analysis(df_songs,categorical_features)

**Observations**:  
1. The `Track ID` and `Spotify ID` values are unique for every row.
2. Some song names are repeated. That is because a few songs in the data share the same name but are performed by different artists.
3. The dataset contains songs by about `8317 artists`.
4. There are 15 distinct categories in the `Genre` column.

In [None]:
# countplot of genre

sns.countplot(df_songs,x="genre")
plt.xticks(rotation=90)
plt.show()

In [None]:
# genre group

genre_group = df_songs.groupby("genre")

# three example rows per genre; the fixed seed keeps the sample identical on every run
genre_group[['genre','tags']].sample(3, random_state=42)

In [None]:
# song titles that contain at least one non-ASCII character
# (heuristic for titles that are not written in plain English; it also flags
# typographic punctuation such as curly quotes)
#
# The pattern is a raw string that matches any character outside the ASCII
# range. A \w-based allow-list does not work here: \w is Unicode-aware for str
# patterns, so accented and non-Latin letters would count as "allowed".
# na=False treats missing names as non-matching so the mask stays boolean.

(   df_songs
    .loc[
        df_songs
        .loc[:,"name"]
        .str.contains(r"[^\x00-\x7F]", na=False)
    ]
)

In [None]:
# artist names that contain at least one non-ASCII character
# (heuristic for names that are not written in plain English; it also flags
# typographic punctuation such as curly quotes)
#
# The pattern is a raw string that matches any character outside the ASCII
# range. A \w-based allow-list does not work here: \w is Unicode-aware for str
# patterns, so accented and non-Latin letters would count as "allowed".
# na=False treats missing names as non-matching so the mask stays boolean.

(   df_songs
    .loc[
        df_songs
        .loc[:,"artist"]
        .str.contains(r"[^\x00-\x7F]", na=False)
    ]
)

In [None]:
df_songs['tags'][0] #rock, alternative, indie, alternative_rock, in

In [None]:
# flat list with every tag occurrence: rows without tags are skipped, spaces
# are removed, and each comma-separated entry becomes one list element
all_tags = (
    df_songs["tags"]
    .dropna()
    .str.replace(" ","")
    .str.split(",")
    .explode()
    .tolist()
)

In [None]:
print("The number of unique tags are ",len(set(all_tags)))

In [None]:
set(all_tags)

In [None]:
# unique tags in the data

(
    df_songs
    .loc[:,"tags"]
    .dropna()
    .str.split(",")
    .explode()
    .str.strip()
    .unique()
)

### Integer Based Columns

**Key**  

**What it means:** The musical key in which the song is composed, represented as integers (e.g., 0 = C, 1 = C#, 2 = D, etc., up to 11).  
- **When value is small:** Represents keys like C or D, often associated with simplicity or familiarity.  
- **When value is large:** Represents keys like A# or B, which may sound brighter or more complex, depending on the context.  

---

**Mode**  

**What it means:** Refers to the modality of the song, where 0 = minor (often associated with sadness or tension) and 1 = major (associated with happiness or resolution).  
- **When value is small:** Indicates the song is in a minor mode, creating a more somber or serious tone.  
- **When value is large:** Indicates the song is in a major mode, giving it a cheerful and uplifting feel.  

---

**Time Signature**  

**What it means:** The number of beats in each bar of music, typically expressed as an integer (e.g., 4 for 4/4 time).  
- **When value is small:** Indicates fewer beats per measure (e.g., 3), which can create a waltz-like or less conventional rhythm.  
- **When value is large:** Indicates more beats per measure (e.g., 5 or 7), often leading to a more complex or experimental rhythmic structure.


> A value of **4** in the time signature typically means the song is in **4/4 time**, also known as "common time." This means there are **4 beats per measure**, and the quarter note gets one beat.

>**Impact of a 4 Time Signature:**  
- **Musical Feel:** It creates a steady, balanced, and natural rhythm that is easy to follow.  
- **Prevalence:** It is the most common time signature in Western music, used in many genres like pop, rock, classical, and jazz.  
- **Examples:** Songs like "Billie Jean" by Michael Jackson or "Shape of You" by Ed Sheeran are in 4/4.

In [None]:
integer_columns = df_songs.select_dtypes(include="int").columns
integer_columns

In [None]:
df_songs[integer_columns]

In [None]:
# statistical summary of the integer columns except duration_ms; they are cast
# to object so describe() reports count / unique / top / freq

(
    df_songs
    .loc[:, integer_columns.drop("duration_ms")]
    .astype("object")
    .describe()
)

In [None]:
# range of data

(
    df_songs
    .loc[:,integer_columns]
    .assign(duration_minutes=df_songs["duration_ms"].div(1000).div(60))
    .drop(columns=["duration_ms"])
    .agg(["min","max"])
)

In [None]:
# number of songs per year in data

# discrete=True draws one unit-wide bar centred on each year. Passing
# bins = max - min gives one bin fewer than there are years, so the two most
# recent years end up merged into the last bar.
sns.histplot(df_songs,x="year",discrete=True,stat="count")
plt.show()

In [None]:
# most songs from which year(top 5)

(
    df_songs
    .loc[:,"year"]
    .value_counts()
    .head(5)
    .sort_index()
)

-----

**Keys**

Musical notes are assigned key values from **0 to 11**, representing all the chromatic notes in an octave. Here’s the mapping:  

| **Key Value** | **Note** |  
|---------------|-----------|  
| 0             | C         |  
| 1             | C# / Db   |  
| 2             | D         |  
| 3             | D# / Eb   |  
| 4             | E         |  
| 5             | F         |  
| 6             | F# / Gb   |  
| 7             | G         |  
| 8             | G# / Ab   |  
| 9             | A         |  
| 10            | A# / Bb   |  
| 11            | B         |  

### Notes Explained:  
- **Sharps (#):** Raise the note by a semitone (e.g., C → C#).  
- **Flats (b):** Lower the note by a semitone (e.g., D → Db).  

This cycle repeats across octaves!

-----

**English vs Hindi Notes Comparison**

Here’s a table that maps **Key Value**, **English Notes**, and their corresponding **Hindi Music Notes (Swaras)**:  

| **Key Value** | **English Note** | **Hindi Music Note** |  
|---------------|------------------|-----------------------|  
| 0             | C                | Sa                   |  
| 1             | C# / Db          | Komal Re             |  
| 2             | D                | Re                   |  
| 3             | D# / Eb          | Komal Ga             |  
| 4             | E                | Ga                   |  
| 5             | F                | Ma                   |  
| 6             | F# / Gb          | Tivra Ma             |  
| 7             | G                | Pa                   |  
| 8             | G# / Ab          | Komal Dha            |  
| 9             | A                | Dha                  |  
| 10            | A# / Bb          | Komal Ni             |  
| 11            | B                | Ni                   |  

**Notes**:
- In **Hindi classical music**, "Komal" refers to flat (lowered) notes, and "Tivra" refers to sharp (raised) notes.
- The cycle repeats with "Sa" at the next octave!

-----

If a song has a key value of **2**, it means the song is composed in the **D major** or **D minor key**, depending on the **mode** (0 = minor, 1 = major).  

**Interpretation**:              
- **Key Value 2 (D):** The tonic note, or "home base," of the song is **D**. All other notes and chords in the song revolve around this note.
- **In Western Music:**  
  - **D Major:** Bright and uplifting.  
  - **D Minor:** Melancholic or introspective.
- **In Hindi Classical Music:** This corresponds to the "Re" swara (if C is treated as the Sa of the scale).

The overall mood or emotion of the song will depend on the mode and how the scale is used melodically and harmonically.

----------------
----------------

In [None]:
# unique values in the key column

(
    np.sort(df_songs
            .loc[:,"key"]
            .unique())
)

In [None]:
# percentage of songs wrt to key in the data

(
    df_songs['key']
    .value_counts(normalize=True)
    .mul(100)
    .sort_index()
    .plot(kind='barh',title="Percentage of Songs wrt to Key",xlabel="Percentage")
)

In [None]:
# countplot for mode

sns.countplot(df_songs,x="mode")
plt.show()

In [None]:
# unique values for time signature

(
    np.sort(df_songs
            .loc[:,"time_signature"]
            .unique())
)

In [None]:
# countplot for time signature

sns.countplot(df_songs,x="time_signature")
plt.show()

In [None]:
(
    df_songs['time_signature']
    .value_counts(normalize=True)
    .mul(100)
)

In [None]:
# statistical summary of time duration

(
    df_songs
    .loc[:,["duration_ms"]]
    .assign(duration_minutes=df_songs["duration_ms"].div(1000).div(60))
    .drop(columns="duration_ms")
    .describe()
)

In [None]:
# time duration histogram

time_duration_mins = df_songs["duration_ms"].div(1000).div(60)

sns.histplot(time_duration_mins)
plt.xlabel("Time Duration (mins)")
plt.show()

In [None]:
# time duration boxplot

sns.boxplot(time_duration_mins)
plt.ylabel("Time Duration (mins)")
plt.show()

In [None]:
# song that is longer than 60 mins

(
    df_songs
    .loc[time_duration_mins > 60]
)

### Continuous Columns

In [None]:
continuous_columns = df_songs.select_dtypes(include="float").columns
continuous_columns

**Danceability**
  
**What it means:** Measures how suitable a track is for dancing, based on tempo, rhythm stability, beat strength, and overall regularity.  
- **When value is small:** The song may feel less rhythmic or harder to dance to, like ballads or experimental music.  
- **When value is large:** The song is highly danceable, with a strong beat and rhythmic consistency, ideal for clubs or parties.  

---

**Energy**

**What it means:** Represents the intensity and activity of a song, considering factors like loudness, tempo, and instrumentation.  
- **When value is small:** The song feels mellow, calm, or laid-back, such as acoustic or chill tracks.  
- **When value is large:** The song feels loud, energetic, and lively, often found in rock or EDM genres.  

---

**Loudness**

**What it means:** The overall volume of the song in decibels (dB), averaged over the track.  
- **When value is small:** The song is quieter, suitable for soft or ambient styles.  
- **When value is large:** The song is louder and more powerful, typically associated with dynamic or energetic tracks.  

---

**Speechiness**

**What it means:** Measures the presence of spoken words in a track. A higher value indicates more speech-like content.  
- **When value is small:** The song is more musical, with fewer spoken or rap-like elements.  
- **When value is large:** The track has spoken word, podcast-style content, or heavy rap influence.  

---

**Acousticness**

**What it means:** Reflects how acoustic (non-electronic) the track is.  
- **When value is small:** The track is more electronic or synthetic in nature.  
- **When value is large:** The track is acoustic, featuring instruments like guitar, piano, or strings.  

---

**Instrumentalness**

**What it means:** Predicts the likelihood of a song having no vocals (purely instrumental).  
- **When value is small:** The track likely has vocals or lyrics.  
- **When value is large:** The track is primarily instrumental, like classical or ambient music.  

---

**Liveness**

**What it means:** Measures the presence of a live audience in the recording.  
- **When value is small:** The track sounds studio-produced without live ambiance.  
- **When value is large:** The track feels live, with audience sounds or a concert vibe.  

---

**Valence**

**What it means:** Indicates the positivity or happiness of a track's mood.  
- **When value is small:** The song feels sad, somber, or emotionally heavy.  
- **When value is large:** The song feels cheerful, uplifting, or happy.  

---

**Tempo**
  
**What it means:** The speed of the song in beats per minute (BPM).  
- **When value is small:** The song is slow-paced, such as ballads or downtempo tracks.  
- **When value is large:** The song is fast-paced, like dance or upbeat genres.  

In [None]:
def numerical_analysis(df,columns):
    """Print summary statistics and plot the distribution of each column.

    For every column in ``columns`` this prints the output of ``describe()``
    and shows a histogram next to a boxplot. After all columns have been
    processed, a seaborn pairplot of ``df[columns]`` is shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the columns to analyse.
    columns : list-like of str
        Column labels to analyse; also used to select the pairplot columns.

    Returns
    -------
    None
        Text goes to standard output and figures are shown via ``plt.show()``.
    """
    for column in columns:
        print(f"Numerical Analysis for column {column}")
        print("Statistical Summary")
        print(df[column].describe())

        # explicit axes instead of the pyplot state machine; the figure handle
        # itself is not needed
        _, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 4))
        # histogram for column
        sns.histplot(df[column], ax=ax_hist)
        ax_hist.set_title(f"Histogram for {column}")
        # boxplot for column
        sns.boxplot(df[column], ax=ax_box)
        ax_box.set_title(f"Boxplot for {column}")
        plt.show()

        print("#" * 120)
    print("*" * 120)
    print("Pairplot")
    sns.pairplot(df[columns])
    plt.show()

In [None]:
numerical_analysis(df_songs,continuous_columns)

# Users & Song Data

In [None]:
# load the dataset

df_users = pd.read_csv(users_data_path)

df_users.head()

In [None]:
# dataset info

df_users.info()

In [None]:
# check for duplicates

df_users.duplicated(subset=["track_id","user_id"]).sum()

**No duplicates in the data**

In [None]:
# check for missing values

df_users.isna().sum()

In [None]:
# unique users in the data

(
    df_users
    .loc[:,"user_id"]
    .nunique()
)

In [None]:
# unique songs in the data

(
    df_users
    .loc[:,"track_id"]
    .nunique()
)

In [None]:
# top 10 most played songs in user data

(
    df_users
    .loc[:,"track_id"]
    .value_counts()
    .head(10)
)


In [None]:
top_10_songs = (
    df_users
    .loc[:,"track_id"]
    .value_counts()
    .head(10)
)

top_10_songs

In [None]:
(
    df_songs
    .loc[df_songs["track_id"].isin(top_10_songs.index.tolist()),:]
)

In [None]:
# most playcounts for songs

top_10_played_songs = (
    df_users.groupby("track_id")['playcount']
    .agg("sum")
    .sort_values(ascending=False)
    .head(10)
)

top_10_played_songs


In [None]:
(
    df_songs
    .loc[df_songs["track_id"].isin(top_10_played_songs.index.tolist()),:]
)

In [None]:
pd.concat([top_10_songs,top_10_played_songs],axis=1)

In [None]:
# most diverse users
# top 10


most_diverse_users = (
                        df_users.groupby("user_id")['track_id']
                        .agg("count")
                        .sort_values(ascending=False)
                        .head(10)
                    )

most_diverse_users

In [None]:
# most playcounts for users
# top 10

most_active_users = (
                        df_users.groupby("user_id")['playcount']
                        .agg("sum")
                        .sort_values(ascending=False)
                        .head(10)
                    )

most_active_users

In [None]:
pd.concat([most_diverse_users,most_active_users],axis=1)