In [1]:
import os
from IPython.display import display
import pandas as pd
from scipy import stats
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.cluster import KMeans
# from preprocessing import load_data
# from ydata_profiling import ProfileReport
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.feature_selection import RFE, VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from imblearn.over_sampling import SMOTE
%matplotlib inline

In [10]:
# df = load_data(file_path='../data/dataset.csv')
PATH = os.getcwd()
df = pd.read_csv(os.path.join(PATH, "data/music_data.csv"))
# df.drop(["music_id"], axis=1, inplace=True)
df.head()

Unnamed: 0,user_id,age,gender,music,artist_name,featured_artists,genre,plays,duration,music_id,...,mode,speechiness,instrumentalness,liveness,valence,tempo,time_signature,explicit,rating,age_group
0,83811,16,F,Bank Account,21 Savage,"Birdy, Zoé",Dark Trap,11,3.67,2fQrGHiQOvpL9UgPvtYy6G,...,0,0.351,7e-06,0.0871,0.376,75.016,4.0,True,1,Gen Z
1,83811,16,F,Little Talks,Of Monsters and Men,"Ninho, Snoop Dogg, Russ, Paramore",Unknown,686,4.44,2ihCaVdNZmnHZWt0fvAM7B,...,1,0.0322,0.0,0.2845,0.413,101.8905,4.0,False,2,Gen Z
2,13397,17,M,Wherever I Go,OneRepublic,"Keith Urban, DJ Khaled, NIKI, MF DOOM",Unknown,136,2.83,46jLy47W8rkf8rEX04gMKB,...,1,0.0894,0.043432,0.273133,0.404183,120.8505,4.0,False,1,Gen Z
3,70645,44,M,No New Friends,DJ Khaled,"The xx, LIT killah",Pop,230,5.14,5oVlbbiKGdGeZkWCFy0mqk,...,1,0.167,0.0,0.0966,0.4,109.283,4.0,False,1,Gen X
4,70645,44,M,Dreams,Campsite Dream,,Country,391,3.2,1SNoSoQ3JZldOhzBY9gw0n,...,1,0.078837,0.235527,0.180354,0.380815,120.488479,4.0,False,2,Gen X


In [11]:
# Define presets for data visualization
def presets():
    plt.rc('figure', autolayout = True)

    plt.rc('axes',
        labelsize = 'large',
        labelweight = 'bold',
        titlesize = 14,
        titleweight = 'bold',
        titlepad = 10
    )

    %config InlineBackend.figure_format = 'retina'
    pd.options.display.max_rows = 10
    sns.set_style('darkgrid')
    # Set preset for darkmode for matplotlib and seaborn visualizations
    plt.style.use('dark_background')

    warnings.filterwarnings(action = 'ignore', category = UserWarning)
    return

presets()



In [12]:
def check_missing_value():
    # show number of missing values as a dataframe by column
    missing_values = df.isnull().sum()
    missing_values = missing_values[missing_values > 0]
    missing_values = missing_values.sort_values(ascending=False)
    missing_values = missing_values.reset_index()
    missing_values.columns = ["Feature", "Missing Values"]
    return missing_values


check_missing_value()

Unnamed: 0,Feature,Missing Values
0,featured_artists,10356


In [13]:
# Fill featured_artists missing values with "Unknown"
# Fill featured_artists missing values with "Unknown"
df["featured_artists"].fillna("None", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["featured_artists"].fillna("None", inplace=True)


In [14]:
check_missing_value()

Unnamed: 0,Feature,Missing Values


In [15]:
import numpy as np

# User features
df["log_plays"] = np.log1p(df["plays"])  # Normalize plays
df["user_activity"] = df.groupby("user_id")["plays"].transform("sum")

# Item features
df["artist_popularity"] = df.groupby("artist_name")["plays"].transform("sum")
df["song_popularity"] = df.groupby("music")["plays"].transform("sum")

In [16]:
df.head(2)

Unnamed: 0,user_id,age,gender,music,artist_name,featured_artists,genre,plays,duration,music_id,...,valence,tempo,time_signature,explicit,rating,age_group,log_plays,user_activity,artist_popularity,song_popularity
0,83811,16,F,Bank Account,21 Savage,"Birdy, Zoé",Dark Trap,11,3.67,2fQrGHiQOvpL9UgPvtYy6G,...,0.376,75.016,4.0,True,1,Gen Z,2.484907,697,14709,6518
1,83811,16,F,Little Talks,Of Monsters and Men,"Ninho, Snoop Dogg, Russ, Paramore",Unknown,686,4.44,2ihCaVdNZmnHZWt0fvAM7B,...,0.413,101.8905,4.0,False,2,Gen Z,6.532334,697,6121,6455


In [18]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download NLTK data
# nltk.download('stopwords')
nltk.download("punkt")


# Preprocessing function
def preprocess_text(text):
    if isinstance(text, str):
        # Lowercase
        text = text.lower()
        # Tokenization
        tokens = nltk.word_tokenize(text)
        # Remove stop words
        stop_words = set(stopwords.words("english"))
        tokens = [word for word in tokens if word not in stop_words]
        # Stemming
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]
        return " ".join(tokens)
    elif isinstance(text, list):
        return [preprocess_text(artist) for artist in text]
    else:
        return text


# Apply preprocessing to relevant columns
df["music_processed"] = df["music"].apply(preprocess_text)
df["artist_name_processed"] = df["artist_name"].apply(preprocess_text)
df["featured_artists_processed"] = df["featured_artists"].apply(preprocess_text)

df.head()

[nltk_data] Downloading package punkt to
[nltk_data]     /home/jnopareboateng/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,user_id,age,gender,music,artist_name,featured_artists,genre,plays,duration,music_id,...,explicit,rating,age_group,log_plays,user_activity,artist_popularity,song_popularity,music_processed,artist_name_processed,featured_artists_processed
0,83811,16,F,Bank Account,21 Savage,"Birdy, Zoé",Dark Trap,11,3.67,2fQrGHiQOvpL9UgPvtYy6G,...,True,1,Gen Z,2.484907,697,14709,6518,bank account,21 savag,"birdi , zoé"
1,83811,16,F,Little Talks,Of Monsters and Men,"Ninho, Snoop Dogg, Russ, Paramore",Unknown,686,4.44,2ihCaVdNZmnHZWt0fvAM7B,...,False,2,Gen Z,6.532334,697,6121,6455,littl talk,monster men,"ninho , snoop dogg , russ , paramor"
2,13397,17,M,Wherever I Go,OneRepublic,"Keith Urban, DJ Khaled, NIKI, MF DOOM",Unknown,136,2.83,46jLy47W8rkf8rEX04gMKB,...,False,1,Gen Z,4.919981,136,14886,137,wherev go,onerepubl,"keith urban , dj khale , niki , mf doom"
3,70645,44,M,No New Friends,DJ Khaled,"The xx, LIT killah",Pop,230,5.14,5oVlbbiKGdGeZkWCFy0mqk,...,False,1,Gen X,5.442418,621,15533,2962,new friend,dj khale,"xx , lit killah"
4,70645,44,M,Dreams,Campsite Dream,,Country,391,3.2,1SNoSoQ3JZldOhzBY9gw0n,...,False,2,Gen X,5.971262,621,901,3614,dream,campsit dream,none


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming df is your pandas DataFrame with preprocessed text columns

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words="english")

# Fit and transform the text columns
tfidf_matrix = tfidf_vectorizer.fit_transform(df["music_processed"])

# Get the feature names
tfidf_features = tfidf_vectorizer.get_feature_names_out()

print(tfidf_matrix.shape)
print(tfidf_features)

(19974, 4881)
['000' '00000' '10' ... 'étai' 'été' '멘붕']


In [20]:
from scipy.sparse import csr_matrix

# Convert TF-IDF matrix to a dense array
tfidf_array = tfidf_matrix.toarray()

# Create new columns for TF-IDF features
for i, feature in enumerate(tfidf_features):
    df[f"music_tfidf_{feature}"] = tfidf_array[:, i]

  df[f'music_tfidf_{feature}'] = tfidf_array[:, i]
  df[f'music_tfidf_{feature}'] = tfidf_array[:, i]
  df[f'music_tfidf_{feature}'] = tfidf_array[:, i]
  df[f'music_tfidf_{feature}'] = tfidf_array[:, i]
  df[f'music_tfidf_{feature}'] = tfidf_array[:, i]
  df[f'music_tfidf_{feature}'] = tfidf_array[:, i]
  df[f'music_tfidf_{feature}'] = tfidf_array[:, i]
  df[f'music_tfidf_{feature}'] = tfidf_array[:, i]
  df[f'music_tfidf_{feature}'] = tfidf_array[:, i]
  df[f'music_tfidf_{feature}'] = tfidf_array[:, i]
  df[f'music_tfidf_{feature}'] = tfidf_array[:, i]
  df[f'music_tfidf_{feature}'] = tfidf_array[:, i]
  df[f'music_tfidf_{feature}'] = tfidf_array[:, i]
  df[f'music_tfidf_{feature}'] = tfidf_array[:, i]
  df[f'music_tfidf_{feature}'] = tfidf_array[:, i]
  df[f'music_tfidf_{feature}'] = tfidf_array[:, i]
  df[f'music_tfidf_{feature}'] = tfidf_array[:, i]
  df[f'music_tfidf_{feature}'] = tfidf_array[:, i]
  df[f'music_tfidf_{feature}'] = tfidf_array[:, i]
  df[f'music_tfidf_{feature}'] 

In [21]:
df.columns

Index(['user_id', 'age', 'gender', 'music', 'artist_name', 'featured_artists',
       'genre', 'plays', 'duration', 'music_id',
       ...
       'music_tfidf_zillionair', 'music_tfidf_zimbabw', 'music_tfidf_zion',
       'music_tfidf_zombi', 'music_tfidf_zone', 'music_tfidf_zopilota',
       'music_tfidf_équinox', 'music_tfidf_étai', 'music_tfidf_été',
       'music_tfidf_멘붕'],
      dtype='object', length=4914)

In [22]:
from sklearn.preprocessing import MinMaxScaler

# Normalize the audio features
scaler = MinMaxScaler()
audio_features = [
    "acousticness",
    "danceability",
    "duration",
    "energy",
    "key",
    "loudness",
    "mode",
    "speechiness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
    "time_signature",
    "explicit",
]

df[audio_features] = scaler.fit_transform(df[audio_features])