In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Reshape
from tensorflow.keras.optimizers import Adam





In [3]:
# Load the dataset
data = pd.read_csv("Synthetic_Data_With_Spotify_MPD.csv")

# Display basic information about the dataset
print(data.info())
print(data.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35001 entries, 0 to 35000
Data columns (total 32 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   usersha1            35001 non-null  int64  
 1   age                 35001 non-null  int64  
 2   Education           35001 non-null  object 
 3   gender              35001 non-null  object 
 4   Name                35001 non-null  object 
 5   country             35001 non-null  object 
 6   Music               35001 non-null  object 
 7   artname             35001 non-null  object 
 8   featured_artists    15808 non-null  object 
 9   Genre               30578 non-null  object 
 10  plays               35001 non-null  int64  
 11  Artiste Popularity  35001 non-null  float64
 12  Audio Popularity    35001 non-null  float64
 13  Music Acousticness  35001 non-null  float64
 14  Danceability        35001 non-null  float64
 15  Energy              35001 non-null  float64
 16  Key 

In [4]:
# Check for missing values
missing_values = data.isnull().sum()

# Display missing values
print(missing_values)

usersha1                  0
age                       0
Education                 0
gender                    0
Name                      0
country                   0
Music                     0
artname                   0
featured_artists      19193
Genre                  4423
plays                     0
Artiste Popularity        0
Audio Popularity          0
Music Acousticness        0
Danceability              0
Energy                    0
Key                       0
Loudness                  0
Mode                      0
Speechiness               0
Instrumentalness          0
Liveness                  0
Valence                   0
Tempo                     0
Time Signature            0
Track Genre               0
Release Date              0
Explicit                  0
Duration                  0
ID                        0
id_artists                0
Followers                 0
dtype: int64


In [5]:
# Check for the presence of the 'Release Date' column
if 'Release Date' not in data.columns:
    print("Warning: 'Release Date' column is missing from the dataset.")
else:
    # Preprocessing steps
    data_cleaned = data.copy()

    # Handle missing values
    data_cleaned['featured_artists'].fillna('No Featured Artists', inplace=True)
    data_cleaned['Genre'].fillna('Unknown', inplace=True)

    # One-hot encoding categorical variables
    data_encoded = pd.get_dummies(data_cleaned, columns=['Education', 'gender', 'country', 'Genre'], drop_first=True)


In [6]:
data_cleaned.isnull().sum()

usersha1              0
age                   0
Education             0
gender                0
Name                  0
country               0
Music                 0
artname               0
featured_artists      0
Genre                 0
plays                 0
Artiste Popularity    0
Audio Popularity      0
Music Acousticness    0
Danceability          0
Energy                0
Key                   0
Loudness              0
Mode                  0
Speechiness           0
Instrumentalness      0
Liveness              0
Valence               0
Tempo                 0
Time Signature        0
Track Genre           0
Release Date          0
Explicit              0
Duration              0
ID                    0
id_artists            0
Followers             0
dtype: int64

In [7]:
# Function to parse mixed date formats
def parse_dates(date_str):
  try:
      return pd.to_datetime(date_str, format='%Y-%m-%d')
  except ValueError:
      try:
          return pd.to_datetime(date_str, format='%Y-%m')
      except ValueError:
          return pd.to_datetime(date_str, format='%Y')

# Apply date parsing
data_encoded['Parsed Release Date'] = data_encoded['Release Date'].apply(parse_dates)

# Extracting additional features from 'Release Date'
data_encoded['Release Year'] = data_encoded['Parsed Release Date'].dt.year
data_encoded['Release Month'] = data_encoded['Parsed Release Date'].dt.month

# Dropping the original 'Release Date' and intermediate 'Parsed Release Date' columns
data_encoded.drop(columns=['Release Date', 'Parsed Release Date'], inplace=True)

# Inspect the DataFrame after encoding and feature extraction
print(data_encoded.head())


   usersha1  age      Name                                              Music  \
0     83811   16  Danielle                                       Bank Account   
1     83811   16  Danielle  Mo Money Mo Problems (feat. Mase & Puff Daddy)...   
2     83811   16  Danielle                                       Little Talks   
3     13397   17     Angel                                      Wherever I Go   
4     13397   17     Angel                                    Hands To Myself   

                artname                       featured_artists  plays  \
0             21 Savage                             Birdy, Zoé     11   
1  The Notorious B.I.G.                               LUDMILLA   1091   
2   Of Monsters and Men      Ninho, Snoop Dogg, Russ, Paramore    686   
3           OneRepublic  Keith Urban, DJ Khaled, NIKI, MF DOOM    136   
4          Selena Gomez     SAINt JHN, David Bisbal, will.i.am      1   

   Artiste Popularity  Audio Popularity  Music Acousticness  ...  \
0     

In [8]:
data_encoded.head()

Unnamed: 0,usersha1,age,Name,Music,artname,featured_artists,plays,Artiste Popularity,Audio Popularity,Music Acousticness,...,Genre_Underground Rap,Genre_Unknown,Genre_dnb,Genre_hardstyle,Genre_techhouse,Genre_techno,Genre_trance,Genre_trap,Release Year,Release Month
0,83811,16,Danielle,Bank Account,21 Savage,"Birdy, Zoé",11,9289.820129,22.0306,0.676699,...,0,0,0,0,0,0,0,0,2017,9
1,83811,16,Danielle,Mo Money Mo Problems (feat. Mase & Puff Daddy)...,The Notorious B.I.G.,LUDMILLA,1091,5171.977853,25.7399,0.0128,...,1,0,0,0,0,0,0,0,1997,3
2,83811,16,Danielle,Little Talks,Of Monsters and Men,"Ninho, Snoop Dogg, Russ, Paramore",686,2380.280721,26.66,0.0206,...,0,1,0,0,0,0,0,0,2012,1
3,13397,17,Angel,Wherever I Go,OneRepublic,"Keith Urban, DJ Khaled, NIKI, MF DOOM",136,9926.540057,16.9773,0.0915,...,0,1,0,0,0,0,0,0,2016,12
4,13397,17,Angel,Hands To Myself,Selena Gomez,"SAINt JHN, David Bisbal, will.i.am",1,13905.383,20.068,0.552041,...,0,1,0,0,0,0,0,0,2015,10


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [10]:
# Split the data into train, validation, and test sets
train_data, val_and_test_data = train_test_split(data_cleaned, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(val_and_test_data, test_size=0.5, random_state=42)


In [11]:
# Select user demographic features
user_demographic_features = ['age', 'Education', 'gender', 'country']


In [12]:
# Preprocessing pipeline for demographic features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['age']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Education', 'gender', 'country'])
    ])

In [13]:
# Apply the preprocessing pipeline to train, validation, and test data
train_data_demographics = preprocessor.fit_transform(train_data[user_demographic_features])
val_data_demographics = preprocessor.transform(val_data[user_demographic_features])
test_data_demographics = preprocessor.transform(test_data[user_demographic_features])


In [14]:
# Convert to DataFrame for easier handling
train_data_demographics_df = pd.DataFrame(train_data_demographics, columns=preprocessor.get_feature_names_out())
val_data_demographics_df = pd.DataFrame(val_data_demographics, columns=preprocessor.get_feature_names_out())
test_data_demographics_df = pd.DataFrame(test_data_demographics, columns=preprocessor.get_feature_names_out())

Context- aware Content Based Features


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Combine item features into a single text feature
train_data['combined_features'] = train_data[['Music', 'artname', 'featured_artists', 'Genre']].fillna('').agg(' '.join, axis=1)
val_data['combined_features'] = val_data[['Music', 'artname', 'featured_artists', 'Genre']].fillna('').agg(' '.join, axis=1)
test_data['combined_features'] = test_data[['Music', 'artname', 'featured_artists', 'Genre']].fillna('').agg(' '.join, axis=1)

# Vectorize the combined item features
tfidf_vectorizer = TfidfVectorizer()
train_tfidf_matrix = tfidf_vectorizer.fit_transform(train_data['combined_features'])
val_tfidf_matrix = tfidf_vectorizer.transform(val_data['combined_features'])
test_tfidf_matrix = tfidf_vectorizer.transform(test_data['combined_features'])

# Convert TF-IDF matrices to DataFrames for easier handling
train_tfidf_df = pd.DataFrame(train_tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
val_tfidf_df = pd.DataFrame(val_tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
test_tfidf_df = pd.DataFrame(test_tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Combine TF-IDF features with demographic features
train_combined_features_df = pd.concat([train_tfidf_df, train_data_demographics_df], axis=1)
val_combined_features_df = pd.concat([val_tfidf_df, val_data_demographics_df], axis=1)
test_combined_features_df = pd.concat([test_tfidf_df, test_data_demographics_df], axis=1)


Collaborative Filtering


In [16]:
from sklearn.metrics.pairwise import cosine_similarity

# Creating a utility matrix for collaborative filtering
train_data = train_data.drop_duplicates(subset='usersha1', keep='first')

utility_matrix = train_data.pivot(index='usersha1', columns='Music', values='plays').fillna(0)

# Calculating item similarity matrix
item_similarity = cosine_similarity(utility_matrix.T)
item_similarity_df = pd.DataFrame(item_similarity, index=utility_matrix.columns, columns=utility_matrix.columns)

def get_similar_items(item_name, n=5):
    similar_items = item_similarity_df[item_name].sort_values(ascending=False)[1:n+1]
    return similar_items

# Example usage
# print(get_similar_items('Bank Account'))


In [17]:
# print(get_similar_items(''))

KeyError: ''

Content Based Filtering


In [18]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate cosine similarity for content-based filtering with demographics
cosine_sim_with_demographics = cosine_similarity(train_combined_features_df, train_combined_features_df)
cosine_sim_with_demographics_df = pd.DataFrame(cosine_sim_with_demographics, index=train_combined_features_df.index, columns=train_combined_features_df.index)

def get_content_based_recommendations_with_demographics(user_id, n=5):
    user_data = train_data[train_data['usersha1'] == user_id]
    if user_data.empty:
        return []
    user_idx = user_data.index[0]
    sim_scores = list(enumerate(cosine_sim_with_demographics_df[user_idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:n+1]
    item_indices = [i[0] for i in sim_scores]
    return train_data.iloc[item_indices]

# Example usage
# print(get_content_based_recommendations_with_demographics('83811'))


In [19]:
def hybrid_recommendation_with_demographics(user_id, item_name, n=5):
    collaborative_recommendations = pd.Series(get_similar_items(item_name, n))
    content_based_recommendations = pd.Series(get_content_based_recommendations_with_demographics(user_id, n))
    hybrid_recommendations = pd.concat([collaborative_recommendations, content_based_recommendations]).drop_duplicates()
    return hybrid_recommendations.head(n)

In [20]:
print(hybrid_recommendation_with_demographics('83811', 'Bank Account'))


Ready or Not    0.0
dtype: float64


  content_based_recommendations = pd.Series(get_content_based_recommendations_with_demographics(user_id, n))


In [21]:
import numpy as np

class LinUCB:
    def __init__(self, alpha, d):
        self.alpha = alpha
        self.d = d
        self.A = np.identity(d)
        self.b = np.zeros(d)
        
    def select_arm(self, x):
        A_inv = np.linalg.inv(self.A)
        theta = np.dot(A_inv, self.b)
        p = np.dot(theta, x) + self.alpha * np.sqrt(np.dot(x, np.dot(A_inv, x)))
        return p

    def update(self, x, reward):
        self.A += np.outer(x, x)
        self.b += reward * x


In [22]:
class RLRecommender:
    def __init__(self, alpha, d):
        self.model = LinUCB(alpha, d)
        
    def recommend(self, user_context, items_context, n=6):
        scores = [self.model.select_arm(np.concatenate([user_context, item])) for item in items_context]
        item_indices = np.argsort(scores)[-n:]
        return item_indices
    
    def update(self, user_context, item_context, reward):
        self.model.update(np.concatenate([user_context, item_context]), reward)


In [23]:
from gradio import Interface
from gradio.components import Slider, Dropdown, Radio, Textbox, Dataframe

def recommend_songs(age, education, gender, country, favorite_artist, user_name):
    # Preprocess user input
    user_data = pd.DataFrame({
        'age': [age],
        'Education': [education],
        'gender': [gender],
        'country': [country]
    })

    user_data_demographics = preprocessor.transform(user_data)
    user_context = user_data_demographics[0]
    
    # Simulate item context for demonstration
    items_context = train_tfidf_df.values[:100]  # Using first 100 items for demonstration
    
    # Get recommendations using RL model
    rl_recommender = RLRecommender(alpha=0.1, d=user_context.shape[0] + items_context.shape[1])
    recommended_indices = rl_recommender.recommend(user_context, items_context)
    recommended_songs = train_data.iloc[recommended_indices]
    
    return recommended_songs[['Music', 'artname', 'Genre', 'plays']]

# Gradio interface
interface = Interface(
    fn=recommend_songs,
    inputs=[
        Slider(minimum=10, maximum=80, step=1, default=25, label="Age"),
        Dropdown(choices=list(train_data['Education'].unique()), label="Education"),
        Radio(choices=['Male', 'Female'], label="Gender"),
        Dropdown(choices=list(train_data['country'].unique()), label="Country"),
        Textbox(label="Favorite Artist"),
        Textbox(label="Username")
        
    ],
    outputs=Dataframe(headers=['Music', 'Artist', 'Genre', 'Plays'], type='pandas', label="Recommended Songs")
)

interface.launch(debug=True)

  Slider(minimum=10, maximum=80, step=1, default=25, label="Age"),


Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.


IMPORTANT: You are using gradio version 3.50.0, however version 4.29.0 is available, please upgrade.
--------
Keyboard interruption in main thread... closing server.
