### Notebook: Exploring various Recommendation Approaches

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
df = pd.read_csv("../data/transformed_user_track_combined_data.csv")
df.head()

Unnamed: 0,track_id,user_id,playcount,name,artist,tags,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,TRIRLYL128F42539D1,user_1,1,Nothing From Nothing,Billy Preston,"soul, funk, piano, 70s, oldies",RnB,0.759,0.834,9,-5.206,0,0.0276,0.585,0.055,0.331,0.939,102.792,4
1,TRFUPBA128F934F7E1,user_1,1,Paper Gangsta,Lady Gaga,"electronic, pop, female_vocalists, dance, pian...",Pop,0.823,0.53,10,-9.344,0,0.0579,0.804,0.782,0.105,0.8,100.002,4
2,TRTUCUY128F92E1D24,user_1,1,Stacked Actors,Foo Fighters,"rock, alternative, alternative_rock, hard_rock...",Rock,0.502,0.934,7,-5.145,1,0.05,0.00203,0.00102,0.234,0.67,134.898,4
3,TRHDDQG12903CB53EE,user_1,1,Heaven's Gonna Burn Your Eyes,Thievery Corporation,"electronic, female_vocalists, ambient, chillou...",Electronic,0.516,0.337,9,-9.812,1,0.0286,0.635,0.521,0.0637,0.323,152.411,4
4,TRUTULC128F4293712,user_1,1,Gimme Stitches,Foo Fighters,"rock, alternative_rock, hard_rock, grunge",Rock,0.538,0.958,2,-4.423,1,0.068,0.00212,0.00215,0.241,0.541,111.842,4


In [3]:
sdf = df.sample(10000)
sdf.head()

Unnamed: 0,track_id,user_id,playcount,name,artist,tags,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
2110623,TRUTULC128F4293712,user_323022,2,Gimme Stitches,Foo Fighters,"rock, alternative_rock, hard_rock, grunge",Rock,0.538,0.958,2,-4.423,1,0.068,0.00212,0.00215,0.241,0.541,111.842,4
2843660,TRUVVBT128E07824BB,user_436008,1,In the Warm Room,Kate Bush,"rock, alternative, female_vocalists, 70s",Rock,0.484,0.0338,2,-18.252,0,0.0397,0.986,7.3e-05,0.0712,0.211,79.239,1
2041201,TRPCEKB128F92C73A2,user_312202,1,Shempi,Ratatat,"electronic, indie, instrumental, american, 00s...",Electronic,0.61,0.824,10,-5.229,1,0.0431,0.00515,0.919,0.232,0.509,123.46,4
1645049,TRRZVKM128F14A94E3,user_250824,2,Cemetry Gates,The Smiths,"rock, alternative, indie, pop, alternative_roc...",Rock,0.537,0.655,7,-12.356,1,0.0323,0.0978,0.0,0.128,0.667,105.232,4
3484744,TRHSMXC12903CCD313,user_534048,6,Sinister Kid,The Black Keys,"rock, indie, blues, american, psychedelic_rock...",Rock,0.764,0.829,9,-7.117,1,0.0719,0.154,0.12,0.379,0.933,139.608,4


### Recommendation Class

In [4]:
import numpy as np
import pandas

# Class for a popularity-based recommender system
class popularity_recommender_py():
    def __init__(self):
        self.train_data = None
        self.user_id = None
        self.item_id = None
        self.popularity_recommendations = None
        
    # Build the popularity-based recommender system model
    def create(self, train_data, user_id, item_id):
        self.train_data = train_data
        self.user_id = user_id
        self.item_id = item_id

        # Count the number of unique users for each song to determine its popularity score
        train_data_grouped = train_data.groupby([self.item_id]).agg({self.user_id: 'count'}).reset_index()
        train_data_grouped.rename(columns = {'user_id': 'score'}, inplace=True)
    
        # Sort the songs based on their popularity score
        train_data_sort = train_data_grouped.sort_values(['score', self.item_id], ascending=[0,1])
    
        # Assign a rank to each song based on its popularity score
        train_data_sort['Rank'] = train_data_sort['score'].rank(ascending=0, method='first')
        
        # Keep the top 10 most popular songs
        self.popularity_recommendations = train_data_sort.head(10)

    # Recommend popular songs to a user
    def recommend(self, user_id):    
        user_recommendations = self.popularity_recommendations.copy()
        
        # Add a column indicating which user the recommendations are for
        user_recommendations['user_id'] = user_id
    
        # Move the user_id column to the front
        cols = user_recommendations.columns.tolist()
        cols = cols[-1:] + cols[:-1]
        user_recommendations = user_recommendations[cols]
        
        return user_recommendations

In [5]:
sdf.head()

Unnamed: 0,track_id,user_id,playcount,name,artist,tags,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
2110623,TRUTULC128F4293712,user_323022,2,Gimme Stitches,Foo Fighters,"rock, alternative_rock, hard_rock, grunge",Rock,0.538,0.958,2,-4.423,1,0.068,0.00212,0.00215,0.241,0.541,111.842,4
2843660,TRUVVBT128E07824BB,user_436008,1,In the Warm Room,Kate Bush,"rock, alternative, female_vocalists, 70s",Rock,0.484,0.0338,2,-18.252,0,0.0397,0.986,7.3e-05,0.0712,0.211,79.239,1
2041201,TRPCEKB128F92C73A2,user_312202,1,Shempi,Ratatat,"electronic, indie, instrumental, american, 00s...",Electronic,0.61,0.824,10,-5.229,1,0.0431,0.00515,0.919,0.232,0.509,123.46,4
1645049,TRRZVKM128F14A94E3,user_250824,2,Cemetry Gates,The Smiths,"rock, alternative, indie, pop, alternative_roc...",Rock,0.537,0.655,7,-12.356,1,0.0323,0.0978,0.0,0.128,0.667,105.232,4
3484744,TRHSMXC12903CCD313,user_534048,6,Sinister Kid,The Black Keys,"rock, indie, blues, american, psychedelic_rock...",Rock,0.764,0.829,9,-7.117,1,0.0719,0.154,0.12,0.379,0.933,139.608,4


In [6]:
sdf['song'] = sdf['name'].map(str) + " - " + sdf['artist']

### Now let group the songs based on the listen count & percentage to show the most popular songs

In [7]:
grouped_songs = sdf.groupby(['song']).agg({'playcount': 'count'}).reset_index()
grouped_sum = grouped_songs['playcount'].sum()
grouped_songs['percentage']  = grouped_songs['playcount'].div(grouped_sum)*100
grouped_songs.sort_values(['playcount', 'song'], ascending = [0,1]).head()

Unnamed: 0,song,playcount,percentage
2828,Revelry - Kings of Leon,123,1.23
150,Alejandro - Lady Gaga,70,0.7
523,Bring Me To Life - Katherine Jenkins,45,0.45
1488,Hey There Delilah - Plain White T's,45,0.45
1453,Heartbreak Warfare - John Mayer,44,0.44


In [8]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(sdf, test_size = 0.20, random_state=0)
train_data.head(5)

Unnamed: 0,track_id,user_id,playcount,name,artist,tags,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,song
1406510,TRKUMGM128F424F0DB,user_214211,2,Before Tomorrow Comes,Alter Bridge,"rock, alternative_rock, hard_rock",Rock,0.345,0.975,3,-3.243,1,0.132,8.3e-05,1e-05,0.217,0.264,160.093,4,Before Tomorrow Comes - Alter Bridge
6178126,TRGONOF128F4281C3C,user_946782,1,The Man Who Would Be King,The Libertines,"rock, indie, indie_rock, british, post_punk, b...",Rock,0.471,0.655,4,-5.857,0,0.0306,0.414,0.000356,0.164,0.542,117.654,4,The Man Who Would Be King - The Libertines
5714327,TRPUGUW128F426BF6F,user_875334,1,He Doesn't Know Why,Fleet Foxes,"rock, alternative, indie, folk, indie_rock, am...",Rock,0.496,0.558,1,-5.535,1,0.0302,0.213,0.000302,0.344,0.436,93.989,4,He Doesn't Know Why - Fleet Foxes
4091176,TRCKWGF12903CD2DCD,user_626267,1,Never Let You Go,Third Eye Blind,"rock, alternative, pop, alternative_rock, 90s,...",Pop,0.729,0.94,4,-4.933,1,0.0382,0.0845,0.000262,0.0963,0.966,113.804,4,Never Let You Go - Third Eye Blind
2894296,TRUAMQX128F428B49D,user_443900,1,Supply and Demand,The Hives,"rock, indie, punk, punk_rock, swedish",Rock,0.527,0.787,9,-6.574,0,0.152,0.373,0.0,0.224,0.714,149.606,4,Supply and Demand - The Hives


### Popularity-Based Recommendation

#### Main Idea
- Recommend items that are popular across all users.

#### Key Tasks
- **Count User IDs**: Calculate the count of user_ids for each unique song to determine the recommendation score.
- **Sort Songs**: Rank songs based on their popularity scores.
- **Top-N List**: Retrieve the top 10 most popular songs.

#### Summary
- Focuses on the most listened-to songs, providing a simple yet effective recommendation system that doesn't account for individual user preferences.

In [48]:
# Calculate the popularity score for each song
popularity_df = sdf.groupby('track_id')['user_id'].count().reset_index()
popularity_df.columns = ['track_id', 'popularity_score']
popularity_df = popularity_df.sort_values(by='popularity_score', ascending=False)

# Get top 10 recommendations
top_10_recommendations = popularity_df.head(10)['track_id'].unique().tolist()

print("Top 10 Popular Songs:\n")

for i in top_10_recommendations:
    print(sdf[sdf.track_id == i].song.values[0])

Top 10 Popular Songs:

Revelry - Kings of Leon
Alejandro - Lady Gaga
Bring Me To Life - Katherine Jenkins
Hey There Delilah - Plain White T's
Heartbreak Warfare - John Mayer
Float On - Modest Mouse
Imma Be - Black Eyed Peas
In My Place - Coldplay
Party In The U.S.A. - The Barden Bellas
U Smile - Justin Bieber



### User-Based Collaborative Filtering

#### Collaborative Filtering Model

**Main Idea**:
- Use the preferences of similar users to recommend items.

**Output Types**:
- **Numerical Prediction**: Indicates the degree to which the current user will like or dislike an item.
- **Top-N List**: A list of recommended items excluding those already purchased by the user.

#### User-Based Algorithms

**Key Concept**:
- Compute predictions based on the similarity between users.

**Cosine Similarity**:
- Measures similarity between two users' rating vectors.
- Values range from 0 to 1, with values near 1 indicating strong similarity.

#### Summary
- Leverages user similarities to suggest items, personalized but can suffer from scalability issues with a large number of users.


In [63]:
from sklearn.metrics.pairwise import cosine_similarity

# Create User-Item interaction matrix
user_item_matrix = sdf.pivot(index='user_id', columns='track_id', values='playcount').fillna(0)

# Calculate user similarity
user_similarity = cosine_similarity(user_item_matrix)
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)

def recommend_songs_user_based(user_id, user_item_matrix, user_similarity_df, num_recommendations=10):
    similar_users = user_similarity_df[user_id].sort_values(ascending=False).index[1:]  # Exclude self
    similar_users_songs = user_item_matrix.loc[similar_users].sum(axis=0)
    similar_users_songs = similar_users_songs[similar_users_songs > 0].sort_values(ascending=False)
    
    user_songs = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] > 0].index.tolist()
    recommendations = similar_users_songs.drop(user_songs).head(num_recommendations).index.tolist()
    return recommendations

In [68]:
user_ids = sdf.user_id.unique().tolist()

user_id = user_ids[3]
recommended_tracks = recommend_songs_user_based(user_id, user_item_matrix, user_similarity_df)

recommended_songs = sdf[sdf.track_id.isin(recommended_tracks)].song.unique().tolist()
print("Recommended Songs (User-Based):\n")

for i in recommended_songs:
    print(i)

Recommended Songs (User-Based):

Hey There Delilah - Plain White T's
Alejandro - Lady Gaga
Heartbreak Warfare - John Mayer
In My Place - Coldplay
Revelry - Kings of Leon
Bring Me To Life - Katherine Jenkins
The Big Gundown - The Prodigy
Ironside - Nine Black Alps
Undercovers On - Rival Schools
Alcohaulin' Ass - Hellyeah


### Item-Based Collaborative Filtering

#### Collaborative Filtering Model

**Main Idea**:
- Utilize past behaviors or opinions of an existing user community.
- Predict items the current user will likely enjoy.

**Output Types**:
- **Numerical Prediction**: Indicates the degree to which the current user will like or dislike a certain item.
- **Top-N List**: A list of recommended items excluding those already purchased by the user.

#### Item-Based Algorithms

**Key Concept**:
- Compute predictions based on the similarity between items rather than between users.

**Cosine Similarity**:
- Standard metric for item-based recommendations.
- Measures similarity between two n-dimensional vectors based on the angle between them.
- Values range from 0 to 1, with values near 1 indicating strong similarity.

**Adjusted Cosine Measure**:
- Accounts for differences in users' average rating behavior.
- Values range from -1 to +1, similar to the Pearson measure.
- Provides more accurate similarity assessments by subtracting user average from ratings.

#### Summary
- Uses item similarities for recommendations, often providing more stable and scalable recommendations compared to user-based filtering.



In [69]:
from sklearn.metrics.pairwise import cosine_similarity


### Create a User-Item Interaction Matrix
user_item_matrix = sdf.pivot(index='user_id', columns='track_id', values='playcount').fillna(0)

### Calculate Item Similarity
item_similarity = cosine_similarity(user_item_matrix.T)
item_similarity_df = pd.DataFrame(item_similarity, index=user_item_matrix.columns, columns=user_item_matrix.columns)


def recommend_songs(user_id, user_item_matrix, item_similarity_df, dataset_df, num_recommendations=10):
    user_songs = user_item_matrix.loc[user_id]
    user_songs = user_songs[user_songs > 0].index.tolist()
    
    scores = item_similarity_df[user_songs].sum(axis=1)
    scores = scores.drop(user_songs)  # Exclude already listened songs
    
    top_songs = scores.sort_values(ascending=False).head( ).index.tolist()

    
    return top_songs


user_id = user_ids[2]
recommended_tracks = recommend_songs(user_id, user_item_matrix, item_similarity_df, sdf)

recommended_songs = sdf[sdf.track_id.isin(recommended_tracks)].song.unique().tolist()
print("Recommended songs:\n")

for i in recommended_songs:
    print(i)

Recommended songs:

Many Shades of Black - The Raconteurs
People Are Crazy - Billy Currington
Through The Backyards - Au Revoir Simone
Auburn and Ivory - Beach House
Weather Storm - Craig Armstrong


In [71]:
user_id = user_ids[16]
recommended_tracks = recommend_songs(user_id, user_item_matrix, item_similarity_df, sdf)

recommended_songs = sdf[sdf.track_id.isin(recommended_tracks)].song.unique().tolist()
print("Recommended songs:\n")

for i in recommended_songs:
    print(i)

Recommended songs:

Watch The World - Box Car Racer
Many Shades of Black - The Raconteurs
All Play Dead - Nevermore
Auburn and Ivory - Beach House
Weather Storm - Craig Armstrong


### Summary of Approaches

1. **Popularity-Based Recommendation**:
   - Recommends most popular items.
   - Simple but lacks personalization.

2. **User-Based Collaborative Filtering**:
   - Recommends items based on similar users.
   - Personalized but can struggle with scalability.

3. **Item-Based Collaborative Filtering**:
   - Recommends items based on item similarities.
   - More stable and scalable compared to user-based methods.