In [1]:
import findspark
findspark.init()
import pyspark
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from pyspark import SparkConf, SparkContext
from pyspark.mllib.recommendation import ALS, Rating
from collections import defaultdict
from surprise import SVD, Dataset, Reader
from ipywidgets import widgets, interact_manual, Layout, interact
from IPython.display import display
from urllib.request import urlopen
import matplotlib.pyplot as plt

# Define function 
def loadAnimeNames():
    """Build the anime lookup table from ``AnimeList.csv``.

    Every line is split on commas. The first field, converted to ``int``,
    becomes the key; the value is a two-element list holding the second and
    third fields (used elsewhere in this notebook as the display title and
    the poster URL). Non-ASCII bytes are silently dropped while reading.

    Returns:
        dict: ``{anime_id: [second_field, third_field]}``; when an id occurs
        on several lines the last one wins.
    """
    lookup = {}
    with open("AnimeList.csv", encoding='ascii', errors="ignore") as handle:
        for columns in (row.split(',') for row in handle):
            entry = [columns[1], columns[2]]
            lookup[int(columns[0])] = entry
    return lookup

def loadUserNames():
    """Build the user-id -> username table from ``animelists_als2.csv``.

    Every line is split on commas; the first field (converted to ``int``) is
    the key and the second field is the value. Non-ASCII bytes are silently
    dropped while reading.

    Returns:
        dict: ``{user_id: username}``; later lines overwrite earlier ones
        that carry the same id.
    """
    id_to_name = {}
    with open("animelists_als2.csv", encoding='ascii', errors="ignore") as handle:
        for columns in (row.split(',') for row in handle):
            name = columns[1]
            id_to_name[int(columns[0])] = name
    return id_to_name

def parseline(line):
    """Extract the rating triple from one comma-separated line.

    The line is expected to hold at least four fields; the second one
    (the username) is skipped.

    Args:
        line (str): one raw text line.

    Returns:
        tuple: ``(userid, animeid, score)`` taken from fields 0, 2 and 3,
        all left as unconverted strings.

    Raises:
        IndexError: if the line has fewer than four comma-separated fields.
    """
    parts = line.split(',')
    return (parts[0], parts[2], parts[3])

# pyspark set-up
# "local[*]" runs Spark inside this machine using all available cores.
conf = SparkConf().setMaster("local[*]").setAppName("AnimeRecommendationsALS")
sc = SparkContext(conf = conf)
# Relative path: the checkpoint directory is created under the working directory.
sc.setCheckpointDir('checkpoint')

# Lookup tables reused by every model below and by the widget callbacks:
#   nameDict:     anime id -> [second CSV field, third CSV field]
#   usernameDict: user id  -> username
nameDict = loadAnimeNames()
usernameDict = loadUserNames()


# Build the recommendation model using Alternating Least Squares
print("Training ALS model...")
lines = sc.textFile("animelists_als2.csv")
parsedlines = lines.map(parseline)
# parseline yields strings, so cast to the (int user, int product, float rating)
# form that Rating expects; cache() because the RDD is reused for the ensemble.
ratings = parsedlines.map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()
rank = 5
numIterations = 20
# NOTE(review): no seed is passed to ALS.train, so the factorisation (and the
# resulting recommendations) may differ between runs -- confirm if that matters.
model = ALS.train(ratings, rank, numIterations)

# Load the Neural Network model's predictions.
# Nothing is trained in this cell: the scores are read from a CSV that is
# presumably produced by a separate training run (TODO confirm its origin).
# Columns read later in this notebook: "username", "anime_id", "predicted_score".
print("Training Neural Network model...")
nn_predict = pd.read_csv("NeuralNets_pred.csv")

def get_nn_top_n(predictions, n=5, score_col="predicted_score"):
    """Return each user's ``n`` best-scored anime ids.

    Args:
        predictions (pandas.DataFrame): must contain the columns
            ``"username"``, ``"anime_id"`` and ``score_col``.
        n (int): maximum number of ids kept per user.
        score_col (str): name of the column to rank by. Defaults to
            ``"predicted_score"``, the column the original code ranked by.

    Returns:
        collections.defaultdict: ``{username: [anime_id, ...]}`` with ids
        ordered from highest to lowest score. Ties keep their input order
        because the sort is stable.
    """
    top_n = defaultdict(list)

    # Walk the three columns in lock-step. Doing a positional ``.iloc[i]``
    # lookup on each Series for every row is dramatically slower on the
    # million-row input this notebook feeds in.
    rows = zip(
        predictions["username"].values,
        predictions["anime_id"].values,
        predictions[score_col].values,
    )
    for username, anime_id, score in tqdm(rows, total=len(predictions)):
        top_n[username].append((anime_id, score))

    # Only existing keys are reassigned, so mutating during iteration is safe.
    for username, scored in top_n.items():
        scored.sort(key=lambda pair: pair[1], reverse=True)
        top_n[username] = [anime_id for anime_id, _ in scored[:n]]
    return top_n

NN_recommend_dict = get_nn_top_n(nn_predict)

# Re-key the NN recommendations from username to numeric user id so they can
# be looked up the same way as the other models. A single pass over
# usernameDict with a dict membership test replaces the previous nested loop,
# which compared every NN username against every known user. User ids that
# share a username still all receive that username's list.
nn_recommendationdict = {}
for userid, username in tqdm(usernameDict.items()):
    if username in NN_recommend_dict:
        nn_recommendationdict[userid] = NN_recommend_dict[username]

# Build the recommendation model using SVD
print("Training SVD model...")
svd_df = pd.read_csv('animelists_svd2.csv')
# Ratings are declared to lie on a 1-10 scale.
reader = Reader(rating_scale=(1, 10))
svd_data = Dataset.load_from_df(svd_df, reader)
#svd_train_data, testset = train_test_split(svd_data, test_size=.20)
# From here on svd_data is the full trainset, no longer the Dataset loaded above.
svd_data = svd_data.build_full_trainset()
# The "test" set is derived from the trainset itself, so `predictions` below
# only covers (user, item) pairs that already have a rating.
# NOTE(review): this means SVD top-N lists are drawn from anime the user has
# already rated -- confirm that is intended.
svd_test_data = svd_data.build_testset()
algo = SVD(n_factors=5)
algo.fit(svd_data)
predictions = algo.test(svd_test_data)

def get_svd_top_n(predictions, n=10):
    """Group SVD predictions per user and keep the ``n`` highest estimates.

    Args:
        predictions: iterable of 5-tuples unpacked as
            ``(uid, iid, true_rating, estimate, details)``.
        n (int): maximum number of predictions kept per user. Earlier
            versions ignored this argument and always kept five.

    Returns:
        collections.defaultdict: ``{uid: [(iid, estimate, true_rating), ...]}``
        sorted by estimate, highest first.
    """
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in tqdm(predictions):
        top_n[uid].append((iid, est, true_r))

    # Only existing keys are reassigned, so mutating during iteration is safe.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

# Ask for exactly five items: the recommendation UI prints "Top 5" and draws
# five poster axes, and get_svd_top_n now honours `n`.
top_n = get_svd_top_n(predictions, n=5)
# Each value is a one-element list wrapping the list of item ids; the UI reads
# it back as svd_recommendationdict[userID][0], so that shape is kept.
svd_recommendationdict = defaultdict(list)
for uid, user_ratings in top_n.items():
    svd_recommendationdict[uid].append([iid for (iid,_,_) in user_ratings])

# Ensemble
print("Training Ensemble Model...")
# Each SVD prediction carries both the true rating (r_ui) and the model's
# estimate (est). The ensemble must average *estimates*; selecting the first
# three columns positionally picked up r_ui and leaked the ground truth into
# the mean, so the columns are now selected by name.
svd_prediction = pd.DataFrame(predictions)[['uid', 'iid', 'est']].copy()
svd_prediction['username'] = pd.Series([usernameDict[i] for i in svd_prediction['uid']])
# Score the ALS model on the (user, product) pairs it was trained on.
als_set = ratings.map(lambda t: (t[0], t[1]))
als_pred = model.predictAll(als_set).collect()
als_prediction = pd.DataFrame(als_pred)
als_prediction['username'] = pd.Series([usernameDict[i] for i in als_prediction['user']])
# Inner joins: only (username, anime) pairs scored by all three models survive.
nn_als_pred = nn_predict.merge(als_prediction, left_on = ['username','anime_id'], right_on = ['username','product'], how='inner')
merged_pred = nn_als_pred.merge(svd_prediction, left_on=['username', 'anime_id'], right_on = ['username', 'iid'], how ='inner')
# Unweighted mean of the NN score, the ALS prediction and the SVD estimate.
merged_pred['mean_rating'] = (merged_pred['predicted_score']+merged_pred['rating']+merged_pred['est'])/3

def get_es_top_n(predictions, n=5, score_col="mean_rating"):
    """Return each user's ``n`` best anime ids according to the ensemble score.

    Args:
        predictions (pandas.DataFrame): must contain the columns
            ``"username"``, ``"anime_id"`` and ``score_col``.
        n (int): maximum number of ids kept per user.
        score_col (str): name of the column to rank by. Defaults to
            ``"mean_rating"``, the column the original code ranked by.

    Returns:
        collections.defaultdict: ``{username: [anime_id, ...]}`` with ids
        ordered from highest to lowest score. Ties keep their input order
        because the sort is stable.
    """
    top_n = defaultdict(list)

    # Iterate the columns in lock-step rather than doing a positional
    # ``.iloc[i]`` lookup on each Series for every row, which is far slower.
    rows = zip(
        predictions["username"].values,
        predictions["anime_id"].values,
        predictions[score_col].values,
    )
    for username, anime_id, score in tqdm(rows, total=len(predictions)):
        top_n[username].append((anime_id, score))

    # Only existing keys are reassigned, so mutating during iteration is safe.
    for username, scored in top_n.items():
        scored.sort(key=lambda pair: pair[1], reverse=True)
        top_n[username] = [anime_id for anime_id, _ in scored[:n]]
    return top_n

es_recommend_dict = get_es_top_n(merged_pred)
# Re-key the ensemble recommendations from username to numeric user id.
# One pass over usernameDict with a dict membership test replaces the nested
# loop that compared every ensemble username against every known user.
es_recommendationdict = {}
for userid, username in tqdm(usernameDict.items()):
    if username in es_recommend_dict:
        es_recommendationdict[userid] = es_recommend_dict[username]

# Clustering
# Hard-coded lists of five anime ids, one list per persona offered by the
# clustering widget; ClusteringSystem looks each id up in nameDict.
# NOTE(review): presumably derived from an offline clustering run -- the
# computation is not part of this notebook.
sci_fi_junkie = [30484, 820, 34599, 11577, 13125]
lively_youth = [5114, 28977, 11061, 9969, 1]
hot_blood_fighter = [11061, 211, 136, 813, 9130]
no_preference = [37405, 5114, 32281, 28977, 30484]


# Print out the recommendations based on model and print out the poster of the recommendations
def RecommendationSystem():
    """Print and plot the top-5 recommendations for the requested user.

    Reads the user id from the ``text`` widget and the model name from the
    ``output_model`` widget ('ALS', 'NN', 'SVD' or 'Ensemble'), prints the
    recommended titles, then draws their posters side by side.

    Instead of raising, the function prints a short message and returns when
    the user id is not an integer, the user id is unknown, the model name is
    not recognised, or the chosen model has no recommendations for the user.
    """
    try:
        userID = int(text.value)
    except ValueError:
        print("Please enter a numeric user id.")
        return
    if userID not in usernameDict:
        print("Unknown user id:", userID)
        return

    print("\nTop 5 recommendations for ",usernameDict[userID],":\n")

    # Resolve the list of recommended anime ids for the selected model. The
    # branches are mutually exclusive, and an unrecognised name is reported
    # instead of failing later on an unbound variable.
    selected_model = output_model.value
    if selected_model == 'ALS':
        # Query the ALS model once; element 1 of each result is the product id.
        itemid_list = [rec[1] for rec in model.recommendProducts(userID, 5)]
    elif selected_model == 'NN':
        itemid_list = nn_recommendationdict.get(userID)
    elif selected_model == 'SVD':
        # SVD entries are stored as a one-element list wrapping the id list.
        # .get() avoids inserting an empty entry into the defaultdict.
        wrapped = svd_recommendationdict.get(userID)
        itemid_list = wrapped[0] if wrapped else None
    elif selected_model == 'Ensemble':
        itemid_list = es_recommendationdict.get(userID)
    else:
        print("Unknown model:", selected_model)
        return

    if not itemid_list:
        print("No recommendations available for this user with the", selected_model, "model.")
        return

    item_list = [nameDict[int(itemid)][0] for itemid in itemid_list]
    for recommendation in item_list:
        print (recommendation)

    # Index the returned axes directly instead of eval()-ing variable names;
    # zip() also guards against lists longer than the five available axes.
    fig, axes = plt.subplots(1,5,figsize=(30,30))
    for ax, itemid, item in zip(axes, itemid_list, item_list):
        # Close each HTTP response as soon as the image has been decoded.
        with urlopen(nameDict[int(itemid)][1]) as response:
            poster = plt.imread(response,format='jpg')
        ax.imshow(poster)
        ax.set_title(item)
    plt.show()


def ClusteringSystem():
    """Print and plot the five hand-picked anime for the selected persona.

    Reads the persona name from the ``output_model_c`` widget, prints the
    corresponding titles, then draws their posters side by side. An
    unrecognised persona is reported with a message instead of raising.
    """
    print("\nTop 5 recommendations for:",output_model_c.value, "\n")

    # One lookup table replaces four copy-pasted branches.
    persona_items = {
        'Sci-Fi Junkies': sci_fi_junkie,
        'Lively Youths': lively_youth,
        'Hot Blood Fighters': hot_blood_fighter,
        'No Preference': no_preference,
    }
    c_itemid_list = persona_items.get(output_model_c.value)
    if c_itemid_list is None:
        print("Unknown persona:", output_model_c.value)
        return

    c_item_list = [nameDict[int(itemid)][0] for itemid in c_itemid_list]
    for recommendation in c_item_list:
        print (recommendation)

    # Index the returned axes directly instead of eval()-ing variable names.
    fig, axes = plt.subplots(1,5,figsize=(30,30))
    for ax, itemid, item in zip(axes, c_itemid_list, c_item_list):
        # Close each HTTP response as soon as the image has been decoded.
        with urlopen(nameDict[int(itemid)][1]) as response:
            poster = plt.imread(response,format='jpg')
        ax.imshow(poster)
        ax.set_title(item)
    plt.show()


# Last statement of the set-up cell: signals that all loading/training above has finished.
print("DONE!")

Training ALS model...
Training Neural Network model...


HBox(children=(IntProgress(value=0, max=1048575), HTML(value='')))




HBox(children=(IntProgress(value=0, max=98018), HTML(value='')))


Training SVD model...


HBox(children=(IntProgress(value=0, max=1048655), HTML(value='')))


Training Ensemble Model...


HBox(children=(IntProgress(value=0, max=34533), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2367), HTML(value='')))


DONE!


In [12]:
def f1(Clustering):
    """Copy the dropdown selection into the ``output_model_c`` widget.

    ``ClusteringSystem`` reads ``output_model_c.value`` to decide which
    persona list to show.
    """
    output_model_c.value = str(Clustering)

print("Choose the character that fit you the most!")
# Text widget used only as a holder for the selection; it is never displayed.
output_model_c = widgets.Text()
# Dropdown whose option strings must match the names compared in ClusteringSystem.
interact(f1, Clustering=['Sci-Fi Junkies','Lively Youths','Hot Blood Fighters','No Preference'])
# Label the manual-trigger button "Run".
interact_manual.opts['manual_name'] = 'Run'
bt = interact_manual(ClusteringSystem)

Choose the character that fit you the most!


interactive(children=(Dropdown(description='Clustering', options=('Sci-Fi Junkies', 'Lively Youths', 'Hot Bloo…

interactive(children=(Button(description='Run', style=ButtonStyle()), Output()), _dom_classes=('widget-interac…

In [10]:
def f(Model):
    """Copy the dropdown selection into the ``output_model`` widget.

    ``RecommendationSystem`` reads ``output_model.value`` to decide which
    model's recommendations to show.
    """
    output_model.value = str(Model)

print("Choose the model you want to use")
# Text widget used only as a holder for the selection; it is never displayed.
output_model = widgets.Text()
# Dropdown whose option strings must match the names compared in RecommendationSystem.
interact(f, Model=['ALS','SVD','NN','Ensemble'])
# Free-text input for the numeric user id; RecommendationSystem reads text.value.
text=widgets.Text()
display("Enter user id:",text)
# Label the manual-trigger button "Run".
interact_manual.opts['manual_name'] = 'Run'
bt = interact_manual(RecommendationSystem)

Choose the model you want to use


interactive(children=(Dropdown(description='Model', options=('ALS', 'SVD', 'NN', 'Ensemble'), value='ALS'), Ou…

'Enter user id:'

Text(value='')

interactive(children=(Button(description='Run', style=ButtonStyle()), Output()), _dom_classes=('widget-interac…

In [None]:
# 107016 kirti
# 107017 Anurag
# 107018 Helena
# 107019 Jason

# 1287

In [4]:
# The twenty user ids with the most rows in the SVD ratings table, most frequent first.
svd_df['user_id'].value_counts().head(20)

1961    5383
2314    4268
1930    4249
2341    4032
1323    3571
4       3366
1466    3043
1287    2970
105     2922
1767    2828
467     2816
542     2531
1405    2458
1825    2445
2357    2441
1573    2259
832     2225
2052    2088
447     2062
2325    2055
Name: user_id, dtype: int64

In [34]:
# def loadAnimeNames():
#     animeNames = {}
#     with open("AnimeList.csv", encoding='ascii', errors="ignore") as f:
#         for line in f:
#             fields = line.split(',')
#             animeNames[int(fields[0])] = [fields[1],fields[2]]
#     return animeNames
# nameDict = loadAnimeNames()

In [25]:
# Shut down the SparkContext created in the set-up cell. The ALS `model` and
# the `ratings` RDD depend on it, so run this only when finished.
sc.stop()