<a href="https://colab.research.google.com/github/jasproudis/AUEB-MScDataScience-Projects/blob/main/Recommender_Systems/IMDB_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
from dataclasses import dataclass
from typing import Optional

from torch import Tensor
@dataclass
class movie:
    """One movie record, as built from a CSV row by load_movies.

    The annotations reflect the values load_movies actually stores:
    genres, directors and stars are sets of names, and the numeric
    fields that may be missing in the CSV are None in that case.
    """
    ryear:Optional[int]  # release year; None when the CSV value is not numeric
    title:str
    runtime:int  # leading integer of the 'Runtime' column
    genres:set  # genre names
    overview:str
    rating:float  # 'IMDB_Rating' column
    meta_score:Optional[float]  # None when the CSV field is empty
    directors:set  # director names
    stars:set  # values of the Star1..Star4 columns
    gross:Optional[float]  # None when the CSV field is empty



In [29]:
#load SBERT


from sentence_transformers import SentenceTransformer,util

# Load the pretrained 'all-MiniLM-L6-v2' SentenceTransformer model;
# load_movies uses it to embed the movie overviews.
sbert = SentenceTransformer('all-MiniLM-L6-v2')


In [30]:
import csv

#load movies from the input file
def load_movies(input_file:str)->tuple:
    """Load movies from a CSV file and precompute overview similarities.

    Args:
        input_file: path to a CSV file with the columns Released_Year,
            Series_Title, Runtime, Genre, Overview, IMDB_Rating,
            Meta_score, Director, Star1..Star4 and Gross.

    Returns:
        A tuple (movies, sim_matrix, title_index) where
        movies is the list of movie objects in file order,
        sim_matrix holds the pairwise cosine similarities between the
        SBERT embeddings of the overviews (indexed like movies), and
        title_index maps each title to its position in movies (when a
        title occurs more than once, the last occurrence wins).

    Raises:
        KeyError: if an expected column is missing.
        ValueError: if Runtime, IMDB_Rating, Meta_score or Gross cannot
            be parsed as a number.
    """
    title_index={}# title to index
    movies=[]
    overviews=[]

    # newline='' is what the csv module asks for; the explicit encoding keeps
    # non-ASCII titles intact regardless of the platform default.
    with open(input_file,newline='',encoding='utf-8') as f:

        for row in csv.DictReader(f): # for each movie

            year_text=row['Released_Year']
            meta_text=row['Meta_score']
            gross_text=row['Gross']

            # partition(' ')[0] keeps the text before the first space and, unlike
            # slicing with find(' '), does not drop the last character when the
            # field contains no space (e.g. a Meta_score of '80').
            new_mv=movie(int(year_text) if year_text.isnumeric() else None,
                         row['Series_Title'],
                         int(row['Runtime'].partition(' ')[0]),
                         {x.strip() for x in row['Genre'].split(',')},
                         row['Overview'],
                         float(row['IMDB_Rating']),
                         float(meta_text.partition(' ')[0]) if meta_text!='' else None,
                         {x.strip() for x in row['Director'].split(',')},
                         {row[x] for x in ('Star1','Star2','Star3','Star4')},
                         float(gross_text.replace(',','')) if gross_text!='' else None)

            #store the overview separately
            overviews.append(row['Overview'])

            #update the index
            title_index[row['Series_Title']]=len(movies)
            movies.append(new_mv)

    #encode all overviews
    embedded=sbert.encode(overviews,convert_to_tensor=True)

    #Compute cosine-similarities
    sim_matrix = util.cos_sim(embedded, embedded)

    return movies,sim_matrix,title_index


In [31]:
# Load the catalogue together with the overview similarity matrix and the title lookup.
movies, overview_sim_matrix, title_index = load_movies('imdb_top_1000.csv')

# Known release years only (load_movies stores None when the year is not numeric).
lst = [m.ryear for m in movies if m.ryear is not None]
srt = sorted(lst)

# Earliest / latest release year; sim_v2 reads these globals to normalise year gaps.
mn, mx = srt[0], srt[-1]

mn, mx

(1920, 2020)

In [32]:
# Spot check: overview cosine similarity between the movies at positions 1 and 10.
overview_sim_matrix[1,10]

tensor(0.0851)

In [33]:
import numpy as np

#movie-to-movie similarity
def sim_v2(title1:str,
           title2:str,
           movies:list,
           weights:dict,
           overview_sim_matrix:Tensor,
           title_index:dict,
           year_range:float=None)->tuple:
    """Score how similar the movie title1 (the candidate) is to the movie title2.

    Args:
        title1: title of the candidate movie; its rating feeds the 'rating' factor.
        title2: title of the movie the candidate is compared against.
        movies: list of movie objects.
        weights: maps factor names ('star', 'director', 'genre', 'ryear',
            'overview', 'rating') to their weight. Factors that are not
            listed are left out of the score.
        overview_sim_matrix: pairwise overview similarities, indexed by the
            positions stored in title_index.
        title_index: maps each title to its position in movies.
        year_range: span used to normalise the release-year gap. When None,
            the module-level bounds mx - mn are used.

    Returns:
        A tuple (total, explanation): total is the sum of the weighted factor
        scores rounded to 2 decimals, and explanation is a list of
        (factor, weighted score) pairs with a positive score, highest first.

    Raises:
        KeyError: if a title is not in title_index.
    """
    mid1,mid2=title_index[title1],title_index[title2]# get the movie ids (indexes)
    m1=movies[mid1] # get the movie objects
    m2=movies[mid2]

    scores=dict() # stores the score for each factor from the weights dict

    #star jaccard
    scores['star']=_jaccard(m1.stars,m2.stars)

    #director jaccard
    scores['director']=_jaccard(m1.directors,m2.directors)

    #genre jaccard
    scores['genre']=_jaccard(m1.genres,m2.genres)

    # release year diff; scored 0 when a year is unknown or the span is 0
    if m1.ryear is None or m2.ryear is None:
        scores['ryear']=0
    else:
        span=(mx-mn) if year_range is None else year_range
        scores['ryear']=1- abs(m1.ryear-m2.ryear)/span if span else 0

    #cosine sim for overviews; float() also works for tensors that live on a GPU,
    #where .numpy() would raise
    scores['overview']=float(overview_sim_matrix[mid1,mid2])

    #normalized candidate rating
    scores['rating']=m1.rating/10

    #create the sim dict, skipping factors the caller gave no weight for
    factors={x:round(scores[x]*weights[x],2) for x in scores if x in weights}

    #sort factors by sim
    sorted_factors=[factor for factor in sorted(factors.items(), key=lambda x:x[1],reverse=True) if factor[1]>0]

    #return overall score and explanations
    return round(np.sum(list(factors.values())),2),sorted_factors


def _jaccard(a:set,b:set)->float:
    """Return the Jaccard index of two sets, or 0.0 when both are empty."""
    union=a.union(b)
    if not union:
        return 0.0
    return len(a.intersection(b))/len(union)




In [34]:
# Sanity check: compare a movie with itself, using a weight of 1 for every factor.
sim_v2('Toy Story',
       'Toy Story',
       movies,
       {'ryear':1, 'genre':1, 'director':1, 'rating':1, 'star':1, 'overview': 1},
       overview_sim_matrix,
       title_index)

(5.83,
 [('star', 1.0),
  ('director', 1.0),
  ('genre', 1.0),
  ('ryear', 1.0),
  ('overview', 1.0),
  ('rating', 0.83)])

In [35]:
def recommend(input_title:str,
              k:int,
              movies:dict,
              weights:dict,
              overview_sim_matrix:Tensor,
              title_index:dict
              )->list:
    """Return the k movies that score highest against input_title.

    Every element of movies (iterated as movie objects) is scored with
    sim_v2 as the candidate against input_title. The result is a list of
    (title, (score, explanation)) pairs, best score first, cut to k entries.
    Scores are stored per title, so a repeated title keeps its last score.
    """
    # title -> (score, explanation) for every candidate
    scored={
        cand.title: tuple(sim_v2(cand.title,
                                 input_title,
                                 movies,
                                 weights,
                                 overview_sim_matrix,
                                 title_index))
        for cand in movies
    }

    # rank by the overall score, highest first
    ranked=sorted(scored.items(),key=lambda entry:entry[1][0],reverse=True)

    return ranked[:k]

In [36]:

# Weight of 1 for every similarity factor.
weights={'ryear':1, 'genre':1, 'director':1, 'rating':1, 'star':1, 'overview': 1}

# Top-50 recommendations for 'Toy Story'.
recommend('Toy Story',50,movies,weights,overview_sim_matrix,title_index)

[('Toy Story',
  (5.83,
   [('star', 1.0),
    ('director', 1.0),
    ('genre', 1.0),
    ('ryear', 1.0),
    ('overview', 1.0),
    ('rating', 0.83)])),
 ('Toy Story 2',
  (4.45,
   [('director', 1.0),
    ('genre', 1.0),
    ('ryear', 0.96),
    ('rating', 0.79),
    ('overview', 0.37),
    ('star', 0.33)])),
 ('Toy Story 3',
  (3.35,
   [('genre', 1.0),
    ('ryear', 0.85),
    ('rating', 0.82),
    ('overview', 0.35),
    ('star', 0.33)])),
 ('Toy Story 4',
  (3.27,
   [('genre', 1.0),
    ('rating', 0.78),
    ('ryear', 0.76),
    ('overview', 0.4),
    ('star', 0.33)])),
 ('Monsters, Inc.',
  (2.99,
   [('genre', 1.0), ('ryear', 0.94), ('rating', 0.81), ('overview', 0.24)])),
 ('Aladdin',
  (2.95,
   [('genre', 1.0), ('ryear', 0.97), ('rating', 0.8), ('overview', 0.18)])),
 ('Shrek',
  (2.93,
   [('genre', 1.0), ('ryear', 0.94), ('rating', 0.78), ('overview', 0.21)])),
 ('Gake no ue no Ponyo',
  (2.92,
   [('genre', 1.0), ('ryear', 0.87), ('rating', 0.77), ('overview', 0.28)])),


<h1>Simulation</h1>

In [37]:
@dataclass
class User:
    """A simulated user.

    generate_users builds these with empty likes/dislikes; random_simulate
    fills both lists and also attaches a disliked_threshold attribute
    (half of like_threshold) at run time.
    """
    seed_movies:list # latent movies that the user likes
    likes:list # known movies that the user has liked
    dislikes:list # known movies that the user has disliked
    weights:dict # user preferences: factor name -> weight
    like_threshold:float # similarity threshold for liking a movie


In [38]:
from random import random,sample,shuffle

#create fake users
def generate_users(movies:list, # list of movie objects
             overview_sim_matrix, # movie-to-movie sim matrix
             title_index, # index that maps each title to a position in the movies list
             user_num:int=10,  # number of users to generate
             seed_movie_num:int=5, # number of random seed movies to choose
             factors=('ryear','genre','director','rating','star','overview'), # factors to consider
             std_multiplier:float=1.5 # used to tune the 'Like' threshold
            ):
    """Create fake users with random preferences and random seed movies.

    Each user gets a random weight in [0, 1) (rounded to 2 decimals) for
    every entry of factors, seed_movie_num movies sampled from movies, and
    a "like" threshold derived from how similar the whole catalogue is to
    those seeds under the user's own weights.

    Args:
        movies: list of movie objects.
        overview_sim_matrix: movie-to-movie overview similarity matrix.
        title_index: maps each title to its position in movies.
        user_num: number of users to generate.
        seed_movie_num: number of seed movies sampled per user.
        factors: names of the factors that receive a random weight.
        std_multiplier: number of standard deviations added to the mean
            similarity to obtain the like threshold.

    Returns:
        A list of User objects with empty likes and dislikes.

    Raises:
        ValueError: if seed_movie_num exceeds len(movies) (raised by sample).
    """
    users=[]# list of fake users

    for i in range(user_num):  # for each fake user to create
        print(i) # progress indicator

        # sample a random preference value (weight) for each factor
        weights={factor:round(random(),2) for factor in factors}

        seed_movies=sample(movies,seed_movie_num)# sample the seed movies

        # Compute the "like" threshold for this user.
        # If a movie has an above-threshold similarity with (at least) one of
        # the seed movies, we assume that the user will like it. The threshold
        # is the average similarity of all movies with the seed movies plus
        # std_multiplier standard deviations.
        sim_scores=[sim_v2(candidate.title,
                           seed_movie.title,
                           movies,
                           weights,
                           overview_sim_matrix,
                           title_index)[0]
                    for seed_movie in seed_movies # for each seed movie of this user
                    for candidate in movies] # for each movie in the catalogue

        like_threshold=np.mean(sim_scores)+ std_multiplier*np.std(sim_scores)

        # remember the user
        users.append(User(seed_movies,
                          [], # liked movies
                          [], # disliked movies
                          weights, # preferences
                          round(like_threshold,2)) # like threshold
                    )

    return users



In [39]:
# Create the fake users with the default generate_users settings.
users=generate_users(movies,overview_sim_matrix,title_index)

0
1
2
3
4
5
6
7
8
9


In [59]:
# Inspect the randomly sampled factor weights of the user at index 6.
users[6].weights

{'ryear': 0.11,
 'genre': 0.02,
 'director': 0.91,
 'rating': 0.46,
 'star': 0.6,
 'overview': 0.11}

In [57]:
def random_simulate(
            users:list, # fake users
             movies:list, # movie list
            title_index:dict, # maps titles to indices in the movies list
             overview_sim_matrix:Tensor, # movie to movie sim matrix
            recnum_per_user:int=50, # number of recommendations to make
             factors=('ryear','genre','director','rating','star','overview') # factors to consider
            ):
    """Simulate a recommendation session for every fake user.

    For each user the likes/dislikes lists are reset and up to
    recnum_per_user distinct titles are recommended:

    * until the first like, movies are picked at random;
    * afterwards the unseen movie most similar (unit weights) to any liked
      movie is picked;
    * after 5 consecutive dislikes, one pick is instead a random unseen movie
      that is not too similar to any disliked movie.

    A recommendation counts as a like when its similarity to at least one of
    the user's seed movies, under the user's own weights, exceeds
    user.like_threshold. Each outcome and a per-user summary are printed.

    Args:
        users: User objects; likes, dislikes and disliked_threshold are
            overwritten in place.
        movies: list of movie objects.
        title_index: maps each title to its position in movies.
        overview_sim_matrix: movie-to-movie overview similarity matrix.
        recnum_per_user: number of recommendations per user; capped at the
            number of distinct titles so the random pick always terminates.
        factors: currently unused; kept so existing callers keep working.

    Returns:
        None. Results are stored on the User objects and printed.
    """
    # unit weights used when comparing candidates with already rated movies
    uniform_weights={'ryear':1, 'genre':1, 'director':1, 'rating':1, 'star':1, 'overview': 1}

    # recommendations are tracked per title, so a user can never receive
    # more of them than there are distinct titles
    max_recs=min(recnum_per_user,len({m.title for m in movies}))

    for user in users: # for each fake user

        #initialize likes and dislikes for this user
        user.likes=[]
        user.dislikes=[]

        recommended=set() # remember titles of recommended movies

        cnt=0 # number of recommendations made to this user so far
        flag=0 # 0: no like yet (random picks), 1: at least one like (neighbour picks)
        fails=0 # consecutive dislikes since the last like (or last reset)
        liked_movies={} # title -> similarity recorded when the movie was liked
        disliked_movies={} # title -> similarity recorded when the movie was disliked
        user.disliked_threshold=user.like_threshold / 2

        while cnt < max_recs: # for each recommendation to make

            # always start from scratch so a failed search can never
            # re-recommend the previous movie
            rec_movie=None

            if flag == 1 and fails < 5:
                # pick the unseen movie with the highest similarity to any liked movie
                val_max=0
                for m in movies:
                    if m.title in recommended:
                        continue
                    for liked in liked_movies:
                        val=sim_v2(m.title,
                                   liked,
                                   movies,
                                   uniform_weights,
                                   overview_sim_matrix,
                                   title_index)[0]
                        if val > val_max:
                            rec_movie=m
                            val_max=val

            elif flag == 1:
                # too many consecutive dislikes: look for an unseen random movie
                # that is not similar to any disliked movie
                fails=0
                tries=0
                while rec_movie is None or rec_movie.title in recommended:
                    if tries < 7:
                        rec_movie=sample(movies,1)[0]
                        for disliked in disliked_movies:
                            if sim_v2(rec_movie.title,
                                      disliked,
                                      movies,
                                      uniform_weights,
                                      overview_sim_matrix,
                                      title_index)[0] > user.disliked_threshold:
                                # too close to a disliked movie: reject and retry
                                rec_movie=None
                                tries += 1
                                break
                    else:
                        # 7 samples were rejected: accept a random movie without
                        # the dislike check and start counting again
                        tries=0
                        rec_movie=sample(movies,1)[0]

            # no like yet, or the neighbour search found nothing:
            # pick a random movie that has not been recommended before
            while rec_movie is None or rec_movie.title in recommended:
                rec_movie=sample(movies,1)[0]

            #remember the recommendation
            recommended.add(rec_movie.title)
            cnt += 1

            found_similar_seed=False # becomes true if the movie is similar to a seed
            val=0 # similarity with the last seed checked (0 when there are no seeds)

            for sm in user.seed_movies: # for each seed movie for this user

                # compute the sim between the recommended movie and the seed movie
                val=sim_v2(rec_movie.title,
                           sm.title,
                           movies,
                           user.weights,
                           overview_sim_matrix,
                           title_index)[0]

                # if the sim is over the like threshold of this user
                if val>user.like_threshold:
                    found_similar_seed=True
                    break

            if found_similar_seed: # similar seed found, the user will like this movie
                user.likes.append(rec_movie)
                fails=0
                flag=1
                liked_movies[rec_movie.title]=val
                print('YES',rec_movie.title,len(user.likes))
            else:
                user.dislikes.append(rec_movie)
                fails += 1
                disliked_movies[rec_movie.title]=val
                print('NO',rec_movie.title,len(user.dislikes))

        print(f'\n\nTotal Likes out of {cnt}:', len(user.likes),'\n\n-----------------------\n')



In [58]:
# Run the simulation for all generated users with the default number of recommendations.
random_simulate(users,movies,title_index, overview_sim_matrix)

NO The Illusionist 1
NO Schindler's List 2
YES Die Hard 1
NO Die Hard: With a Vengeance 3
NO Predator 4
YES Taken 2
YES Serbuan maut 3
YES Serbuan maut 2: Berandal 4
YES Baby 5
YES Special Chabbis 6
YES A Wednesday 7
YES M.S. Dhoni: The Untold Story 8
NO Bhaag Milkha Bhaag 5
YES The Departed 9
YES Casino 10
YES Goodfellas 11
NO The Irishman 6
YES Taxi Driver 12
NO The Wolf of Wall Street 7
NO Raging Bull 8
NO The King of Comedy 9
NO After Hours 10
NO Shutter Island 11
YES La Grande Illusion 13
NO La règle du jeu 12
YES Once Upon a Time in America 14
YES Pulp Fiction 15
YES Kill Bill: Vol. 1 16
YES Kill Bill: Vol. 2 17
NO The Hateful Eight 13
YES Reservoir Dogs 18
NO Django Unchained 14
YES The Godfather: Part II 19
YES The Godfather 20
NO The Godfather: Part III 15
NO Apocalypse Now 16
YES Heat 21
NO The Insider 17
NO The Conversation 18
YES Inglourious Basterds 22
YES Ang-ma-reul bo-at-da 23
NO La haine 19
YES Vikram Vedha 24
NO Boyz n the Hood 20
YES Lucky Number Slevin 25
YES Man on