# Home task

1. Replicate Simple recommender implementation
2. (optional) Replicate the content based recommender implementation

## First subtask

In [17]:
import pandas as pd
import numpy as np

### Load data
[link to the 'metadata'](https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset)

In [9]:
metadata = pd.read_csv('movies_metadata.csv', low_memory=False)

### Calculate mean of vote average column

In [12]:
C = metadata['vote_average'].mean()
print(C)

5.618207215133889


### Calculate the minimum number of votes required to be in the chart, m


In [13]:
m = metadata['vote_count'].quantile(0.90)
print(m)

160.0


### Filter out all qualified movies into a new DataFrame

In [14]:
q_movies = metadata.copy().loc[metadata['vote_count'] >= m]
q_movies.shape

(4555, 24)

In [15]:
metadata.shape

(45466, 24)

### Function that computes the weighted rating of each movie


In [6]:
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

### Define a new feature `score` and calculate its value with `weighted_rating()`


In [7]:
q_movies['score'] = q_movies.apply(weighted_rating, axis=1)

In [8]:
#Sort movies based on score calculated above
q_movies = q_movies.sort_values('score', ascending=False)

#Print the top 15 movies
q_movies[['title', 'vote_count', 'vote_average', 'score']].head(20)

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.445869
834,The Godfather,6024.0,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
12481,The Dark Knight,12269.0,8.3,8.265477
2843,Fight Club,9678.0,8.3,8.256385
292,Pulp Fiction,8670.0,8.3,8.251406
522,Schindler's List,4436.0,8.3,8.206639
23673,Whiplash,4376.0,8.3,8.205404
5481,Spirited Away,3968.0,8.3,8.196055
2211,Life Is Beautiful,3643.0,8.3,8.187171


## Second subtask

### Load data

In [19]:
from scipy.io import loadmat
import pandas as pd
mat= loadmat('ex8_movies.mat')

def get_data():
    mat= loadmat('ex8_movies.mat') # returns dict - just select proper key to get necessary data
    Y= pd.DataFrame( mat['Y']) # value  vertical (axis 0):  movies 1682. Every row is for one movie , horizontal (axis 1): Users 943, every column is for one user  
    R= pd.DataFrame( mat['R']) #  flag of reviewed or not  vertical (axis 0):  movies 1682. Every row is for one movie , horizontal (axis 1): Users 943, every column is for one user  

    # convert to float to avoid issues with uint type  
    Y= Y.astype(float).values 
    R= R.astype(float).values

    return Y,R

Y,R= get_data()

### Collaborative Filtering Cost Function

In [20]:
def J (Y, R, X,Theta, lambd): 
    '''
    params: 1d vector  of X and Theta 
    :return expression for cost function 
    '''
    
    assert(X.shape[1]==Theta.shape[0])
    h= X @ Theta 


    try: 
        assert(h.shape ==Y.shape)
    except:
        print ('h.shape {} !=Y.shape {}'.format(h.shape, Y.shape) )

       
    J= 1/2 * np.sum(((h-Y)*R) **2) + lambd/2 * np.sum(X**2) + lambd/2 * np.sum(Theta**2)

    try: 
        assert(len(J.shape)== 0)
    except:
        print ('J is not raw number. J.shape = ', J.shape)
    
    return J

### Collaborative Filtering Cost Function Gradient

In [21]:
def J_derivative(Y,R,X, Theta, num_movies, num_users, num_features, lambd): 
   
    cost_matr= (X @ Theta - Y) * R #  n_movies * n_users 

    X_grad= cost_matr @ Theta.T 
    Theta_grad= (cost_matr.T @ X).T
    
   
    try: 
        assert(X_grad.shape ==X.shape)
        assert(Theta_grad.shape ==Theta.shape)
    except:
        print ('Check gradient calculus')

    # Regularization part :          
    X_grad  += lambd * X
    Theta_grad += lambd * Theta

    
    return X_grad, Theta_grad

### Gradient descent (`fit`)

In [22]:
def fit(Y,R, num_features= 10,  alpha=0.0001, lambd= .01, eps= .1,  max_iter=1000,step=100, verbose=0):    
    num_movies,num_users =Y.shape
    
    
    if verbose: 
        print ('Running gradient descent with alpha= {}, lambda= {}, eps= {}, max_iter= {}'.format(
            alpha, lambd, eps, max_iter))
        

    #     X= params[:num_movies*num_features].reshape(num_movies,num_features)
    #     Theta = params[num_movies*num_features:].reshape(num_features,num_users)

    np.random.seed(2019)
    X = np.random.randn(num_movies, num_features)
    Theta = np.random.randn( num_features, num_users)
    
    J_hist=[-1] # used for keeping J values. Init with -1 to avoid 0 at first iter
    continue_iter = True # flag to continue next iter (grad desc step)
    iter_number =0 # used for limit by max_iter

    while continue_iter:            
        # Do step of gradient descent    
        X_grad, Theta_grad =  J_derivative(Y,R,X, Theta, num_movies, num_users, num_features, lambd)
        X= X- alpha  *X_grad        
        Theta= Theta- alpha  *Theta_grad

        # keep history of J values
        J_hist.append(J(Y, R, X,Theta, lambd))
        # check criteria of exit (finish grad desc)
        if iter_number> max_iter: # if limit succeeded
            continue_iter = False
            print ('iter_number> max_iter')
        elif np.abs(J_hist[iter_number-1] - J_hist[iter_number])< eps: # if accuracy is succeeded
            continue_iter = False
            print ('J_hist[iter_number]={}'.format(J_hist[iter_number]))
        iter_number += 1
        
        if verbose and iter_number%step ==0:
            print ('{}: {}'.format(iter_number, J_hist[iter_number-1]))
    
    return X,Theta, J_hist

In [23]:
X,Theta, J_hist = fit(Y,R, alpha=0.001, lambd= 1,max_iter= 2000, verbose=1)

Running gradient descent with alpha= 0.001, lambda= 1, eps= 0.1, max_iter= 2000
100: 46624.80715776309
200: 53775.9915025695
300: 43733.53486206593
400: 40831.75291750681
500: 39035.82278190235
600: 37814.29631650733
700: 36925.665742625155
800: 36251.94119779436
900: 35726.60793968351
1000: 35309.36836244691
1100: 34973.28542812511
1200: 34698.7895953531
1300: 34471.40237327725
1400: 34280.73254673338
1500: 34119.553794967636
1600: 33982.71680525661
1700: 33866.18531536784
1800: 33766.47660299341
1900: 33680.49797333301
2000: 33605.57713377488
iter_number> max_iter


### Predict missed values of training data

In [24]:
pred = X@ Theta
pred

array([[3.63067147, 3.88666946, 2.40576999, ..., 4.54011846, 4.36029223,
        3.93326403],
       [2.80011248, 3.29577016, 2.53739175, ..., 3.17689721, 4.0113373 ,
        3.04035238],
       [3.06520486, 2.81648457, 3.93388444, ..., 2.72525932, 3.53337632,
        2.75231426],
       ...,
       [0.86845254, 0.72470642, 0.01049199, ..., 0.67518173, 0.55738831,
        0.8042484 ],
       [2.20769107, 2.10829064, 1.23490905, ..., 1.90791656, 2.12861293,
        1.78059828],
       [2.45855657, 2.27719474, 1.49320411, ..., 2.37037439, 1.95018702,
        1.16119517]])

### Read movies names

In [26]:
fn ='movie_ids.txt'

doc = []
with open(fn, 'r', encoding='utf-8', errors='ignore') as file:
    for line in file:
        doc.append(' '.join(line.rstrip().split(' ')[1:]))
df_movie_names = pd.DataFrame(doc)

  
df_movie_names.head(10)

Unnamed: 0,0
0,Toy Story (1995)
1,GoldenEye (1995)
2,Four Rooms (1995)
3,Get Shorty (1995)
4,Copycat (1995)
5,Shanghai Triad (Yao a yao yao dao waipo qiao) ...
6,Twelve Monkeys (1995)
7,Babe (1995)
8,Dead Man Walking (1995)
9,Richard III (1995)


### Enter ratings for a new user

In [30]:
def build_my_raitings():
    my_ratings = np.zeros((1682, 1))
    # % Check the file movie_idx.txt for id of each movie in our dataset
    # % For example, Toy Story (1995) has ID 1, so to rate it "4", you can set

    my_ratings[2] = 2
    my_ratings[77] = 3
    my_ratings[88] = 3
    my_ratings[16]= 5
    my_ratings[190] = 4
    my_ratings[90]= 5
    my_ratings[79]= 4
    my_ratings[111] = 2
    my_ratings[182] = 4
    my_ratings[287] = 5
    my_ratings[321]= 3
    return my_ratings

my_ratings= build_my_raitings()

print ('\n\nNew user ratings:\n')
for i in range(len(my_ratings)):
    if my_ratings[i] > 0:
        print('Rated {} for {}\n'.format(int(my_ratings[i][0]),df_movie_names.iloc[i].values[0]))



New user ratings:

Rated 2 for Four Rooms (1995)

Rated 5 for From Dusk Till Dawn (1996)

Rated 3 for Free Willy (1993)

Rated 4 for Hot Shots! Part Deux (1993)

Rated 3 for Blade Runner (1982)

Rated 5 for Nightmare Before Christmas, The (1993)

Rated 2 for Flipper (1996)

Rated 4 for Alien (1979)

Rated 4 for Amadeus (1984)

Rated 5 for Scream (1996)

Rated 3 for Murder at 1600 (1997)



### Add new user ratings to the data matrix

In [31]:
def add_my_ratings(Y,R,my_ratings):
    Y = np.c_[my_ratings,Y]
    R = np.c_[my_ratings !=0 , R]
    return Y, R 
        
# Y,R= get_data()    
my_ratings= build_my_raitings()
Y, R = add_my_ratings(Y,R,my_ratings)

In [32]:
X,Theta, J_hist= fit(Y , R, alpha=0.001, lambd= 1,max_iter= 2000, eps= .1, step = 100, verbose=1)

Running gradient descent with alpha= 0.001, lambda= 1, eps= 0.1, max_iter= 2000
100: 46367.9207133743
200: 41622.487448398264
300: 44337.34132830296
400: 40809.24791144825
500: 38885.063001546536
600: 37610.437071782515
700: 36706.24049014253
800: 36050.87176891243
900: 35560.64334056086
1000: 35179.2535968218
1100: 34870.84954982152
1200: 34613.90756816537
1300: 34395.38801608748
1400: 34206.94205810437
1500: 34042.8642031096
1600: 33899.02872052486
1700: 33772.28872692909
1800: 33660.10969197265
1900: 33560.35832542043
2000: 33471.20642109826
iter_number> max_iter


### Predict

In [33]:
pred= X @ Theta
my_pred = pred[:,0]
print(my_pred.shape)

top_pred= np.argsort(my_pred)[::-1]
print('\nTop recommendations for you:\n')
for i in range(100):
    j = top_pred[i]
    print('Predicting rating {:.2} for movie {} (# {})\n'.format(my_pred[j], df_movie_names.iloc[j],j))

(1682,)

Top recommendations for you:

Predicting rating 7.5 for movie 0    Star Wars (1977)
Name: 49, dtype: object (# 49)

Predicting rating 5.5 for movie 0    Empire Strikes Back, The (1980)
Name: 171, dtype: object (# 171)

Predicting rating 5.1 for movie 0    Return of the Jedi (1983)
Name: 180, dtype: object (# 180)

Predicting rating 5.1 for movie 0    Crash (1996)
Name: 324, dtype: object (# 324)

Predicting rating 5.1 for movie 0    Fifth Element, The (1997)
Name: 249, dtype: object (# 249)

Predicting rating 5.1 for movie 0    Titanic (1997)
Name: 312, dtype: object (# 312)

Predicting rating 5.0 for movie 0    Raiders of the Lost Ark (1981)
Name: 173, dtype: object (# 173)

Predicting rating 4.9 for movie 0    Scream (1996)
Name: 287, dtype: object (# 287)

Predicting rating 4.9 for movie 0    Army of Darkness (1993)
Name: 183, dtype: object (# 183)

Predicting rating 4.9 for movie 0    Evil Dead II (1987)
Name: 200, dtype: object (# 200)

Predicting rating 4.8 for movie 0  

Predicting rating 4.0 for movie 0    Man in the Iron Mask, The (1998)
Name: 1482, dtype: object (# 1482)

Predicting rating 4.0 for movie 0    Patton (1970)
Name: 204, dtype: object (# 204)

Predicting rating 4.0 for movie 0    Romy and Michele's High School Reunion (1997)
Name: 1013, dtype: object (# 1013)

Predicting rating 4.0 for movie 0    Aladdin (1992)
Name: 94, dtype: object (# 94)

