<a href="https://colab.research.google.com/github/helpingstar/DL-study-179/blob/sun/Recommend_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Recommend system with MovieLens dataset
Using 'MovieLens Latest Datasets_small'


### Import module

In [35]:
import pandas as pd
import numpy as np
import math
import operator

## Load data



In [36]:
# load movie data
movies=[]
with open('/content/drive/MyDrive/movies.csv','r',encoding='UTF-8') as f:
    for line in f.readlines():
        movies.append(line.strip().split(","))

pd.DataFrame(np.array(movies[1:10]),columns=movies[0])

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action


In [37]:
# load rating data
ratings=[]
with open('/content/drive/MyDrive/ratings.csv','r',encoding='UTF-8') as f:
    for line in f.readlines():
        ratings.append(line.strip().split(","))
        
pd.DataFrame(np.array(ratings[1:10]),columns=ratings[0])

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041


## Data preprocessing

### Dictionary function

In [38]:
# change rating data to dictionary
def ratings_dictionary(ratings):
    r_dict = {}
    for i in ratings[1:]:
        if i[0] not in r_dict.keys():
            r_dict[i[0]]={i[1]:float(i[2])}
        else:
            r_dict[i[0]].setdefault(i[1],float(i[2]))
    return r_dict

### Cosine similarity function

In [39]:
# calculate cosine similarity with data
def cosine_similarity(A,B): 
    dot_p = np.dot(A,B)
    A_norms = math.sqrt(sum([i**2 for i in A]))
    B_norms = math.sqrt(sum([i**2 for i in B]))
    AB_norms = A_norms * B_norms

    # similar close to 1
    return dot_p / AB_norms

## Modeling

### User based filtering

In [40]:
all_m=[i[0] for i in movies[1:]]

def user_based_filtering(rating_dict, person, similarity=cosine_similarity, k=3):
    person_m = set(rating_dict[person].keys())
    similar_score={}
    
    for other_person in rating_dict.keys():
        both_m = person_m.intersection(set(rating_dict[other_person].keys()))
        if len(both_m) >= 1: 
            person_m_rating = [rating_dict[person][i] for i in both_m]
            other_person_m_rating = [rating_dict[other_person][i] for i in both_m]

            score= cosine_similarity(person_m_rating, other_person_m_rating)

            similar_score[other_person]=score

            
    
    neighborhood=sorted(similar_score.items(), key=operator.itemgetter(1), reverse=True)
    print("Similarity : \n\n",neighborhood[:50])
    
    
    people_for_recommendation = [i[0] for i in neighborhood] 
    no_watch_m = set(all_m) - person_m 
    recommendation_of_movies = {} 
    a=1
    for movie in no_watch_m:
        r=[]
        s=[]
        for person in people_for_recommendation: 
            if movie in rating_dict[person].keys():
                r.append(rating_dict[person][movie]*similar_score[person])
                s.append(similar_score[person])
            else:
                continue
                
        if (sum(r) == 0) | (sum(s)== 0) : 
            continue
        elif len(r) <= k: 
            recommendation_of_movies[movie] = sum(r)/sum(s) 
        else: 
            recommendation_of_movies[movie] = sum(r[:k])/sum(s[:k]) 
        
    
    return sorted(recommendation_of_movies.items(), key=operator.itemgetter(1), reverse=True)[:1000]

r_dict = ratings_dictionary(ratings)

user_based_recommendation_1000 = user_based_filtering(r_dict, '1')

Similarity : 

 [('1', 1.0), ('77', 1.0), ('85', 1.0), ('184', 1.0), ('245', 1.0), ('253', 1.0), ('291', 1.0), ('315', 1.0), ('358', 1.0), ('383', 1.0), ('388', 1.0), ('12', 0.9999999999999999), ('2', 0.9999999999999998), ('511', 0.999193447324659), ('278', 0.9988130559615213), ('550', 0.9986560262721613), ('366', 0.9986331420538472), ('472', 0.9986178293325098), ('459', 0.9981149841863163), ('114', 0.9980597354377347), ('65', 0.9978250350798125), ('259', 0.9970544855015816), ('49', 0.9969629303492418), ('258', 0.9965457582448797), ('180', 0.9962866000353438), ('523', 0.9962049198956219), ('538', 0.9959100033104784), ('53', 0.9958705948858224), ('9', 0.9957385837170334), ('300', 0.9955079780599657), ('398', 0.9951003121457859), ('401', 0.9949526491606236), ('360', 0.9949366763261819), ('189', 0.994908573123846), ('530', 0.9948934062566152), ('154', 0.9948057871850503), ('319', 0.9942824919679822), ('13', 0.9942815218442164), ('582', 0.9940251572134323), ('25', 0.9939759036144576), ('51

In [41]:
# Top 1000 to recommend
pd.DataFrame(user_based_recommendation_1000)

Unnamed: 0,0,1
0,6983,5.000000
1,4813,5.000000
2,4116,5.000000
3,44943,5.000000
4,5088,5.000000
...,...,...
995,1633,4.338962
996,4117,4.338311
997,222,4.337836
998,322,4.337720


### Item based filtering

In [42]:
def item_based_filtering(rating_dict, person, similarity=cosine_similarity, k=3):

    tmp = [list(r_dict[i].keys()) for i in list(r_dict.keys())]
    no_rating_m = list(set([j for i in tmp for j in i])) 
    rating_m = [i for i in list(rating_dict[person].keys())] 
 
    c_s={}
    
    for i in no_rating_m:
        c_s[i]={}
        for j in rating_m:
            p_i=[]
            p_j=[]
            for p in list(rating_dict.keys()):
                if j in rating_dict[p].keys() and i in rating_dict[p].keys():
                    p_i.append(rating_dict[p][i])
                    p_j.append(rating_dict[p][j])
            if len(p_i)>0:
                c=cosine_similarity(p_i, p_j)
                c_s[i].setdefault(j,c)

    
    result = {}
    
    for i in list(c_s.keys()):
        sum_sim_rating=[]
        sum_sim=[]
        k_i_j = sorted(c_s[i].items(), key=operator.itemgetter(1), reverse=True)[:k]
        
        for j in k_i_j:
            sum_sim.append(j[1])
            sum_sim_rating.append(j[1]*rating_dict[person][j[0]])
            
        if len(sum_sim) > 0:
            result[i]=sum(sum_sim_rating)/sum(sum_sim)
            
    return sorted(result.items(), key=operator.itemgetter(1), reverse=True)

            
        
item_based_recommendation_1000 = item_based_filtering(r_dict, '1')

In [43]:
# Top 1000 to recommend
pd.DataFrame(item_based_recommendation_1000)

Unnamed: 0,0,1
0,1199,5.0
1,1200,5.0
2,1277,5.0
3,56587,5.0
4,104875,5.0
...,...,...
9709,85334,2.0
9710,87028,2.0
9711,26095,2.0
9712,89386,2.0
