# Assignment 1 #   
**Dataset: http://grouplens.org/datasets/movielens/1m/**
   
   
### Data Preprocessing ###


In [2]:
import os

# Directory holding the MovieLens files, resolved against the current
# working directory (same location the notebook has always read from).
DATA_DIR = os.path.join(os.getcwd(), 'ml-1m')


def _read_lines(filename):
    """Return all lines (line endings included) of `filename` in DATA_DIR."""
    # NOTE(review): under Python 3 an explicit encoding may be required for
    # these files -- confirm before porting.
    with open(os.path.join(DATA_DIR, filename), 'r') as f:
        return f.readlines()


# Raw '::'-delimited records; they are parsed into dictionaries afterwards.
f_users = _read_lines('users.dat')
f_movies = _read_lines('movies.dat')
f_ratings = _read_lines('ratings.dat')

In [3]:
def _split_records(raw_lines):
    """Yield the '::'-separated fields of every non-blank line.

    The line ending is removed first so the last field does not carry a
    trailing newline, and empty lines are skipped instead of breaking the
    tuple unpacking in the loops below.
    """
    for raw in raw_lines:
        record = raw.rstrip('\r\n')
        if record:
            yield record.split('::')


# Prepare the user dictionary: uid -> {'sex': ..., 'age': ...}
users = {}
for (uid, sex, age, occupation, zipcode) in _split_records(f_users):
    users[uid] = {
        'sex': sex,
        'age': int(age)
    }

# Prepare the movie-name dictionary: mid -> title
movies = {}
for (mid, title, genres) in _split_records(f_movies):
    movies[mid] = title

# Prepare the rating dictionary: uid -> {movie title: rating}
critics = {}
for (uid, mid, rating, timestamp) in _split_records(f_ratings):
    # Ratings are keyed by title (not by movie id) so they line up with the
    # title-keyed lists used later in the notebook.
    critics.setdefault(uid, {})[movies[mid]] = float(rating)

In [4]:
# Hand-written ratings of the extra user 'fan', one movie per line in the
# form  mid::title::genres::rating
fan_score_list = """1::Toy Story (1995)::Animation|Children's|Comedy::5
2::Jumanji (1995)::Adventure|Children's|Fantasy::4
9::Sudden Death (1995)::Action::2
10::GoldenEye (1995)::Action|Adventure|Thriller::2
13::Balto (1995)::Animation|Children's::1
14::Nixon (1995)::Drama::1
17::Sense and Sensibility (1995)::Drama|Romance::1
23::Assassins (1995)::Thriller::3
47::Seven (Se7en) (1995)::Crime|Thriller::2
356::Forrest Gump (1994)::Comedy|Romance|War::5
3147::Green Mile, The (1999)::Drama|Thriller::5
593::Silence of the Lambs, The (1991)::Drama|Thriller::2
2028::Saving Private Ryan (1998)::Action|Drama|War::5
838::Emma (1996)::Comedy|Drama|Romance::1
1721::Titanic (1997)::Drama|Romance::5
2628::Star Wars: Episode I - The Phantom Menace (1999)::Action|Adventure|Fantasy|Sci-Fi::4
1608::Air Force One (1997)::Action|Thriller::4
165::Die Hard: With a Vengeance (1995)::Action|Thriller::4
589::Terminator 2: Judgment Day (1991)::Action|Sci-Fi|Thriller::2
"""

# Store every rating under the user id 'fan', keyed by movie title exactly
# like the other entries of `critics`.
for entry in fan_score_list.splitlines():
    _mid, movie_title, _genres, stars = entry.split('::')
    critics.setdefault('fan', {})[movie_title] = float(stars)

In [5]:
# Movies without a rating from 'fan', one per line in the form
# mid::title::genres
fan_nonscore_list = """318::Shawshank Redemption, The (1994)::Drama
527::Schindler's List (1993)::Drama|War
2959::Fight Club (1999)::Drama
393::Street Fighter (1994)::Action
3285::Beach, The (2000)::Adventure|Drama
2571::Matrix, The (1999)::Action|Sci-Fi|Thriller
1270::Back to the Future (1985)::Comedy|Sci-Fi
3578::Gladiator (2000)::Action|Drama
1200::Aliens (1986)::Action|Sci-Fi|Thriller|War
2858::American Beauty (1999)::Comedy|Drama
22::Copycat (1995)::Crime|Drama|Thriller
"""

# Map each of those titles to a placeholder value; only the titles are
# consumed by the prediction functions.
unrated_titles = [entry.split('::')[1] for entry in fan_nonscore_list.splitlines()]
predicts = dict.fromkeys(unrated_titles, 'unknow')

### Common Function Definition ###

In [6]:
from math import sqrt
def sim_distance(prefs, p1, p2):
    '''
    Distance-based similarity between the entries p1 and p2 of prefs.

    The Euclidean distance d is taken over the items present in both
    prefs[p1] and prefs[p2]; the score is 1 / (1 + d), so identical ratings
    give 1.0. Returns 0 when the two entries share no item.
    '''
    mine = prefs[p1]
    # One squared rating gap per item that both entries contain
    squared_gaps = [(mine[key] - prefs[p2][key]) ** 2
                    for key in mine if key in prefs[p2]]
    # Nothing in common: no basis for a score
    if not squared_gaps:
        return 0
    return 1 / (1 + sqrt(sum(squared_gaps)))

# Similarity (Pearson correlation)
def sim_pearson(prefs, p1, p2):
    '''
    Returns the Pearson correlation coefficient between the ratings of p1
    and p2, computed over the items present in both prefs[p1] and prefs[p2].

    Returns 0 when there is no common item or when either side has no
    variance over the common items (the coefficient is undefined then).
    Otherwise the result is a float in [-1.0, 1.0].
    '''
    # Items rated by both sides
    shared = [item for item in prefs[p1] if item in prefs[p2]]
    # If there are no ratings in common, return 0
    if not shared:
        return 0
    # Float count: keeps the divisions below exact even for integer ratings
    # under Python 2 division rules
    n = float(len(shared))
    # Sums of all the preferences
    sum1 = sum(prefs[p1][it] for it in shared)
    sum2 = sum(prefs[p2][it] for it in shared)
    # Sums of the squares
    sum1Sq = sum(pow(prefs[p1][it], 2) for it in shared)
    sum2Sq = sum(pow(prefs[p2][it], 2) for it in shared)
    # Sum of the products
    pSum = sum(prefs[p1][it] * prefs[p2][it] for it in shared)
    # Numerator and the two variance-like terms of the Pearson score
    num = pSum - sum1 * sum2 / n
    var1 = sum1Sq - pow(sum1, 2) / n
    var2 = sum2Sq - pow(sum2, 2) / n
    # Rounding can push a (mathematically non-negative) term slightly below
    # zero; treat that like zero variance instead of letting sqrt() raise.
    if var1 <= 0 or var2 <= 0:
        return 0
    den = sqrt(var1 * var2)
    if den == 0:
        return 0
    # Clamp away floating-point overshoot such as 1.0000000000000033
    return max(-1.0, min(1.0, num / den))

# Return the K most similar entries
def topMatches(prefs, person, n=5, similarity=sim_pearson):
    '''
    Rank every other key of prefs by its similarity to person.

    Returns at most n (score, key) tuples, highest score first. The
    similarity function is called as similarity(prefs, person, other).
    '''
    ranked = sorted(
        ((similarity(prefs, person, other), other)
         for other in prefs if other != person),
        reverse=True)
    return ranked[:n]

# Return the K most similar users
def topMatchesUser(prefs, person, n=5, min_item=5, similarity=sim_pearson):
    '''
    Like a top-n similarity ranking, but only users who share at least
    min_item rated movies with person are scored.

    Returns at most n (score, user) tuples, highest score first.
    '''
    candidates = []
    for other in prefs:
        if other != person:
            # Number of movies rated by both users
            overlap = sum(1 for movie in prefs[person] if movie in prefs[other])
            if overlap >= min_item:
                candidates.append((similarity(prefs, person, other), other))
    return sorted(candidates, reverse=True)[:n]

def topMatchesItem(prefs, movie, n=10, min_item=5, similarity=sim_pearson):
    '''
    Rank the other keys of an item-centric prefs mapping by similarity to
    movie, skipping every key that has fewer than min_item inner keys in
    common with movie.

    Returns at most n (score, key) tuples, highest score first.
    '''
    def shared_count(other):
        # How many inner keys of prefs[movie] also appear in prefs[other]
        return len([u for u in prefs[movie] if u in prefs[other]])

    eligible = (other for other in prefs
                if other != movie and shared_count(other) >= min_item)
    ranked = [(similarity(prefs, movie, other), other) for other in eligible]
    ranked.sort(reverse=True)
    return ranked[:n]
            

# Transpose
def transformPrefs(prefs):
    '''
    Swap the two key levels of a nested rating dictionary:
    {person: {item: rating}} becomes {item: {person: rating}}.

    The input is left unchanged; persons without any rating do not appear
    in the result.
    '''
    flipped = {}
    for person, ratings in prefs.items():
        for item, score in ratings.items():
            if item not in flipped:
                flipped[item] = {}
            # Same rating, filed under the item instead of the person
            flipped[item][person] = score
    return flipped


### 1. Movie Score Prediction ###

In [7]:
import time

# Find the items most similar to each item of the given list
def calculateAllSimilarItems(prefs, item_list, n=10, similarity=sim_pearson):
    '''
    For every item of item_list that occurs in prefs, return its n most
    similar items.

    prefs is person-centric ({person: {item: rating}}); it is transposed
    internally. The result maps item -> list of (score, other_item) tuples
    as produced by topMatches, using the given similarity function.
    '''
    result = {}
    # Invert the preference matrix to be item-centric
    itemPrefs = transformPrefs(prefs)
    for item in itemPrefs:
        if item in item_list:
            # Forward the caller's similarity function; previously the
            # argument was accepted but topMatches always used its default.
            result[item] = topMatches(itemPrefs, item, n=n, similarity=similarity)
    return result

# Calculate the scores of the movies by user-based CF
def calculateMoviesScoreByUserBased(prefs, person, movie_list, similarity=sim_pearson,
                                    n=10, min_item=10):
    '''
    Predict person's rating for each movie of movie_list from the ratings
    of the most similar users.

    The neighbours are the n users returned by topMatchesUser (each sharing
    at least min_item rated movies with person). A movie's score is the
    similarity-weighted average of the neighbours' ratings for it; movies
    that no usable neighbour has rated are left out.

    Returns a list of (score, movie) tuples, highest score first.
    '''
    totals = {}
    simSums = {}
    # Pass the caller's similarity function on (it used to be ignored here)
    neighbours = topMatchesUser(prefs, person, n, min_item, similarity)
    for sim, other in neighbours:
        # Non-positive weights would distort the weighted average and can
        # make the denominator zero, so such neighbours are not used.
        if sim <= 0:
            continue
        for item in movie_list:
            if item not in prefs[other]:
                continue
            # Rating weighted by similarity
            totals[item] = totals.get(item, 0) + prefs[other][item] * sim
            # Sum of the similarities that contributed to this movie
            simSums[item] = simSums.get(item, 0) + sim

    # An earlier variant of this function weighted every user whose
    # similarity exceeded 0.74 instead of using the top-n neighbours.

    # Normalise each total by its weight sum and rank from best to worst
    rankings = sorted(((total / simSums[item], item)
                       for (item, total) in totals.items()),
                      reverse=True)
    return rankings

# Calculate the scores of the movies by item-based CF
def calculateMoviesScoreByItemBased(prefs, person, movie_list, similarity=sim_pearson,
                                    min_common=10):
    '''
    Predict person's rating for each movie of movie_list from the movies
    person has already rated.

    For every (rated movie, candidate movie) pair with at least min_common
    raters in common and a positive similarity, the rating is weighted by
    that similarity; a candidate's score is the weighted average. Candidates
    without any qualifying pair, or without any rating in prefs, are left
    out.

    Returns a list of (score, movie) tuples, highest score first.
    '''
    userRatings = prefs[person]
    itemPrefs = transformPrefs(prefs)
    # A candidate nobody has rated has no entry in itemPrefs; skipping it
    # avoids a KeyError in the overlap check below.
    candidates = [movie for movie in movie_list if movie in itemPrefs]
    scores = {}
    totalSim = {}
    # Loop over items rated by this user
    for (item, rating) in userRatings.items():
        item_raters = itemPrefs[item]
        for movie in candidates:
            # Require enough common raters for the similarity to be meaningful
            common = sum(1 for u in itemPrefs[movie] if u in item_raters)
            if common < min_common:
                continue
            sim = similarity(itemPrefs, item, movie)
            if sim <= 0:
                continue
            # Weighted sum of rating times similarity
            scores[movie] = scores.get(movie, 0) + sim * rating
            # Sum of all the similarities
            totalSim[movie] = totalSim.get(movie, 0) + sim

    # Divide each total score by total weighting to get an average and
    # return the rankings from highest to lowest
    rankings = sorted(((score / totalSim[movie], movie)
                       for (movie, score) in scores.items()),
                      reverse=True)
    return rankings

In [11]:
# User-based CF: predict the unrated movies for 'fan' and time the call
start = time.time()
rankings = calculateMoviesScoreByUserBased(critics, 'fan', predicts)
# Single-argument print(...) emits the same text as the former print
# statement and is also valid on Python 3.
print("user-based CF spent: %s  seconds" % (time.time() - start))

user-based CF spent: 0.0722999572754  seconds


In [12]:
# Show the user-based predictions, best first, rounded to one decimal
for score, item in rankings:
    print("%s %s" % (item, round(score, 1)))

Shawshank Redemption, The (1994) 4.7
Schindler's List (1993) 4.6
American Beauty (1999) 4.5
Back to the Future (1985) 4.4
Matrix, The (1999) 4.4
Aliens (1986) 4.3
Gladiator (2000) 3.9
Copycat (1995) 3.5
Fight Club (1999) 3.3
Street Fighter (1994) 3.0
Beach, The (2000) 3.0


In [13]:
# Item-based CF: predict the unrated movies for 'fan' and time the call
start = time.time()
rankings_itembased = calculateMoviesScoreByItemBased(critics, 'fan', predicts)
# Single-argument print(...) emits the same text as the former print
# statement and is also valid on Python 3.
print("item-based CF spent: %s  seconds" % (time.time() - start))

item-based CF spent: 1.11612606049  seconds


In [15]:
# Show the item-based predictions, best first, rounded to two decimals
for score, item in rankings_itembased:
    print("%s %s" % (item, round(score, 2)))

Shawshank Redemption, The (1994) 3.72
Back to the Future (1985) 3.46
Street Fighter (1994) 3.4
Gladiator (2000) 3.27
Beach, The (2000) 3.2
Schindler's List (1993) 3.17
Copycat (1995) 3.16
Matrix, The (1999) 3.12
Aliens (1986) 2.91
American Beauty (1999) 2.77
Fight Club (1999) 2.45


### 2. Favorite Movies of 45-Year-Old Female Users ###

In [17]:
# Keep only the users recorded as female with an age value of 45.
# NOTE(review): the dataset presumably stores age as a bracket code rather
# than an exact age -- confirm against the MovieLens README.
female45 = {
    uid: profile
    for uid, profile in users.items()
    if profile['sex'] == 'F' and profile['age'] == 45
}

In [18]:
MIN_RATINGS = 50  # a movie needs at least this many ratings from the group
TOP_N = 10        # number of movies to report

# Accumulate the rating sum and rating count per movie over the users in
# female45
female45_rating = {}
for person in critics:
    if person not in female45:
        continue
    for item, score in critics[person].items():
        stats = female45_rating.setdefault(item, {'summary': 0, 'count': 0})
        stats['summary'] += score
        stats['count'] += 1

# Drop movies with too few ratings and compute the average of the rest.
# Iterating over a copy of the keys allows deleting while looping.
for movie in list(female45_rating):
    stats = female45_rating[movie]
    if stats['count'] < MIN_RATINGS:
        del female45_rating[movie]
        continue
    stats['average'] = stats['summary'] / float(stats['count'])


def _print_movies(entries):
    """Print one 'title,count,average' line per (title, stats) entry."""
    for title, stats in entries:
        print(title + ',' + str(stats['count']) + ',' + str(stats['average']))


# Most-rated movies first (ties broken by average); sorted() works on both
# Python 2 lists and Python 3 dict views.
by_popularity = sorted(female45_rating.items(),
                       key=lambda x: (x[1]['count'], x[1]['average']),
                       reverse=True)
most_rated = by_popularity[:TOP_N]

# Final answer: the TOP_N most-rated movies reordered by average rating
favorites = sorted(most_rated, key=lambda x: x[1]['average'], reverse=True)

_print_movies(most_rated)
print('----final result----')
_print_movies(favorites)

American Beauty (1999),86,4.12790697674
Shakespeare in Love (1998),81,4.23456790123
Schindler's List (1993),71,4.70422535211
Silence of the Lambs, The (1991),71,3.94366197183
Forrest Gump (1994),68,4.08823529412
Groundhog Day (1993),67,3.82089552239
Star Wars: Episode IV - A New Hope (1977),65,4.41538461538
Fargo (1996),65,4.13846153846
E.T. the Extra-Terrestrial (1982),65,4.10769230769
Babe (1995),63,4.04761904762
----final result----
Schindler's List (1993),71,4.70422535211
Star Wars: Episode IV - A New Hope (1977),65,4.41538461538
Shakespeare in Love (1998),81,4.23456790123
Fargo (1996),65,4.13846153846
American Beauty (1999),86,4.12790697674
E.T. the Extra-Terrestrial (1982),65,4.10769230769
Forrest Gump (1994),68,4.08823529412
Babe (1995),63,4.04761904762
Silence of the Lambs, The (1991),71,3.94366197183
Groundhog Day (1993),67,3.82089552239


In [19]:
#import pandas as pd
#users2 = {}
#for line in f_users:
    #(uid,sex,age,occupation,zipcode) = line.split('::')
    #users2[uid] = {
        #'uid':uid,
        #'sex':sex,
        #'age':age
    #}
#df_users = pd.DataFrame(users2)
#df_users =df_users.T
#df_users = df_users[(df_users.age == '45') & (df_users.sex == 'F' )]

#movies2 = {}
#for line in f_movies:
    #(mid,title,genres) = line.split('::')
    #movies2[mid] = {
        #'mid':mid,
        #'title':title
    #}    
#df_movies = pd.DataFrame(movies2).T

#critics2 = {}
#for line in f_ratings:
    #(uid,mid,rating,timestamp) = line.split('::')
    #critics2[uid,mid] = {
            #'uid':uid,
            #'mid':mid,
            #'rating':int(rating)
    #}
#df_ratings = pd.DataFrame(critics2).T

In [28]:
#df_user_rating = pd.merge(df_users,df_ratings,on='uid',how='left')
#df_user_rating_movie = pd.merge(df_user_rating,df_movies,on='mid',how='left')
#df_user_rating_movie['rating'] =df_user_rating_movie['rating'].astype(int)

In [37]:
#df_avg = df_user_rating_movie.drop('age',axis=1) \
#.drop('sex',axis=1).drop('uid',axis=1) \
#.groupby(['mid','title']).agg(['count','sum','mean']) \
#.sort_values(by=[('rating','count'),('rating','mean')],ascending=False)

#print(df_avg.head(10).sort_values(by=('rating','mean'),ascending=False))

                                               rating               
                                                count  sum      mean
mid  title                                                          
527  Schindler's List (1993)                       71  334  4.704225
260  Star Wars: Episode IV - A New Hope (1977)     65  287  4.415385
2396 Shakespeare in Love (1998)                    81  343  4.234568
608  Fargo (1996)                                  65  269  4.138462
2858 American Beauty (1999)                        86  355  4.127907
1097 E.T. the Extra-Terrestrial (1982)             65  267  4.107692
356  Forrest Gump (1994)                           68  278  4.088235
34   Babe (1995)                                   63  255  4.047619
593  Silence of the Lambs, The (1991)              71  280  3.943662
1265 Groundhog Day (1993)                          67  256  3.820896


### 3. Similar Movies by Item-based CF ###

In [21]:
# Transpose the ratings to {movie: {user: rating}}, then display the 10
# movies most similar to Toy Story among those sharing at least 30 raters
# with it.
itemPrefs = transformPrefs(critics)
topMatchesItem(itemPrefs, movie='Toy Story (1995)', n=10, min_item=30)

[(0.6303863801210986, 'Toy Story 2 (1999)'),
 (0.5746957711326909, 'Hear My Song (1991)'),
 (0.5661033405974422, 'Woman on Top (2000)'),
 (0.559059839179094, 'Polish Wedding (1998)'),
 (0.5472774683663487, 'Star Kid (1997)'),
 (0.5432567988390745, 'Crimson Pirate, The (1952)'),
 (0.5066264033306871, 'House Arrest (1996)'),
 (0.5065204054284228, 'Andre (1994)'),
 (0.4998487736618072, 'Little Big League (1994)'),
 (0.491433124188546, 'Incredible Journey, The (1963)')]

In [22]:
# Display the ratings stored for the user id 'fan' (title -> rating)
critics['fan']

{'Air Force One (1997)': 4.0,
 'Assassins (1995)': 3.0,
 'Balto (1995)': 1.0,
 'Die Hard: With a Vengeance (1995)': 4.0,
 'Emma (1996)': 1.0,
 'Forrest Gump (1994)': 5.0,
 'GoldenEye (1995)': 2.0,
 'Green Mile, The (1999)': 5.0,
 'Jumanji (1995)': 4.0,
 'Nixon (1995)': 1.0,
 'Saving Private Ryan (1998)': 5.0,
 'Sense and Sensibility (1995)': 1.0,
 'Seven (Se7en) (1995)': 2.0,
 'Silence of the Lambs, The (1991)': 2.0,
 'Star Wars: Episode I - The Phantom Menace (1999)': 4.0,
 'Sudden Death (1995)': 2.0,
 'Terminator 2: Judgment Day (1991)': 2.0,
 'Titanic (1997)': 5.0,
 'Toy Story (1995)': 5.0}

In [120]:
# Top10 Distance Similarity
#def topMatchesdistance(prefs, movies, n=10, similarity=sim_distance):
    #'''
    #Returns the best matches for person from the prefs dictionary.
    #Number of results and similarity function are optional params.
    #'''
    #scores = [(similarity(prefs, movies, other), other) for other in prefs
              #if other != movies]
    #scores.sort()
    #scores.reverse()

    #return scores[0:n]

# Top10 Pearson Similarity
#def topMatchespearson(prefs, movies, n=10, similarity=sim_pearson):
    #'''
    #Returns the best matches for person from the prefs dictionary.
    #Number of results and similarity function are optional params.
    #'''
    #scores = [(similarity(prefs, movies, other), other) for other in prefs
              #if other != movies]
    #scores.sort()
    #scores.reverse()

    #return scores[0:n]

#def calculateSimilarItemspearson(prefs, item_search, n=9999, similarity=sim_pearson):
    #result = {}
    ## Invert the preference matrix to be item-centric
    #itemPrefs = transformPrefs(prefs)
    #for item in itemPrefs:
        ## Find the most similar items to this one
        #if item in item_search:
            #similar_item = topMatchespearson(itemPrefs, item, n=n, similarity=sim_pearson)
            #result[item] = similar_item
    #return result


#def calculateSimilarItemsdistance(prefs, item_search, n=9999, similarity=sim_distance):
    #result = {}
    #itemresult = {}
    ## Invert the preference matrix to be item-centric
    #itemPrefs = transformPrefs(prefs)
    #for key,value in itemPrefs.items():
        #temp_count = len(list(filter(bool, value)))
        #if temp_count >= 10:
            #itemresult.update(itemPrefs)
    #for item in itemresult:
        ## Find the most similar items to this one
        #if item in item_search:
            #similar_item = topMatchesdistance(itemresult, item, n=n, similarity=sim_distance)
            #result[item] = similar_item
    #return result


In [121]:
#movie_name = 'Toy Story (1995)'

In [122]:
#sim_list_distance = calculateSimilarItemsdistance(critics, {movie_name: ''})

In [123]:
#for r in sim_list_distance[movie_name][0:10]:
    #print r

(1.0, 'Yankee Zulu (1994)')
(1.0, 'With Friends Like These... (1998)')
(1.0, 'To Have, or Not (1995)')
(1.0, 'Ten Benny (1997)')
(1.0, 'Sticky Fingers of Time, The (1997)')
(1.0, 'Sonic Outlaws (1995)')
(1.0, 'Small Wonders (1996)')
(1.0, 'Show, The (1995)')
(1.0, 'Rosie (1998)')
(1.0, 'Promise, The (Versprechen, Das) (1994)')


In [124]:
#sim_list_pearson = calculateSimilarItemspearson(critics, {movie_name: ''})

In [125]:
#for r in sim_list_pearson[movie_name][0:10]:
    #print r

(1.0000000000000033, 'Black Sunday (La Maschera Del Demonio) (1960)')
(1.0, 'Talk of Angels (1998)')
(1.0, 'Sunset Park (1996)')
(1.0, 'Stag (1997)')
(1.0, 'Small Wonders (1996)')
(1.0, 'Six Ways to Sunday (1997)')
(1.0, 'Simon Sez (1999)')
(1.0, 'Secret Agent, The (1996)')
(1.0, 'Penitentiary II (1982)')
(1.0, 'Paris, France (1993)')
