# Downloading Rating Data

## Navigating through anonymous web data

In [1]:
import urllib.request
import os.path

filename = "anonymous-msweb.data"
if not os.path.exists("anonymous-msweb.data"):
    url = "https://github.com/amirkrifa/ms-web-dataset/\
raw/master/anonymous-msweb.data"
    urllib.request.urlretrieve(url, filename)

### Parsing the data file

In [2]:
import codecs
import collections

# Open the file.
file = codecs.open(filename, 'r')

# Setup for attributes.
attribute = collections.namedtuple(
    'page', ['id', 'description', 'url'])
attributes = {}

# Setup for users
current_user_id = None
current_user_ids = []
user_visits = {}

# Setup for vroots
page_visits = {}

# Process the data one line at a time and place
# each record in the appropriate storage unit.
for line in file:
    chunks = line.split(',')
    entry_type = chunks[0]
    
    if entry_type == 'A':
        type, id, ignored, description, url = chunks
        attributes[int(id)] = attribute(
            id=int(id), description=description, url=url)
        
    if entry_type == 'C':
        if not current_user_id == None:
            user_visits[current_user_id] = set(
                current_user_ids)
            current_user_ids = []
        current_user_id = int(chunks[2])
        
    if entry_type == 'V':
        page_id = int(chunks[1])
        current_user_ids.append(page_id)
        page_visits.setdefault(page_id, [])
        page_visits[page_id].append(current_user_id)
        
# Display the totals
print('Total Number of Attributes: ', 
      len(attributes.keys()))
print('Total Number of Users: ', len(user_visits.keys()))
print('Total Number of VRoots: ', len(page_visits.keys()))

Total Number of Attributes:  294
Total Number of Users:  32710
Total Number of VRoots:  285


### Viewing the attributes

In [3]:
for k, v in attributes.items():
    print("{:4} {:30.30} {:12}".format(
        v.id, v.description, v.url))

1287 "International AutoRoute"      "/autoroute"

1288 "library"                      "/library"
 
1289 "Master Chef Product Informati "/masterchef"

1297 "Central America"              "/centroam"

1215 "For Developers Only Info"     "/developer"

1279 "Multimedia Golf"              "/msgolf"
  
1239 "Microsoft Consulting"         "/msconsult"

1282 "home"                         "/home"
    
1251 "Reference Support"            "/referencesupport"

1121 "Microsoft Magazine"           "/magazine"

1083 "MS Access Support"            "/msaccesssupport"

1145 "Visual Fox Pro Support"       "/vfoxprosupport"

1276 "Visual Test Support"          "/vtestsupport"

1200 "Benelux Region"               "/benelux"
 
1259 "controls"                     "/controls"

1155 "Sidewalk"                     "/sidewalk"

1092 "Visual FoxPro"                "/vfoxpro"
 
1004 "Microsoft.com Search"         "/search"
  
1057 "MS PowerPoint News"           "/powerpoint"

1140 "Netherlands (Holland)"        "

### Obtaining statistics

In [4]:
nbr_visits = list(map(len, user_visits.values()))
average_visits = sum(nbr_visits) / len(nbr_visits)
one_visit = sum(x == 1 for x in nbr_visits)

print("Number of user visits: ", sum(nbr_visits))
print("Average number of visits: ", average_visits)
print("Users with just one visit: ", one_visit)

Number of user visits:  98653
Average number of visits:  3.0159889941913787
Users with just one visit:  9994


## Encountering the limits of rating data

### Obtaining the data

In [5]:
# You must have previously downloaded and extracted
# the file.
import pandas as pd

ratings = pd.read_csv("ml-20m/ratings.csv")
movies = pd.read_csv("ml-20m/movies.csv")

movie_data = pd.merge(ratings, movies, on="movieId")
print(movie_data.head())

   userId  movieId  rating   timestamp           title  \
0       1        2     3.5  1112486027  Jumanji (1995)   
1       5        2     3.0   851527569  Jumanji (1995)   
2      13        2     3.0   849082742  Jumanji (1995)   
3      29        2     3.0   835562174  Jumanji (1995)   
4      34        2     3.0   846509384  Jumanji (1995)   

                       genres  
0  Adventure|Children|Fantasy  
1  Adventure|Children|Fantasy  
2  Adventure|Children|Fantasy  
3  Adventure|Children|Fantasy  
4  Adventure|Children|Fantasy  


In [6]:
print(movie_data.groupby('title')['rating'].mean().head())

title
"Great Performances" Cats (1998)                               2.748387
#chicagoGirl: The Social Network Takes on a Dictator (2013)    3.666667
$ (Dollars) (1971)                                             2.833333
$5 a Day (2008)                                                2.871795
$9.99 (2008)                                                   3.009091
Name: rating, dtype: float64


### Massaging the data

In [7]:
reduced_movie = movie_data.loc[
    movie_data['rating'] >= 3.0]
reduced_movie = reduced_movie.drop(
    columns=['movieId','timestamp', 'genres'])

print(reduced_movie.head())
print()
print("Original Shape: {0}, New Shape: {1}".format(
    movie_data.shape, reduced_movie.shape))

   userId  rating           title
0       1     3.5  Jumanji (1995)
1       5     3.0  Jumanji (1995)
2      13     3.0  Jumanji (1995)
3      29     3.0  Jumanji (1995)
4      34     3.0  Jumanji (1995)

Original Shape: (20000263, 6), New Shape: (16486759, 3)


In [8]:
reduced_movie = reduced_movie[
    reduced_movie.groupby('title')['rating'].transform(
        'size') > 3000]

print(reduced_movie.groupby('title')[
    'rating'].count().sort_values().head())
print()
print("New shape: ", reduced_movie.shape)

title
Eastern Promises (2007)                                              3001
Triplets of Belleville, The (Les triplettes de Belleville) (2003)    3003
Bad Santa (2003)                                                     3006
Mexican, The (2001)                                                  3010
1984 (Nineteen Eighty-Four) (1984)                                   3010
Name: rating, dtype: int64

New shape:  (12083404, 3)


In [9]:
ratings = None
movies = None
movie_data = None

### Performing collaborative filtering

In [10]:
user_rating = pd.pivot_table(
    reduced_movie, 
    index='userId', 
    columns='title', 
    values='rating')

print(user_rating.head())

title   (500) Days of Summer (2009)  10 Things I Hate About You (1999)  \
userId                                                                   
1                               NaN                                NaN   
2                               NaN                                NaN   
3                               NaN                                NaN   
4                               NaN                                NaN   
5                               NaN                                NaN   

title   101 Dalmatians (1996)  \
userId                          
1                         NaN   
2                         NaN   
3                         NaN   
4                         NaN   
5                         NaN   

title   101 Dalmatians (One Hundred and One Dalmatians) (1961)  \
userId                                                           
1                                                     NaN        
2                                                  

In [11]:
YF_ratings = user_rating['Young Frankenstein (1974)']
print(YF_ratings.sort_values(ascending=False).head())

userId
60898     5.0
52548     5.0
101177    5.0
101198    5.0
28648     5.0
Name: Young Frankenstein (1974), dtype: float64


In [12]:
print(user_rating.corrwith(
    YF_ratings).sort_values(
    ascending=False).head())

title
Young Frankenstein (1974)                 1.000000
Blazing Saddles (1974)                    0.421143
Monty Python and the Holy Grail (1975)    0.300413
Producers, The (1968)                     0.297317
Magnificent Seven, The (1960)             0.291847
dtype: float64
