In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

#### Original files

In [2]:
# Base directory for all relative paths. Note that '__file__' is a plain string
# here, not the module attribute: realpath() resolves it against the current
# working directory and dirname() strips the dummy file name again.
fileDir = os.path.dirname(os.path.realpath('__file__'))

# Input CSV locations, expressed relative to the base directory.
movies_path, ratings_path, tags_path = (
    os.path.join(fileDir, relative_path)
    for relative_path in (
        '../processed_data/movies_content.csv',
        '../processed_data/ratings_content.csv',
        '../data/tags.csv',
    )
)

In [3]:
# Read the three input tables with pandas' default CSV settings.
movies, ratings, tags = map(pd.read_csv, (movies_path, ratings_path, tags_path))

In [4]:
# Report (rows, columns) of each table as loaded.
for label, frame in (('movies: ', movies), ('ratings: ', ratings), ('tags: ', tags)):
    print(label, frame.shape)

movies:  (53889, 9)
ratings:  (27753444, 5)
tags:  (1108997, 4)


In [5]:
# Number of distinct values in the old_title column.
movies['old_title'].nunique()

53818

#### Subsetting movies

In [6]:
# Boolean Series indexed by movieId: True where the movie has more than 2 ratings.
subset_movies_by_count = ratings.groupby('movieId')['userId'].count() > 2
# Ids of the movies to keep, i.e. rarely rated movies (2 ratings or fewer) are dropped.
movie_id = subset_movies_by_count[subset_movies_by_count].index
movies = movies.loc[movies['movieId'].isin(movie_id)]
ratings = ratings.loc[ratings['movieId'].isin(movie_id)]
# tags is deliberately left unfiltered here (its shape is unchanged in the next
# cell); the same movieId filter was previously sketched for tags, genome_scores
# and links but is not applied.

In [7]:
# Report (rows, columns) of each table after the movie filter.
for label, frame in (('movies: ', movies), ('ratings: ', ratings), ('tags: ', tags)):
    print(label, frame.shape)

movies:  (37410, 9)
ratings:  (27730641, 5)
tags:  (1108997, 4)


#### Subsetting users

Filtering out users whose number of ratings lies more than 5 interquartile ranges (IQR) above the median number of ratings per user, and then users with 2 or fewer ratings.

In [8]:
# Number of ratings submitted by each user. Computed once and reused below,
# because the groupby scans the entire ratings frame.
ratings_per_user = ratings.groupby('userId')['rating'].count()

median = ratings_per_user.median()
print('median: ', median)
iqr = ratings_per_user.quantile(.75) - ratings_per_user.quantile(.25)
print('iqr: ', iqr)
# Upper cut-off on ratings per user: median + 5 * IQR, so that only super
# extreme users lie above it.
outliers = median + 5 * iqr
print('outliers: ', outliers)

# Boolean Series indexed by userId: True where the user exceeds the cut-off.
x = ratings_per_user > outliers
outlier_user_ids = x.index[x]
n_outlier_users = len(outlier_user_ids)
# Number of rating rows written by those users.
n_outlier_ratings = ratings[ratings.userId.isin(outlier_user_ids)].shape[0]
# Average ratings per outlier user; NaN instead of ZeroDivisionError when no
# user exceeds the cut-off.
rate = n_outlier_ratings / n_outlier_users if n_outlier_users else float('nan')
# NOTE: {1} is a count of rating rows even though the message says "movies";
# the wording is kept so the output matches the recorded run.
print('top {0} users contribute to ratings for {1} movies at a rate {2}'.format(
    n_outlier_users, n_outlier_ratings, rate))

median:  30.0
iqr:  80.0
outliers:  430.0
top 12892 users contribute to ratings for 10553200 movies at a rate 818.5851690971145


In [None]:
# Keep only ratings from users whose rating count is strictly below the
# `outliers` cut-off computed earlier.
# NOTE(review): the outlier report uses `> outliers`, so a user with exactly
# `outliers` ratings is not reported as an outlier but is still dropped here.
subset_users_by_count = ratings.groupby('userId')['rating'].count().lt(outliers)
user_id = subset_users_by_count[subset_users_by_count].index
ratings = ratings.loc[ratings['userId'].isin(user_id)]
# tags is not filtered by user here (the equivalent filter is left disabled).

In [None]:
# Keep only ratings from users who still have more than 2 ratings.
subset_users_by_count_min = ratings.groupby('userId')['rating'].count().gt(2)
user_id_min = subset_users_by_count_min[subset_users_by_count_min].index
ratings = ratings.loc[ratings['userId'].isin(user_id_min)]

In [None]:
# (rows, columns) of the ratings frame at this point in the notebook.
ratings.shape

In [None]:
# (rows, columns) of tags restricted to the users in `user_id`. That index comes
# from the upper-threshold filter only; `user_id_min` is not used here.
tags.loc[tags['userId'].isin(user_id)].shape

### Saving File

In [None]:
# Write the filtered ratings next to the other processed data, without the
# DataFrame index column.
rating_preprocessed_path = os.path.join(fileDir, '../processed_data/ratings_4std.csv')
ratings.to_csv(rating_preprocessed_path, index=False)

### Visualizing rating per person after filtering

In [None]:
# Occurrences of each userId in the filtered ratings, most frequent first.
x = ratings.userId.value_counts()

In [None]:
# users: the userIds (index of x); counts: the per-user rating counts (values of x).
users, counts = x.index[:], x.iloc[:]

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
plt.figure(figsize=(12,5))
# plt.hist(filtered_2, 50, density=False, facecolor='g', alpha=0.75)
sns.distplot(counts, 50, kde=False, color='blue')
plt.xlabel('Number of Movies Rated')
plt.ylabel('Number of Users')
plt.title('Distribution of User Behavior - Filtered')
#plt.axis([10, 100])
plt.grid(True)
plt.show()

In [None]:
# Frequency table of the per-user rating counts: for each number of ratings
# (n_ratings), how many users submitted exactly that many (n_users), ordered by
# n_ratings. Explicit axis/column names avoid relying on the names that
# value_counts() assigns, which differ between pandas 1.x and 2.x and made the
# former reset_index().sort_values(by='index') fail on pandas >= 2.0.
(
    counts.value_counts()
    .sort_index()
    .rename_axis('n_ratings')
    .reset_index(name='n_users')
)