# Exploratory data analysis (EDA) and visualization of our data set

### imports

In [None]:
# imports 
import pandas as pd
import pandas_profiling
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('white')


In [None]:
# plot parameters: global matplotlib styling, applied in one update call
plt.rcParams.update({
    'axes.labelsize': 20,
    'axes.titlesize': 25,
    'xtick.labelsize': 18,
    'ytick.labelsize': 18,
    'axes.edgecolor': 'black',
    'axes.facecolor': 'white',  # or EAEAF2
    'font.size': 16,
})

### adding a function for bar plots

In [None]:
def barh(x, y, title, xlabel, figsize=(20, 16), color='#99BDCB'):
    """Draw a horizontal bar chart and print each bar's value at its tip.

    Parameters
    ----------
    x : sequence
        Bar labels, one per bar (passed to ``Axes.barh`` as the y positions).
    y : sequence of numbers
        Bar lengths; each value is also written as text at the end of its bar.
    title : str
        Title of the chart.
    xlabel : str
        Label for the horizontal (value) axis.
    figsize : tuple, optional
        Figure size forwarded to ``plt.subplots``. Defaults to ``(20, 16)``.
    color : str, optional
        Bar fill colour. Defaults to ``'#99BDCB'``.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # plt.subplots returns a (Figure, Axes) pair; drawing on the Axes directly
    # avoids depending on whichever figure pyplot currently considers active.
    fig, ax = plt.subplots(figsize=figsize)
    ax.barh(x, y, color=color)

    # The i-th bar sits at position i, so (value, i) is the tip of that bar.
    for index, value in enumerate(y):
        ax.text(value, index, str(value), va='center')

    ax.set_title(title)
    ax.set_xlabel(xlabel)
    plt.show()

### read in the cleaned and merged csv from the data folder

In [None]:
# Load the cleaned, merged data set; the bare name on the last line makes the
# frame this cell's output.
movies = pd.read_csv(filepath_or_buffer='../../../data/joined_dfs_lc')
movies

### make a dataframe with average rating for each title

In [None]:
# Mean rating per title, kept as a one-column frame indexed by title.
rated = movies.groupby('title')['rating'].mean().to_frame()
# Display only: highest mean rating first (`rated` itself is not reordered).
rated.sort_values(by='rating', ascending=False)

In [None]:
# Quick check of the column labels currently present in `rated`.
rated.columns

#### add the count of ratings for each title

In [None]:
# Count of non-null ratings per title; the grouped Series is aligned with
# `rated` on the shared title index when assigned as a column.
rated['num_rating'] = movies.groupby('title')['rating'].count()
rated.head()

#### histogram showing the distribution of mean ratings per title

In [None]:
# Histogram of per-title mean ratings: every observation is one movie, so the
# bar heights count movies, not individual ratings.
# plt.subplots returns a (Figure, Axes) pair; unpack it and draw on the Axes.
fig, ax = plt.subplots(figsize=(10, 8))
ax.hist(rated['rating'], bins=10, color='#99BDCB')
ax.set_xticks([0, 0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5])
ax.set_title('Distribution of Mean Ratings')
ax.set_xlabel('Mean Rating')
ax.set_ylabel('Number of Movies')
plt.show()

### some visualizing

In [None]:
# One bar per title: x is the title's mean rating, height is its rating count.
# plt.subplots returns a (Figure, Axes) pair; unpack it and draw on the Axes.
# NOTE(review): bars use matplotlib's default width on a continuous x axis, so
# neighbouring titles overlap; the scatter plot in the next cell shows the same
# two columns without that problem.
fig, ax = plt.subplots(figsize=(20, 16))
ax.bar(rated['rating'], rated['num_rating'], color='#99BDCB')
ax.set_title('Distribution of Ratings per Movie')
ax.set_xlabel('Average Ratings')
ax.set_ylabel('Number of Ratings')
plt.show()

In [None]:
# One point per title: mean rating (x) against number of ratings (y).
# plt.subplots returns a (Figure, Axes) pair; unpack it and draw on the Axes.
fig, ax = plt.subplots(figsize=(20, 16))
ax.scatter(rated['rating'], rated['num_rating'], c='#99BDCB')
ax.set_title('Distribution of Ratings per Movie')
# x holds each title's average rating, the same quantity as on the bar chart.
ax.set_xlabel('Average Ratings')
ax.set_ylabel('Number of Ratings')
plt.show()

#### show the  10 most rated titles

In [None]:
# Ten titles with the largest rating counts, most rated first.
most_rated = rated.sort_values(by='num_rating', ascending=False).head(10)
most_rated

#### here are the 10 most rated movies visualized

In [None]:
# Bar length is the number of ratings for each of the ten most rated titles.
barh(
    x=most_rated.index,
    y=most_rated['num_rating'],
    title='10 Most Rated Movies',
    xlabel='Number of Ratings',
)

#### and the average rating for the 10 Most Rated Movies

In [None]:
# Same ten titles, now with their mean rating (rounded to 2 decimals) as the bar length.
barh(
    x=most_rated.index,
    y=most_rated['rating'].round(2),
    title='10 Most Rated Movies',
    xlabel='Average Ratings',
)

how about the least rated movies?

In [None]:
# The first 8000 titles when ordered by rating count, fewest ratings first.
least_rated = rated.sort_values(by='num_rating').head(8000)
least_rated

### how many different genres are represented, and what are the most rated genres?

In [None]:
# Non-null title count per distinct `genres` value; the count column keeps the name 'title'.
genre = movies.groupby('genres')['title'].count().to_frame()
genre.info()

In [None]:
# Ten `genres` values with the highest counts, largest first.
most_rated_genre = genre.sort_values(by='title', ascending=False).head(10)
most_rated_genre

In [None]:
# The 'title' column of `most_rated_genre` holds the per-genre count, not a title.
barh(
    x=most_rated_genre.index,
    y=most_rated_genre['title'],
    title='10 Most Rated Genres',
    xlabel='Number of Ratings',
)

### let's look at users

In [None]:
# Non-null rating count per userId; the count column keeps the name 'rating'.
users = movies.groupby('userId')['rating'].count().to_frame()
users.shape

It looks like we have 610 users.

In [None]:
# The 20 users with the most ratings, most active first.
users.sort_values(by='rating', ascending=False).head(20)

the "top 12" users have each rated over 1000 movies.

In [None]:
# The 75 users with the fewest ratings, least active first.
users.sort_values(by='rating').head(75)

On the flip side, around 75 users have rated 25 movies or fewer.

In [None]:
# Histogram of ratings per user across all users (50 bins).
# plt.subplots returns a (Figure, Axes) pair; unpack it and draw on the Axes.
fig, ax = plt.subplots(figsize=(20, 16))
ax.hist(users['rating'], bins=50, color='#789698')
ax.set_title('Number of Ratings per User')
ax.set_xlabel('Number of Ratings')
ax.set_ylabel('Number of Users')
plt.show()

That is very hard to read. Most users have 20-50ish ratings. A few have rated thousands of movies.

I'm going to see if I can make these a little easier to view...

In [None]:
# Histogram restricted to the 0-250 range; users outside `range` are not drawn.
# plt.subplots returns a (Figure, Axes) pair; unpack it and draw on the Axes.
fig, ax = plt.subplots(figsize=(11, 7))
ax.hist(users['rating'], bins=12, range=(0, 250), color='#789698')
ax.set_title('Number of Ratings per User (under 250)')
ax.set_xlabel('Number of Ratings')
ax.set_ylabel('Number of Users')
fig.tight_layout()
#fig.savefig('Users with fewer than 250 ratings.png')
plt.show()

In [None]:
# Histogram restricted to the 500-2700 range; users outside `range` are not drawn.
# plt.subplots returns a (Figure, Axes) pair; unpack it and draw on the Axes.
fig, ax = plt.subplots(figsize=(11, 7))
ax.hist(users['rating'], bins=20, range=(500, 2700), color='#789698')
ax.set_title('Number of Ratings per User (over 500)')
ax.set_xlabel('Number of Ratings')
ax.set_ylabel('Number of Users')
fig.tight_layout()
#fig.savefig('Users with more than 500 ratings.png')
plt.show()

Should we look at adding a minimum threshold for number of movies rated by a user, and/or minimum number of reviews per movie to include in our model?

In order to handle outliers, I will filter out the movies that have fewer than 10 ratings, and users who rated fewer than 30 movies

In [None]:
# Inclusive minimums: a title / user is kept when it has AT LEAST this many
# rows in `movies`. Only entries with fewer rows are filtered out, so an entry
# sitting exactly on the threshold stays in.
min_movie_ratings = 10
min_user_ratings = 30

# value_counts() yields the number of rows per title / per userId.
# Assumes one row of `movies` per individual rating - TODO confirm against the merge step.
ratings_per_movie = movies['title'].value_counts()
filter_movies = ratings_per_movie[ratings_per_movie >= min_movie_ratings].index.tolist()

ratings_per_user = movies['userId'].value_counts()
filter_users = ratings_per_user[ratings_per_user >= min_user_ratings].index.tolist()

# Keep only rows whose title AND user both meet their minimum.
df_new = movies[movies['title'].isin(filter_movies) & movies['userId'].isin(filter_users)]

print('The original data frame shape:\t{}'.format(movies.shape))

print('The new data frame shape:\t{}'.format(df_new.shape))

#### On second thought... this is backwards. I need to keep the entire dataset and try to level the playing field between "more popular" and "less popular" items.

This is the long tail problem. We don't want to just return the most popular items. If a movie with fewer ratings matches our user's profile, we want that recommendation to show up.
