In [2]:
import pandas as pd
import os
# NOTE(review): this aliases the top-level matplotlib package as `plt`; the
# conventional target for that alias is matplotlib.pyplot — confirm intent.
import matplotlib as plt

In [3]:
# Load the ratings table. low_memory=False tells pandas not to parse the file
# in internal chunks, which avoids mixed-type column inference.
ratings = pd.read_csv(
    'ratings.csv',
    low_memory=False,
)

In [4]:
# Preview the first five rows of the ratings frame.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [5]:
# List the column labels of the ratings frame.
ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [6]:
# Show the dtype pandas assigned to each ratings column.
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [8]:
# Group by (movieId, rating) and count the non-null entries of the remaining
# columns (userId, timestamp) within each group.
ratings_counts = (
    ratings
    .groupby(['movieId', 'rating'], as_index=False)
    .count()
)

In [9]:
# Display the per-(movieId, rating) counts.
# NOTE(review): `ratings_counts` differs by one letter from the `rating_counts`
# pivot built further down; only this cell reads it in the visible notebook.
ratings_counts

Unnamed: 0,movieId,rating,userId,timestamp
0,1,0.5,441,441
1,1,1.0,804,804
2,1,1.5,438,438
3,1,2.0,2083,2083
4,1,2.5,1584,1584
...,...,...,...,...
239376,176267,4.0,1,1
239377,176269,3.5,1,1
239378,176271,5.0,1,1
239379,176273,1.0,1,1


In [17]:
# Build a movieId x rating table of rating counts: count the userId entries per
# (movieId, rating) pair, then spread the rating level out into columns.
rating_counts = (
    ratings
    .groupby(['movieId', 'rating'])['userId']
    .count()
    .unstack('rating')
)

In [18]:
# Preview the first five rows of the ratings frame.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [20]:
# Prefix each rating-level column label with 'rating_' (e.g. 0.5 -> 'rating_0.5').
# Labels that already carry the prefix are left untouched so that re-running
# this cell does not produce 'rating_rating_...' names.
rating_counts.columns = [
    str(col) if str(col).startswith('rating_') else 'rating_' + str(col)
    for col in rating_counts.columns
]

In [21]:
# Show the column labels of the rating-count table.
rating_counts.columns

Index(['rating_0.5', 'rating_1.0', 'rating_1.5', 'rating_2.0', 'rating_2.5',
       'rating_3.0', 'rating_3.5', 'rating_4.0', 'rating_4.5', 'rating_5.0'],
      dtype='object')

In [22]:
# Load the movie metadata that the rating counts will be merged into.
# NOTE(review): the loaded frame has an 'Unnamed: 0' column, which suggests the
# CSV was saved with its index — consider index_col=0 if nothing relies on it.
movies_df = pd.read_csv(
    'final_movies.csv',
    low_memory=False,
)

In [23]:
# List the columns available in the movie metadata frame.
movies_df.columns

Index(['Unnamed: 0', 'imdb_id', 'kaggle_id', 'title', 'original_title',
       'tagline', 'belongs_to_collection', 'wikipedia_url', 'imdb_link',
       'runtime', 'budget', 'revenue', 'release_date', 'popularity',
       'vote_average', 'vote_count', 'genres', 'original_language', 'overview',
       'spoken_languages', 'country', 'production_companies',
       'production_countries', 'distributor', 'producers', 'director',
       'starring', 'cinematography', 'editors', 'writers', 'composers',
       'based_on'],
      dtype='object')

In [28]:
# Left-join the rating counts onto the movies: every movie row is kept, and
# kaggle_id is matched against the movieId index of rating_counts.
# NOTE(review): assumes kaggle_id is comparable to the integer movieId index — confirm.
movies_with_ratings_df = movies_df.merge(
    rating_counts,
    how='left',
    left_on='kaggle_id',
    right_index=True,
)

In [30]:
# A NaN in a rating-count column means no ratings were recorded at that level
# for the movie (or the movie had no match in the left merge); store 0 instead.
movies_with_ratings_df[rating_counts.columns] = (
    movies_with_ratings_df
    .loc[:, rating_counts.columns]
    .fillna(value=0)
)