In [1]:
import pandas as pd
import pyarrow as pa
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
genome_scores = pd.read_csv(r'F:\Development\Data Glacier Internship\Week7\ml-latest\genome-scores.csv', low_memory=False)
movies = pd.read_csv(r'F:\Development\Data Glacier Internship\Week7\ml-latest\movies.csv', low_memory=False)
ratings = pd.read_csv(r'F:\Development\Data Glacier Internship\Week7\ml-latest\ratings.csv', low_memory=False)

In [3]:
genome_scores_copy = genome_scores.copy()
movies_copy = movies.copy()
ratings_copy = ratings.copy()

## Genome Scores

In [4]:
genome_scores_copy.head()

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.032
1,1,2,0.02225
2,1,3,0.07
3,1,4,0.059
4,1,5,0.123


In [5]:
genome_scores_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18472128 entries, 0 to 18472127
Data columns (total 3 columns):
 #   Column     Dtype  
---  ------     -----  
 0   movieId    int64  
 1   tagId      int64  
 2   relevance  float64
dtypes: float64(1), int64(2)
memory usage: 422.8 MB


In [6]:
genome_scores_copy.isnull().sum()

movieId      0
tagId        0
relevance    0
dtype: int64

In [7]:
genome_scores_copy.duplicated().sum()

0

In [8]:
# Check the outliers
print(genome_scores_copy['relevance'].min(), genome_scores_copy['relevance'].max())

0.0002499999999999 1.0


Conclusion: No outliers.

## Movies

In [9]:
movies_copy.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [10]:
movies_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86537 entries, 0 to 86536
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  86537 non-null  int64 
 1   title    86537 non-null  object
 2   genres   86537 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.0+ MB


In [11]:
movies_copy.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [12]:
movies_copy.duplicated().sum()

0

In [13]:
# Encode genres
movies_copy['genres'] = movies_copy['genres'].apply(lambda x: x.split('|'))
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(movies_copy['genres'])
genres_df = pd.DataFrame(genres_encoded, columns=mlb.classes_)
movies_encoded = pd.concat([movies_copy.drop('genres', axis=1), genres_df], axis=1)

In [14]:
movies_encoded.head()

Unnamed: 0,movieId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
movies_encoded = movies_encoded.drop(columns='(no genres listed)')

In [16]:
# Extract year from title
movies_encoded['release year'] = movies_copy['title'].str.extract(r'\((\d{4})\)')
movies_encoded['release year'].head()

0    1995
1    1995
2    1995
3    1995
4    1995
Name: release year, dtype: object

In [17]:
movies_encoded['release year'].isnull().sum()

618

Note: There are misiing values in the 'release year' column.

In [18]:
movies_encoded['release year'] = movies_encoded['release year'].fillna(1900)                                                         

In [19]:
movies_for_merging = movies_encoded.drop(columns='title')
movies_for_merging.head()

Unnamed: 0,movieId,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,release year
0,1,0,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1995
1,2,0,1,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1995
2,3,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1995
3,4,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1995
4,5,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995


## Ratings

In [20]:
ratings_copy.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119


In [21]:
ratings_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33832162 entries, 0 to 33832161
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 1.0 GB


In [22]:
ratings_copy.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [23]:
ratings_copy.duplicated().sum()

0

In [24]:
# Check the outliers
print(ratings_copy['rating'].min(), ratings_copy['rating'].max())

0.5 5.0


Conclusion: No outliers.

In [25]:
ratings_for_merging = ratings_copy.drop(columns='timestamp')
ratings_for_merging.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,110,4.0
2,1,158,4.0
3,1,260,4.5
4,1,356,5.0


In [26]:
# Filter DataFrame to keep only rows with userIds who left at least 5 reviews
user_review_counts = ratings_for_merging.groupby('userId').size()
valid_user_ids = user_review_counts[user_review_counts >= 5].index
filtered_ratings = ratings_for_merging[ratings_for_merging['userId'].isin(valid_user_ids)]

# Creating a Master Data 

In [27]:
master_data = pd.merge(filtered_ratings, movies_for_merging, on='movieId')
master_data.head()

Unnamed: 0,userId,movieId,rating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,release year
0,1,1,4.0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1995
1,1,110,4.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1995
2,1,158,4.0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
3,1,260,4.5,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1977
4,1,356,5.0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,1,0,1994


In [28]:
master_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33779931 entries, 0 to 33779930
Data columns (total 23 columns):
 #   Column        Dtype  
---  ------        -----  
 0   userId        int64  
 1   movieId       int64  
 2   rating        float64
 3   Action        int32  
 4   Adventure     int32  
 5   Animation     int32  
 6   Children      int32  
 7   Comedy        int32  
 8   Crime         int32  
 9   Documentary   int32  
 10  Drama         int32  
 11  Fantasy       int32  
 12  Film-Noir     int32  
 13  Horror        int32  
 14  IMAX          int32  
 15  Musical       int32  
 16  Mystery       int32  
 17  Romance       int32  
 18  Sci-Fi        int32  
 19  Thriller      int32  
 20  War           int32  
 21  Western       int32  
 22  release year  object 
dtypes: float64(1), int32(19), int64(2), object(1)
memory usage: 3.4+ GB


In [29]:
genres_cols = master_data.select_dtypes(include=['int32']).columns
master_data[genres_cols] = master_data[genres_cols].astype('int8')
master_data['userId'] = master_data['userId'].astype('int32')
master_data['movieId'] = master_data['movieId'].astype('int32')
master_data['rating'] = master_data['rating'].astype('float16')
master_data['release year'] = master_data['release year'].astype('int16')

In [30]:
master_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33779931 entries, 0 to 33779930
Data columns (total 23 columns):
 #   Column        Dtype  
---  ------        -----  
 0   userId        int32  
 1   movieId       int32  
 2   rating        float16
 3   Action        int8   
 4   Adventure     int8   
 5   Animation     int8   
 6   Children      int8   
 7   Comedy        int8   
 8   Crime         int8   
 9   Documentary   int8   
 10  Drama         int8   
 11  Fantasy       int8   
 12  Film-Noir     int8   
 13  Horror        int8   
 14  IMAX          int8   
 15  Musical       int8   
 16  Mystery       int8   
 17  Romance       int8   
 18  Sci-Fi        int8   
 19  Thriller      int8   
 20  War           int8   
 21  Western       int8   
 22  release year  int16  
dtypes: float16(1), int16(1), int32(2), int8(19)
memory usage: 998.7 MB


Note: Unfortunately, we weren't able to allocate enough memory to merge master_data with genome_scores, therefore we will use only movies and ratings files in our project.

In [None]:
master_data.to_parquet('movie_ratings.parquet')