In [185]:
import pandas as pd

# Import Data / Transform into Dataframes

In [186]:
# The .dat files have no header row and separate fields with '::'. A separator longer
# than one character is handled by pandas' python parsing engine, hence engine='python'.
#
# `zip` is pinned to str: ZIP codes are identifiers, not quantities, and numeric type
# inference drops leading zeros (the users preview shows 2460 for user 4).
users_df = pd.read_csv(
    'data/users.dat',
    delimiter='::',
    engine='python',
    names=['uid', 'gender', 'age', 'occupation', 'zip'],
    dtype={'zip': str},
)
# movies.dat is decoded as latin-1.
movies_df = pd.read_csv(
    'data/movies.dat',
    delimiter='::',
    engine='python',
    names=['movie_id', 'title', 'genre'],
    encoding='latin-1',
)
ratings_df = pd.read_csv(
    'data/ratings.dat',
    delimiter='::',
    engine='python',
    names=['uid', 'movie_id', 'rating', 'ts'],
)

In [187]:
# Preview the first five user rows to check that the columns were parsed as named.
users_df.head(n=5)

Unnamed: 0,uid,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [188]:
# Preview the first five movie rows; `genre` is a single '|'-separated string per movie.
movies_df.head(n=5)

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [189]:
# Preview the first five rating rows (one row per uid / movie_id pair).
ratings_df.head(n=5)

Unnamed: 0,uid,movie_id,rating,ts
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


# Join Dataframes

In [190]:
# Build one wide table with a row per rating: first attach the rater's attributes via
# `uid`, then the rated movie's title and genre via `movie_id`. Both joins rely on the
# default join type of merge (inner).
user_ratings_df = ratings_df.merge(users_df, on='uid')
df = user_ratings_df.merge(movies_df, on='movie_id')
df.head()

Unnamed: 0,uid,movie_id,rating,ts,gender,age,occupation,zip,title,genre
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,914,3,978301968,F,1,10,48067,My Fair Lady (1964),Musical|Romance
3,1,3408,4,978300275,F,1,10,48067,Erin Brockovich (2000),Drama
4,1,2355,5,978824291,F,1,10,48067,"Bug's Life, A (1998)",Animation|Children's|Comedy


# Process Dataset

In [191]:
# Order the rows by the `ts` column, smallest value first. The original row labels are
# kept here; they are renumbered in a later cell.
df = df.sort_values(by='ts')
df

Unnamed: 0,uid,movie_id,rating,ts,gender,age,occupation,zip,title,genre
1000138,6040,858,4,956703932,M,25,6,11106,"Godfather, The (1972)",Action|Crime|Drama
1000153,6040,2384,4,956703954,M,25,6,11106,Babe: Pig in the City (1998),Children's|Comedy
999873,6040,593,5,956703954,M,25,6,11106,"Silence of the Lambs, The (1991)",Drama|Thriller
1000007,6040,1961,4,956703977,M,25,6,11106,Rain Man (1988),Drama
1000192,6040,2019,5,956703977,M,25,6,11106,Seven Samurai (The Magnificent Seven) (Shichin...,Action|Drama
...,...,...,...,...,...,...,...,...,...,...
825793,4958,2399,1,1046454338,M,18,7,55403,Santa Claus: The Movie (1985),Adventure|Children's|Fantasy
825438,4958,1407,5,1046454443,M,18,7,55403,Scream (1996),Horror|Thriller
825724,4958,3264,4,1046454548,M,18,7,55403,Buffy the Vampire Slayer (1992),Comedy|Horror
825731,4958,2634,3,1046454548,M,18,7,55403,"Mummy, The (1959)",Horror


## Bin Age Groups + One Hot Encoding for age_group and gender

In [192]:
# Renumber rows 0..n-1 so the row labels follow the sorted order.
df = df.reset_index(drop=True)

# NOTE(review): `age` appears to hold bracket codes rather than ages in years. The
# values seen in the previews (1, 18, 25, 45, 56) match the MovieLens 1M coding
#   1 = under 18, 18 = 18-24, 25 = 25-34, 35 = 35-44, 45 = 45-49, 50 = 50-55, 56 = 56+
# Confirm against the dataset README.
#
# The edges therefore sit on the code boundaries. With the previous edges
# [0, 12, 19, 60, inf) the largest value, 56, fell into 'adult', so 'senior' was never
# assigned and age_group_senior was a constant-False column. Every other code keeps
# the label it had before:
#   1      -> child
#   18     -> teen
#   25..50 -> adult
#   56     -> senior
bins = [0, 18, 25, 56, float('inf')]
labels = ['child', 'teen', 'adult', 'senior']
# right=False makes each interval closed on the left: [0, 18), [18, 25), [25, 56), [56, inf).
df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels, right=False)

# Replace age_group and gender with one indicator column per category
# (age_group_<label>, gender_<value>).
df = pd.get_dummies(df, columns=['age_group', 'gender'])

## One Hot Encoding for Genres

In [193]:
# Split the '|'-separated genre string into a list of genre names per row.
df = df.assign(genres_list=df['genre'].str.split('|'))

# One row per (rating, genre) pair; each exploded row keeps the label of the row it
# came from, so labels repeat for multi-genre movies.
df_exploded = df.explode(column='genres_list')

# One indicator column per distinct genre name.
one_hot = pd.get_dummies(df_exploded.loc[:, 'genres_list'])

# Collapse back to one row per original label: max() over the repeated labels marks a
# genre as present if any of that row's exploded rows had it.
one_hot_exploded = one_hot.groupby(level=0).max()

# Append the genre indicator columns next to the existing columns (aligned on row label).
df = pd.concat([df, one_hot_exploded], axis='columns')

In [194]:
# Print the first row as a column -> value listing to eyeball the new indicator columns.
print(df.iloc[0, :])

uid                                   6040
movie_id                               858
rating                                   4
ts                               956703932
age                                     25
occupation                               6
zip                                  11106
title                Godfather, The (1972)
genre                   Action|Crime|Drama
age_group_child                      False
age_group_teen                       False
age_group_adult                       True
age_group_senior                     False
gender_F                             False
gender_M                              True
genres_list         [Action, Crime, Drama]
Action                                True
Adventure                            False
Animation                            False
Children's                           False
Comedy                               False
Crime                                 True
Documentary                          False
Drama      

In [195]:
# Remove zip plus the raw columns that the indicator columns now stand in for
# (age -> age_group_*, genre / genres_list -> one column per genre).
# `columns=` already selects the axis, so no separate `axis` argument is passed, and the
# result is rebound instead of mutating the frame in place.
df = df.drop(columns=['zip', 'age', 'genres_list', 'genre'])

# Discard rows whose rating is exactly 3. Take an explicit copy so that the later
# assignment to df['rating'] works on an independent frame rather than on the result of
# a boolean filter (the situation pandas flags with SettingWithCopyWarning).
df = df[df['rating'] != 3].copy()

## Binarize ratings (1-2 stars -> 0, 4-5 stars -> 1; 3-star rows already removed)

In [196]:
# Print the column labels currently on df.
print(df.columns)

Index(['uid', 'movie_id', 'rating', 'ts', 'occupation', 'title',
       'age_group_child', 'age_group_teen', 'age_group_adult',
       'age_group_senior', 'gender_F', 'gender_M', 'Action', 'Adventure',
       'Animation', 'Children's', 'Comedy', 'Crime', 'Documentary', 'Drama',
       'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
       'Sci-Fi', 'Thriller', 'War', 'Western'],
      dtype='object')


In [197]:
# Number of rows per distinct rating value, most frequent first.
df['rating'].value_counts(sort=True, ascending=False)

rating
4    348971
5    226310
2    107557
1     56174
Name: count, dtype: int64

In [198]:
# Turn the star rating into a binary label: 1-2 stars -> 0, 4-5 stars -> 1.
#
# The label overwrites `rating`, so this cell is not safe to run twice: on a second run
# every value is already 0 or 1, all of them satisfy `< 3`, and the whole column would
# silently become 0. A leftover 3 would likewise survive as a third class. Stop with a
# clear error instead of producing corrupted labels.
not_a_star_rating = (df['rating'] < 1) | (df['rating'] > 5) | (df['rating'] == 3)
if not_a_star_rating.any():
    raise ValueError(
        "rating must contain only 1, 2, 4 or 5 before binarising; found "
        f"{sorted(df.loc[not_a_star_rating, 'rating'].unique().tolist())}. "
        "Re-run the notebook from the top if this cell was already executed."
    )

df['rating'] = df['rating'].mask(df['rating'] < 3, 0)
# After the line above only the former 4s and 5s are still greater than 3.
df['rating'] = df['rating'].mask(df['rating'] > 3, 1)
df

Unnamed: 0,uid,movie_id,rating,ts,occupation,title,age_group_child,age_group_teen,age_group_adult,age_group_senior,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,6040,858,1,956703932,6,"Godfather, The (1972)",False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
1,6040,2384,1,956703954,6,Babe: Pig in the City (1998),False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
2,6040,593,1,956703954,6,"Silence of the Lambs, The (1991)",False,False,True,False,...,False,False,False,False,False,False,False,True,False,False
3,6040,1961,1,956703977,6,Rain Man (1988),False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
4,6040,2019,1,956703977,6,Seven Samurai (The Magnificent Seven) (Shichin...,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000203,4958,3489,1,1046454320,7,Hook (1991),False,True,False,False,...,True,False,False,False,False,False,False,False,False,False
1000204,4958,2399,0,1046454338,7,Santa Claus: The Movie (1985),False,True,False,False,...,True,False,False,False,False,False,False,False,False,False
1000205,4958,1407,1,1046454443,7,Scream (1996),False,True,False,False,...,False,False,True,False,False,False,False,True,False,False
1000206,4958,3264,1,1046454548,7,Buffy the Vampire Slayer (1992),False,True,False,False,...,False,False,True,False,False,False,False,False,False,False


In [199]:
# Number of rows per label value, most frequent first.
df['rating'].value_counts(sort=True, ascending=False)

rating
1    575281
0    163731
Name: count, dtype: int64