# Imports

In [1]:
import pandas as pd
from pathlib import Path
from src.utils import decode_string

# Data location

In [4]:
# Directory holding the raw .dat files (resolved against the working directory).
DATA_DIR = Path("../data/movie-lens-1m/ml-1m")

# Path's "/" operator accepts plain strings, so the file names need no
# additional Path() wrapper.
MOVIES_FILE_PATH = DATA_DIR / "movies.dat"
RATINGS_FILE_PATH = DATA_DIR / "ratings.dat"
USERS_FILE_PATH = DATA_DIR / "users.dat"

# Converting .dat files into .csv format

## $\text{movies.dat}$

In [5]:
# Column-oriented accumulator: one list per output column, filled record by
# record and turned into a DataFrame at the end.
movies_dict = {"MovieID": [], "Title": [], "Genres": []}

with open(MOVIES_FILE_PATH, "rb") as f:
    # Iterating the binary handle yields one raw line at a time until EOF.
    for raw_line in f:
        # decode_string (project helper) turns the raw bytes into text; the
        # trailing newline is dropped before splitting on the "::" delimiter.
        record = decode_string(raw_line).replace("\n", "")
        movie_id, title, genres = record.split("::")
        # MovieID is stored as an int; Title and Genres stay as raw strings.
        movies_dict["MovieID"].append(int(movie_id))
        movies_dict["Title"].append(title)
        movies_dict["Genres"].append(genres)

movies_df = pd.DataFrame.from_dict(movies_dict)

In [6]:
# Row count first, then the first five rows as the cell's rich output.
print(movies_df.shape[0])
movies_df.head()

3883


Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
# Persist the parsed movies next to the raw files; index=False omits the row index.
movies_df.to_csv(DATA_DIR / "movies.csv", index=False)

## $\text{ratings.dat}$

In [8]:
# Column-oriented accumulator for ratings.dat; every record is split on "::"
# into UserID, MovieID, Rating and Timestamp.
ratings_dict = {
    "UserID": [],
    "MovieID": [],
    "Rating": [],
    "Timestamp": [],
}

with open(RATINGS_FILE_PATH, "rb") as f:
    # Iterating the binary handle streams one raw line at a time until EOF.
    for raw_line in f:
        # decode_string (project helper) turns the raw bytes into text.
        line = decode_string(raw_line).replace("\n", "")
        # All four fields are converted to int *before* anything is appended:
        # Rating and Timestamp were previously stored as strings, which made
        # both columns object dtype (no arithmetic, lexicographic ordering).
        # Converting up front also keeps the column lists the same length if
        # a malformed record raises ValueError.
        user_id, movie_id, rating, timestamp = map(int, line.split("::"))
        ratings_dict["UserID"].append(user_id)
        ratings_dict["MovieID"].append(movie_id)
        ratings_dict["Rating"].append(rating)
        ratings_dict["Timestamp"].append(timestamp)

ratings_df = pd.DataFrame.from_dict(ratings_dict)

In [9]:
# Row count first, then the first five rows as the cell's rich output.
print(ratings_df.shape[0])
ratings_df.head()

1000209


Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [10]:
# Persist the parsed ratings next to the raw files; index=False omits the row index.
ratings_df.to_csv(DATA_DIR / "ratings.csv", index=False)

## $\text{users.dat}$

In [11]:
# Column-oriented accumulator for users.dat; the key order defines the column
# order of the resulting DataFrame.
users_dict = {
    column: []
    for column in ("UserID", "Gender", "Age", "Occupation", "ZipCode")
}

with open(USERS_FILE_PATH, "rb") as f:
    # Iterating the binary handle yields one raw line at a time until EOF.
    for raw_line in f:
        # decode_string (project helper) turns the raw bytes into text.
        record = decode_string(raw_line).replace("\n", "")
        user_id, gender, age, occupation, zip_code = record.split("::")
        # UserID, Age and Occupation become ints; Gender and ZipCode are
        # kept as the strings read from the file.
        row = (int(user_id), gender, int(age), int(occupation), zip_code)
        # dicts preserve insertion order, so zip pairs each value with its column.
        for column, value in zip(users_dict, row):
            users_dict[column].append(value)

users_df = pd.DataFrame.from_dict(users_dict)

In [12]:
# Row count first, then the first five rows as the cell's rich output.
print(users_df.shape[0])
users_df.head()

6040


Unnamed: 0,UserID,Gender,Age,Occupation,ZipCode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [13]:
# Persist the parsed users next to the raw files; index=False omits the row index.
users_df.to_csv(DATA_DIR / "users.csv", index=False)

# Merging

In [14]:
# Enrich every rating with its movie's metadata, then with its user's profile.
# how="left" keeps every rating row even when no match is found.
# validate="many_to_one" makes pandas raise MergeError if MovieID / UserID is
# duplicated in the lookup frame; without it, a duplicate key would silently
# multiply rating rows.
ratings_with_movies_df = ratings_df.merge(
    movies_df, on="MovieID", how="left", validate="many_to_one"
)
merged_df = ratings_with_movies_df.merge(
    users_df, on="UserID", how="left", validate="many_to_one"
)

In [15]:
# Bare expression: the notebook renders the merged frame as this cell's output.
merged_df

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Genres,Gender,Age,Occupation,ZipCode
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,F,1,10,48067
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,F,1,10,48067
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,F,1,10,48067
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,F,1,10,48067
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,F,1,10,48067
...,...,...,...,...,...,...,...,...,...,...
1000204,6040,1091,1,956716541,Weekend at Bernie's (1989),Comedy,M,25,6,11106
1000205,6040,1094,5,956704887,"Crying Game, The (1992)",Drama|Romance|War,M,25,6,11106
1000206,6040,562,5,956704746,Welcome to the Dollhouse (1995),Comedy|Drama,M,25,6,11106
1000207,6040,1096,4,956715648,Sophie's Choice (1982),Drama,M,25,6,11106


In [16]:
# Persist the fully merged table next to the raw files; index=False omits the row index.
merged_df.to_csv(DATA_DIR / "movie_lens.csv", index=False)