In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import json
from sklearn.model_selection import train_test_split

# Data Analysis (NaNs, duplicates, ...)

In [2]:
# Load the raw ratings table from disk and show it as the cell output.
df = pd.read_csv(filepath_or_buffer="data/ratings.csv")
df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [3]:
# Reshape the long ratings table into a movie-by-user grid: one row per
# movieId, one column per userId, each cell holding the matching rating
# (NaN where no rating exists for that pair).
# Note: DataFrame.pivot raises if a (movieId, userId) pair occurs more than once.
matrix = (
    df.pivot(index='movieId', columns='userId', values='rating')
    .dropna(axis=1, how='all')  # discard columns (users) that hold no rating at all
    .dropna(axis=0, how='all')  # discard rows (movies) that hold no rating at all
)

In [4]:
# Show the movie-by-user rating matrix as the cell output.
matrix

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,,,4.0,,4.5,,,,...,4.0,,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,,,,,,4.0,,4.0,,,...,,4.0,,5.0,3.5,,,2.0,,
3,4.0,,,,,5.0,,,,,...,,,,,,,,2.0,,
4,,,,,,3.0,,,,,...,,,,,,,,,,
5,,,,,,5.0,,,,,...,,,,3.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,,,,,,,,,,,...,,,,,,,,,,
193583,,,,,,,,,,,...,,,,,,,,,,
193585,,,,,,,,,,,...,,,,,,,,,,
193587,,,,,,,,,,,...,,,,,,,,,,


In [5]:
# Measure how sparse the rating matrix is.
total_cells = matrix.size  # rows x columns
na_count = matrix.isna().sum().sum()  # per-column NaN counts, then their total
na_proportion = na_count / total_cells  # share of cells that hold no rating

# Assemble the report lines first, then emit them with a single print call.
sparsity_report = (
    f"Count of NaN: {na_count}",
    f"Total cells: {total_cells}",
    f"Number of non Nan cells: {total_cells - na_count}",
    f"Proportion of NaN: {na_proportion:.5f}",
)
print(*sparsity_report, sep="\n")

Count of NaN: 5830804
Total cells: 5931640
Number of non Nan cells: 100836
Proportion of NaN: 0.98300


In [6]:
# Placeholder: filter out unwanted ratings here if needed (currently nothing is removed)

In [7]:
# Report the matrix dimensions (rows, columns), then preview its top-left
# 5x5 corner as the cell output.
print(matrix.shape)
matrix.head(5).iloc[:, :5]

(9724, 610)


userId,1,2,3,4,5
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,4.0,,,,4.0
2,,,,,
3,4.0,,,,
4,,,,,
5,,,,,


# Saving files

Save all ratings in a file called `data.csv`.

In [8]:
# Write the full ratings table to disk, leaving out the row-index column.
df.to_csv(path_or_buf="data/data.csv", index=False)

Save the encoded movie and user IDs (each raw ID mapped to a 0-based index) in a file called `data_ids.json`.

In [9]:
# Map every raw ID to its position in the sorted list of unique IDs, so that
# movies and users each receive a contiguous 0-based index.
sorted_movie_ids = sorted(df['movieId'].unique())
sorted_user_ids = sorted(df['userId'].unique())

# int(...) converts each ID to a plain Python int so it can be used as a JSON key.
moviesIDs = dict(zip(map(int, sorted_movie_ids), range(len(sorted_movie_ids))))
userIDs = dict(zip(map(int, sorted_user_ids), range(len(sorted_user_ids))))

ids = {"moviesIDs": moviesIDs, "userIDs": userIDs}

# JSON object keys are always strings, so the integer IDs are stored as
# strings in the file and come back as str when it is loaded.
with open("data/data_ids.json", 'w') as file:
    file.write(json.dumps(ids, indent=4))

split the data into training and testing sets and save them in files called `data_train.csv` and `data_test.csv`

In [10]:
# Hold out 20% of the rating rows for testing; the fixed random_state keeps
# the split reproducible between runs.
# NOTE(review): this is a row-level random split, so a user or movie that
# appears in the test rows is not guaranteed to appear in the training rows.
# To check:
#   set(df_20["userId"]).issubset(set(df_80["userId"]))
#   set(df_20["movieId"]).issubset(set(df_80["movieId"]))
df_80, df_20 = train_test_split(df, random_state=42, test_size=0.2)

# Persist both partitions, leaving out the row-index column.
df_80.to_csv(path_or_buf="data/data_train.csv", index=False)
df_20.to_csv(path_or_buf="data/data_test.csv", index=False)

read the data from the files to make sure that everything is saved correctly

In [11]:
# Optional extra checks on the other saved files, left disabled:
#pd.read_csv("data/data.csv")
#with open("data/data_ids.json") as f:
#    ids = json.load(f)
#ids.keys()

# Reload both splits from disk (train first, then test).
df_80, df_20 = map(pd.read_csv, ("data/data_train.csv", "data/data_test.csv"))

print(df_80.shape, df_20.shape)  # number of ratings (rows) and columns per split
for id_column in ('userId', 'movieId'):  # distinct users first, then distinct movies
    print(df_80[id_column].nunique(), df_20[id_column].nunique())

(80668, 4) (20168, 4)
610 610
8983 5142
