# MOVIE LENS DATA PROCESSING

## 1. Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

## 2. Understanding the dataset

### 2.1 Import the Real Data

In [2]:
column_names = ['user_id', 'movie_id', 'rating', 'timestamp']
real_data = pd.read_csv('../data/real_data/u.data', sep='\t', names=column_names)


real_data.head()

FileNotFoundError: [Errno 2] No such file or directory: '../data/real_data/u.data'

### 2.2 Statistics of Dataset

In [4]:
number_of_columns = real_data.shape[1]
print('Number of columns:', number_of_columns)

number_of_rows = real_data.shape[0]
print('Number of rows:', number_of_rows)

Number of columns: 4
Number of rows: 100000


In [5]:
real_data.describe()

Unnamed: 0,user_id,movie_id,rating,timestamp
count,100000.0,100000.0,100000.0,100000.0
mean,462.48475,425.53013,3.52986,883528900.0
std,266.61442,330.798356,1.125674,5343856.0
min,1.0,1.0,1.0,874724700.0
25%,254.0,175.0,3.0,879448700.0
50%,447.0,322.0,4.0,882826900.0
75%,682.0,631.0,4.0,888260000.0
max,943.0,1682.0,5.0,893286600.0


In [6]:
# Number of missing values
real_data.isnull().sum()

user_id      0
movie_id     0
rating       0
timestamp    0
dtype: int64

In [7]:
# Only show total number of duplicates
real_data.duplicated().sum()

np.int64(0)

In [8]:
# Print the data rows which are duplicated
real_data[real_data.duplicated(keep=False)]

Unnamed: 0,user_id,movie_id,rating,timestamp


### 2.3 Columnwise description

In [9]:
# Understanding User Column (Column 0)
unique_users = real_data['user_id'].unique()
min_value = np.min(unique_users)
max_value = np.max(unique_users)
print('Number of unique users:', len(unique_users))
print(f"Minimum value in column 0: {min_value}")
print(f"Maximum value in column 0: {max_value}")

Number of unique users: 943
Minimum value in column 0: 1
Maximum value in column 0: 943


In [15]:
# Understanding Movie Column (Column 1)
unique_movies = real_data['movie_id'].unique()
min_value = np.min(unique_movies)
max_value = np.max(unique_movies)
print('Number of unique movies:', len(unique_movies))
print(f"Minimum value in column 1: {min_value}")
print(f"Maximum value in column 1: {max_value}")

Number of unique movies: 1682
Minimum value in column 1: 1
Maximum value in column 1: 1682


In [16]:
# Understanding Rating Column (Column 2)
# Ratings description (from 1 to 5)

unique_ratings = real_data['rating'].unique()
min_value = np.min(unique_ratings)
max_value = np.max(unique_ratings)
print('Number of unique ratings:', len(unique_ratings))
print(f"Minimum value in column 2: {min_value}")
print(f"Maximum value in column 2: {max_value}")

Number of unique ratings: 5
Minimum value in column 2: 1
Maximum value in column 2: 5


In [17]:
# Understanding Timestamp Column (Column 3)

unique_timestamps = real_data['timestamp'].unique()
min_value = np.min(unique_timestamps)
max_value = np.max(unique_timestamps)
print('Number of unique timestamps:', len(unique_timestamps))
print(f"Minimum value in column 3: {min_value}")
print(f"Maximum value in column 3: {max_value}")

Number of unique timestamps: 49282
Minimum value in column 3: 874724710
Maximum value in column 3: 893286638


## 3. Export Dataset

In [None]:
real_data.to_csv('../data/ml100k/real_data/real_data.csv', index=False)