In [125]:
# For Python 3.13 compatibility, we'll skip opendatasets and use the existing dataset
# The MovieLens dataset is already available in the movielens-100k-dataset folder
import os
print("Using existing MovieLens dataset from movielens-100k-dataset folder")
print("Dataset files are already available in the workspace")

Using existing MovieLens dataset from movielens-100k-dataset folder
Dataset files are already available in the workspace


In [126]:
#from google.colab import drive
#drive.mount('/content/drive')

In [127]:
import os   # Used to read the images path from the directory
import re

# Data handling
import pandas as pd # Used to read/create dataframes (csv) and process tabular data
import numpy as np  # preprocessing and numerical/mathematical operations

# Visualization
import matplotlib.pyplot as plt # Used for visualizing the images and plotting the training progress
import seaborn as sns

# Preprocessing & similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [128]:
# Use relative paths for the local dataset
base_dir = os.path.join('movielens-100k-dataset','ml-100k')
ratings_path = os.path.join(base_dir,'u.data')
movies_path = os.path.join(base_dir,'u.item')
users_path = os.path.join(base_dir,'u.user')

print(f"Base directory: {base_dir}")
print(f"Ratings file: {ratings_path}")
print(f"Movies file: {movies_path}")
print(f"Users file: {users_path}")

Base directory: movielens-100k-dataset\ml-100k
Ratings file: movielens-100k-dataset\ml-100k\u.data
Movies file: movielens-100k-dataset\ml-100k\u.item
Users file: movielens-100k-dataset\ml-100k\u.user


## Data preparations

In [129]:
genre_cols = [
    "unknown","Action","Adventure","Animation","Children's","Comedy","Crime","Documentary",
    "Drama","Fantasy","Film-Noir","Horror","Musical","Mystery","Romance","Sci-Fi","Thriller","War","Western"
]
movie_cols = ["movieId","title","release_date","video_release_date","IMDb_URL"] + genre_cols

movies = pd.read_csv(movies_path, sep="|", encoding="latin-1", header=None, names=movie_cols)

ratings = pd.read_csv(ratings_path, sep="\t", names=["userId","movieId","rating","timestamp"])

users = pd.read_csv(users_path, sep="|", header=None, names=["userId","age","gender","occupation","zip_code"])

# Drop columns we don’t need
movies = movies.drop(["video_release_date", "IMDb_URL"], axis=1)

print("Prepared Ratings Shape:", ratings.shape)
print("Prepared Movies Shape:", movies.shape)


Prepared Ratings Shape: (100000, 4)
Prepared Movies Shape: (1682, 22)


In [130]:
print(ratings.isnull().sum())
print(movies.isnull().sum())

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64
movieId         0
title           0
release_date    1
unknown         0
Action          0
Adventure       0
Animation       0
Children's      0
Comedy          0
Crime           0
Documentary     0
Drama           0
Fantasy         0
Film-Noir       0
Horror          0
Musical         0
Mystery         0
Romance         0
Sci-Fi          0
Thriller        0
War             0
Western         0
dtype: int64


In [131]:
def extract_year(title: str):
    if not isinstance(title, str):
        return np.nan
    m = re.search(r"\((\d{4})\)", title)
    return int(m.group(1)) if m else np.nan

In [132]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [133]:
movies.head()

Unnamed: 0,movieId,title,release_date,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [134]:
def row_genres(row):
    g = [g for g in genre_cols if row[g] == 1]
    return "  ".join(g) if g else "unknown"



In [135]:
movies["genres"] = movies.apply(row_genres, axis=1)
movies["year"] = movies["title"].apply(extract_year)

# làm sạch title: bỏ "(1995)" nếu muốn
movies["title_clean"] = movies["title"].str.replace(r"\s*\(\d{4}\)\s*$", "", regex=True)

# DataFrame đầu ra
movies_df = movies[["movieId", "title_clean", "genres", "year"]].rename(columns={"title_clean": "title"})
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Animation Children's Comedy,1995.0
1,2,GoldenEye,Action Adventure Thriller,1995.0
2,3,Four Rooms,Thriller,1995.0
3,4,Get Shorty,Action Comedy Drama,1995.0
4,5,Copycat,Crime Drama Thriller,1995.0


In [136]:
# Create processed directory in the current working directory
out_dir = "processed"
os.makedirs(out_dir, exist_ok=True)

movies_csv = os.path.join(out_dir, "movies_preprocessed.csv")
ratings_csv = os.path.join(out_dir, "ratings.csv")
users_csv = os.path.join(out_dir, "users.csv")

movies_df.to_csv(movies_csv, index=False, encoding="utf-8")
ratings.to_csv(ratings_csv, index=False)
users.to_csv(users_csv, index=False)
print("Saved:", movies_csv)
print("Saved:", ratings_csv)
print("Saved:", users_csv)

Saved: processed\movies_preprocessed.csv
Saved: processed\ratings.csv
Saved: processed\users.csv


In [137]:
#lấy dữ liệu phim theo id / tìm theo tên
def get_movie_data(movie_id: int):
    return movies_df[movies_df["movieId"] == movie_id].reset_index(drop=True)

def search_movie_by_title(q: str, top_n: int = 10):
    q_norm = q.strip().lower()
    m = movies_df[movies_df["title"].str.lower().str.contains(q_norm, na=False)]
    return m.head(top_n).reset_index(drop=True)


display(search_movie_by_title("Toy Story"))
display(get_movie_data(1))

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Animation Children's Comedy,1995.0


Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Animation Children's Comedy,1995.0


In [138]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [139]:
movies[['title_clean', 'genres']].tail()

Unnamed: 0,title_clean,genres
1677,Mat' i syn,Drama
1678,B. Monkey,Romance Thriller
1679,Sliding Doors,Drama Romance
1680,You So Crazy,Comedy
1681,Scream of Stone (Schrei aus Stein),Drama


In [140]:
# Dùng TF-IDF để biến thể loại thành vector
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['genres'])

print("Kích thước ma trận TF-IDF:", tfidf_matrix.shape)


Kích thước ma trận TF-IDF: (1682, 21)


In [141]:
# Ma trận cosine similarity giữa các phim
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print("Kích thước ma trận similarity:", cosine_sim.shape)


Kích thước ma trận similarity: (1682, 1682)


In [None]:
def recommend(title, top_n=10):
    # Mapping từ tên phim sang index
    indices = pd.Series(movies.index, index=movies['title_clean']).drop_duplicates()

    if title not in indices:
        return f"Phim '{title}' không có trong dataset."

    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]  # Bỏ chính nó
    movie_indices = [i[0] for i in sim_scores]

    return movies.iloc[movie_indices][['title_clean', 'genres']]


In [None]:
# Thử gợi ý phim tương tự "Toy Story (1995)"
recommend("Toy Story", top_n=5)


Unnamed: 0,title,genres
421,Aladdin and the King of Thieves (1996),Animation Children's Comedy
101,"Aristocats, The (1970)",Animation Children's
403,Pinocchio (1940),Animation Children's
624,"Sword in the Stone, The (1963)",Animation Children's
945,"Fox and the Hound, The (1981)",Animation Children's
