In [1]:
import pandas as pd

##### 1. Đọc dữ liệu và Data Structures


In [7]:
import pandas as pd

# Directory that holds the ml-latest-small CSV files, relative to the
# notebook's working directory. Change it here to point at another copy.
DATA_DIR = 'ml-latest-small'

movies = pd.read_csv(f'{DATA_DIR}/movies.csv')
ratings = pd.read_csv(f'{DATA_DIR}/ratings.csv')
tags = pd.read_csv(f'{DATA_DIR}/tags.csv')
links = pd.read_csv(f'{DATA_DIR}/links.csv')

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly: wrapping it in print() emits a stray "None" line after
# every summary.
print("Movies DataFrame:")
print(movies.head())
movies.info()

print("\nRatings DataFrame:")
print(ratings.head())
ratings.info()

print("\nTags DataFrame:")
print(tags.head())
tags.info()

print("\nLinks DataFrame:")
print(links.head())
links.info()


Movies DataFrame:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
None

Ratings DataFrame:
   userId  movieId 

##### 2. Xử lý dữ liệu bị thiếu/không hợp lệ

In [8]:
# Report the number of missing values per column for each frame before any
# rows are removed, so the reader can see what the cleaning step will affect.
print("\nMissing values in movies:")
print(movies.isnull().sum())

print("\nMissing values in ratings:")
print(ratings.isnull().sum())

print("\nMissing values in tags:")
print(tags.isnull().sum())

print("\nMissing values in links:")
print(links.isnull().sum())

# Drop every row that contains at least one missing value.
# Reassignment is used instead of `inplace=True`: it produces the same frames
# under the same names, but avoids mutating objects that earlier cells already
# displayed, and follows the pandas recommendation against `inplace`.
movies = movies.dropna()
ratings = ratings.dropna()
tags = tags.dropna()
links = links.dropna()



Missing values in movies:
movieId    0
title      0
genres     0
dtype: int64

Missing values in ratings:
userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

Missing values in tags:
userId       0
movieId      0
tag          0
timestamp    0
dtype: int64

Missing values in links:
movieId    0
imdbId     0
tmdbId     8
dtype: int64


##### 3. Gộp DataFrame

In [10]:
# Step 1: pair each rating with its movie. The inner join keeps only the
# movieId values that appear in both frames.
merged_data = movies.merge(ratings, how='inner', on='movieId')
print("\nMerged Data (movies + ratings):")
print(merged_data.head())

# Step 2: attach tags given by the same user to the same movie. The left join
# keeps rating rows that have no matching tag (their tag columns become NaN).
# NOTE: `ratings` and `tags` both have a `timestamp` column, so pandas
# disambiguates them with its default `_x` / `_y` suffixes in the result.
merged_data = merged_data.merge(tags, how='left', on=['movieId', 'userId'])
print("\nMerged Data (movies + ratings + tags):")
print(merged_data.head())


Merged Data (movies + ratings):
   movieId             title                                       genres  \
0        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
1        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
2        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
3        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
4        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   

   userId  rating   timestamp  
0       1     4.0   964982703  
1       5     4.0   847434962  
2       7     4.5  1106635946  
3      15     2.5  1510577970  
4      17     4.5  1305696483  

Merged Data (movies + ratings + tags):
   movieId             title                                       genres  \
0        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
1        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
2        1  Toy Story (1995)  Adventure|A

##### 4. Lọc dữ liệu theo yêu cầu
Ví dụ: Lọc các lượt đánh giá có rating >= 4.0 (mỗi dòng là một lượt đánh giá của một người dùng cho một phim).

In [11]:
# Keep only the rows of the merged frame whose rating is at least 4.0.
# Each row is one rating record, so a title can appear many times.
high_rated_movies = merged_data.loc[merged_data['rating'].ge(4.0)]
print("\nMovies with rating >= 4.0:")
# Preview the first five matching rows, showing only title and rating.
print(high_rated_movies.head()[['title', 'rating']])


Movies with rating >= 4.0:
              title  rating
0  Toy Story (1995)     4.0
1  Toy Story (1995)     4.0
2  Toy Story (1995)     4.5
4  Toy Story (1995)     4.5
6  Toy Story (1995)     4.0


##### 5. Thống kê dữ liệu

In [12]:
from collections import Counter

# `genres` is a pipe-delimited string per movie; split it into a list and
# explode so that every (movie, genre) pair gets its own row.
genres = (
    movies['genres']
    .str.split('|')
    .explode()
)
# Tally how many movies carry each genre label.
genre_counts = Counter(genres.tolist())
print("\nThống kê số lượng phim theo thể loại:")
print(genre_counts)



Thống kê số lượng phim theo thể loại:
Counter({'Drama': 4361, 'Comedy': 3756, 'Thriller': 1894, 'Action': 1828, 'Romance': 1596, 'Adventure': 1263, 'Crime': 1199, 'Sci-Fi': 980, 'Horror': 978, 'Fantasy': 779, 'Children': 664, 'Animation': 611, 'Mystery': 573, 'Documentary': 440, 'War': 382, 'Musical': 334, 'Western': 167, 'IMAX': 158, 'Film-Noir': 87, '(no genres listed)': 34})


##### 6. Parsing Timestamps

In [13]:
# Interpret the numeric `timestamp` columns as seconds since the Unix epoch
# (unit='s') and replace them with pandas datetime values.
tags['timestamp'] = tags['timestamp'].pipe(pd.to_datetime, unit='s')
ratings['timestamp'] = ratings['timestamp'].pipe(pd.to_datetime, unit='s')

# Report the time span covered by the ratings.
rating_times = ratings['timestamp']
print("\nThời gian đánh giá đầu tiên:", rating_times.min())
print("Thời gian đánh giá cuối cùng:", rating_times.max())



Thời gian đánh giá đầu tiên: 1996-03-29 18:36:55
Thời gian đánh giá cuối cùng: 2018-09-24 14:27:30
