In [1]:
import pandas as pd
import gc

In [2]:
chunksize = 100_000  # 每次讀10萬筆

# 建立空 list 收集符合條件的資料
filtered_chunks = []

for chunk in pd.read_csv("title.basics.tsv.gz", sep='\t', dtype=str, na_values='\\N', chunksize=chunksize):
    chunk = chunk[(chunk['titleType'] == 'movie') & (chunk['startYear'].notna())]
    
    # 有些 startYear 非數字，轉換時需處理
    chunk = chunk[chunk['startYear'].str.isnumeric()]
    chunk['startYear'] = chunk['startYear'].astype(int)
    filtered_chunks.append(chunk)

# 合併所有過濾後的 chunk
basics_filtered = pd.concat(filtered_chunks, ignore_index=True)
print(f"載入後共有 {len(basics_filtered)} 筆符合條件的電影")

載入後共有 613862 筆符合條件的電影


In [3]:
basics_filtered

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
1,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,,100,"Documentary,News,Sport"
2,tt0000502,movie,Bohemios,Bohemios,0,1905,,100,
3,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
4,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
...,...,...,...,...,...,...,...,...,...
613857,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,,57,Documentary
613858,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007,,100,Documentary
613859,tt9916706,movie,Dankyavar Danka,Dankyavar Danka,0,2013,,,Comedy
613860,tt9916730,movie,6 Gunn,6 Gunn,0,2017,,116,Drama


In [4]:
# 載入評分資料
ratings = pd.read_csv("title.ratings.tsv.gz", sep='\t', dtype={'tconst': str})

In [5]:
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2164
1,tt0000002,5.5,296
2,tt0000003,6.5,2219
3,tt0000004,5.3,190
4,tt0000005,6.2,2959
...,...,...,...
1589960,tt9916846,5.8,6
1589961,tt9916848,5.8,6
1589962,tt9916850,5.8,6
1589963,tt9916852,5.8,6


In [6]:
# 只取需要的欄位
basics_cols = ['tconst', 'primaryTitle', 'originalTitle', 'startYear', 'genres']

# left join，把電影資訊串進來
result = pd.merge(
    ratings,
    basics_filtered[basics_cols],
    on='tconst',
    how='inner'  # 只保留有評分的電影
)

# 顯示結果
result

Unnamed: 0,tconst,averageRating,numVotes,primaryTitle,originalTitle,startYear,genres
0,tt0000009,5.4,227,Miss Jerry,Miss Jerry,1894,Romance
1,tt0000147,5.3,563,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,1897,"Documentary,News,Sport"
2,tt0000502,3.6,22,Bohemios,Bohemios,1905,
3,tt0000574,6.0,1006,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,"Action,Adventure,Biography"
4,tt0000591,5.4,33,The Prodigal Son,L'enfant prodigue,1907,Drama
...,...,...,...,...,...,...,...
332488,tt9916362,6.4,6078,Coven,Akelarre,2020,"Drama,History"
332489,tt9916428,4.7,22,The Secret of China,Hong xing zhao yao Zhong guo,2019,"Adventure,History,War"
332490,tt9916538,7.6,12,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,2019,Drama
332491,tt9916706,7.7,9,Dankyavar Danka,Dankyavar Danka,2013,Comedy


In [7]:
del ratings
del basics_filtered
gc.collect()

0

In [8]:
result = result[(result['numVotes']>1500) & (result['averageRating']>6.5)]
result = result.drop(columns=['tconst', 'originalTitle'])
result

Unnamed: 0,averageRating,numVotes,primaryTitle,startYear,genres
62,7.0,3854,Dante's Inferno,1911,"Adventure,Drama,Fantasy"
99,6.9,2646,Fantômas: In the Shadow of the Guillotine,1913,"Crime,Drama"
108,7.0,1542,Ingeborg Holm,1913,Drama
111,6.9,1796,Fantomas: The Man in Black,1913,"Crime,Drama"
175,7.1,4190,Cabiria,1914,"Adventure,Drama,History"
...,...,...,...,...,...
332358,6.6,3373,Mogul Mowgli,2020,"Drama,Music"
332379,6.8,2421,Min pappa Marianne,2020,"Comedy,Drama"
332416,8.4,47492,Kaithi,2019,"Action,Crime,Thriller"
332419,7.0,5163,Herself,2020,Drama


In [9]:
result.to_csv("raw_imdb.csv", index=False)