# Imports

In [1]:
import pandas as pd


# Data

In [2]:
# All three tables are indexed by movieId and joined on that index, so movieId
# is requested as a string everywhere to keep the join keys the same type.
# imdbId/tmdbId are read as strings so zero-padded ids (e.g. 0114709) survive.
links = pd.read_csv(
    '../data/ml-25m/links.csv',
    index_col='movieId',
    dtype={'imdbId': str, 'tmdbId': str, 'movieId': str},
)

# 25M titles/genres with the external ids from links.csv attached.
# The dtype key must be spelled 'movieId': a misspelled key such as 'movieid'
# matches no column and is silently ignored, leaving this index with a
# different dtype than the other two tables.
movies25m = pd.read_csv(
    '../data/ml-25m/movies.csv',
    index_col='movieId',
    dtype={'movieId': str, 'title': str, 'genres': str},
).join(links)

# 1M movies use a '::' separator (multi-character, hence the python engine)
# and get their column names from `names`. Joining with the 25M table adds
# imdbId/tmdbId; the overlapping title/genres columns are disambiguated with
# the _1m / _25m suffixes.
movies1m = pd.read_csv(
    '../data/ml-1m/movies.dat',
    sep='::',
    engine='python',
    encoding='latin-1',
    names=['movieId', 'title', 'genres'],
    index_col='movieId',
    dtype={'movieId': str, 'title': str, 'genres': str},
).join(movies25m, lsuffix='_1m', rsuffix='_25m')


# Cleanup

How many movies in `movies1m` have no IMDb id?

In [3]:
# Boolean mask over movies1m: True for rows whose imdbId is missing.
no_title_id_idx = movies1m["imdbId"].isnull()

# Keep the rows without an id in their own frame and report its dimensions.
noid_movies1m = movies1m.loc[no_title_id_idx]
noid_movies1m.shape


(34, 6)

Just ignore those movies with no ids:

In [4]:
# Invert the missing-id mask and keep only the rows that do have an imdbId.
has_imdb_id = ~no_title_id_idx
movies1m = movies1m[has_imdb_id]
movies1m.shape


(3849, 6)

Check that no remaining movies are missing an IMDb id:

In [5]:
# Count the rows of movies1m whose imdbId is still NaN.
movies1m["imdbId"].isnull().sum()

0

Are the movie titles in 1m the same as 25m?

In [6]:
# Select the rows whose 1M title differs from the 25M title, keeping only the
# two title columns, and store the result as diff_titles.
title_cols = ["title_1m", "title_25m"]
titles_differ = movies1m["title_1m"].ne(movies1m["title_25m"])
diff_titles = movies1m.loc[titles_differ, title_cols]
diff_titles.shape

(516, 2)

In [7]:
# Show up to 100 randomly chosen rows of diff_titles.
# The fixed random_state makes the same rows appear on every run, and capping
# n at the frame length avoids the ValueError that sampling without
# replacement raises when fewer than 100 rows are available.
diff_titles.sample(n=min(100, len(diff_titles)), random_state=42)

Unnamed: 0_level_0,title_1m,title_25m
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
3761,"Blood In, Blood Out (a.k.a. Bound by Honor) (1...","Blood In, Blood Out (1993)"
245,"Glass Shield, The (1994)",The Glass Shield (1994)
889,1-900 (1994),1-900 (06) (1994)
2189,I Married A Strange Person (1997),I Married A Strange Person! (1997)
2564,"Empty Mirror, The (1999)","Empty Mirror, The (1996)"
...,...,...
652,"301, 302 (1995)","301, 302 (301/302) (1995)"
1209,Once Upon a Time in the West (1969),Once Upon a Time in the West (C'era una volta ...
284,New York Cop (1996),New York Cop (Nyû Yôku no koppu) (1993)
649,Cold Fever (Á köldum klaka) (1994),Cold Fever (Á köldum klaka) (1995)


Looks like there are just minor differences in the title and/or year so we keep all of them.

Drop unneeded columns:

In [8]:
# Remove every *_25m column together with tmdbId, then strip the "_1m" suffix
# from the remaining column names.
suffix_1m = "_1m"
cols_25m = [name for name in movies1m.columns if name.endswith("_25m")]

movies1m = (
    movies1m
    .drop(columns=cols_25m + ["tmdbId"])
    .rename(columns=lambda name: name[:-len(suffix_1m)] if name.endswith(suffix_1m) else name)
)

movies1m

Unnamed: 0_level_0,title,genres,imdbId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),Animation|Children's|Comedy,0114709
2,Jumanji (1995),Adventure|Children's|Fantasy,0113497
3,Grumpier Old Men (1995),Comedy|Romance,0113228
4,Waiting to Exhale (1995),Comedy|Drama,0114885
5,Father of the Bride Part II (1995),Comedy,0113041
...,...,...,...
3948,Meet the Parents (2000),Comedy,0212338
3949,Requiem for a Dream (2000),Drama,0180093
3950,Tigerland (2000),Drama,0170691
3951,Two Family House (2000),Drama,0202641


Add a URL to look up the movie on IMDb:

In [9]:
# Add an imdb_url column of the form
# https://www.imdb.com/title/tt<imdbId>/plotsummary
url_prefix = "https://www.imdb.com/title/tt"
url_suffix = "/plotsummary"
movies1m = movies1m.assign(imdb_url=url_prefix + movies1m["imdbId"] + url_suffix)
movies1m

Unnamed: 0_level_0,title,genres,imdbId,imdb_url
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Toy Story (1995),Animation|Children's|Comedy,0114709,https://www.imdb.com/title/tt0114709/plotsummary
2,Jumanji (1995),Adventure|Children's|Fantasy,0113497,https://www.imdb.com/title/tt0113497/plotsummary
3,Grumpier Old Men (1995),Comedy|Romance,0113228,https://www.imdb.com/title/tt0113228/plotsummary
4,Waiting to Exhale (1995),Comedy|Drama,0114885,https://www.imdb.com/title/tt0114885/plotsummary
5,Father of the Bride Part II (1995),Comedy,0113041,https://www.imdb.com/title/tt0113041/plotsummary
...,...,...,...,...
3948,Meet the Parents (2000),Comedy,0212338,https://www.imdb.com/title/tt0212338/plotsummary
3949,Requiem for a Dream (2000),Drama,0180093,https://www.imdb.com/title/tt0180093/plotsummary
3950,Tigerland (2000),Drama,0170691,https://www.imdb.com/title/tt0170691/plotsummary
3951,Two Family House (2000),Drama,0202641,https://www.imdb.com/title/tt0202641/plotsummary


Make the genres into a list:

In [10]:
# Replace the pipe-delimited genres string with a list of genre names.
movies1m = movies1m.assign(genres=movies1m["genres"].str.split("|"))
movies1m

Unnamed: 0_level_0,title,genres,imdbId,imdb_url
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Toy Story (1995),"[Animation, Children's, Comedy]",0114709,https://www.imdb.com/title/tt0114709/plotsummary
2,Jumanji (1995),"[Adventure, Children's, Fantasy]",0113497,https://www.imdb.com/title/tt0113497/plotsummary
3,Grumpier Old Men (1995),"[Comedy, Romance]",0113228,https://www.imdb.com/title/tt0113228/plotsummary
4,Waiting to Exhale (1995),"[Comedy, Drama]",0114885,https://www.imdb.com/title/tt0114885/plotsummary
5,Father of the Bride Part II (1995),[Comedy],0113041,https://www.imdb.com/title/tt0113041/plotsummary
...,...,...,...,...
3948,Meet the Parents (2000),[Comedy],0212338,https://www.imdb.com/title/tt0212338/plotsummary
3949,Requiem for a Dream (2000),[Drama],0180093,https://www.imdb.com/title/tt0180093/plotsummary
3950,Tigerland (2000),[Drama],0170691,https://www.imdb.com/title/tt0170691/plotsummary
3951,Two Family House (2000),[Drama],0202641,https://www.imdb.com/title/tt0202641/plotsummary


In [11]:
# Pull the imdb_url column out as a plain Python list.
imdb_urls = list(movies1m["imdb_url"])

# Write the URLs to disk, one per line, with no newline after the last one.
with open("../data/imdb_urls.txt", "w") as url_file:
    url_file.write("\n".join(imdb_urls))


Now you can run the `scrape-movie-medata` target!