# Imports

In [1]:
import pandas as pd


# Data

In [2]:
links = pd.read_csv('../data/ml-25m/links.csv',
                    index_col='movieId', dtype={'imdbId': str, 'tmdbId': str, 'movieId': str})

movies25m = pd.read_csv('../data/ml-25m/movies.csv',
                        index_col='movieId', dtype={'movieid': str, 'title': str, 'genres': str})\
    .join(links)

movies1m = pd.read_csv('../data/ml-1m/movies.dat', sep='::',
                       engine='python',
                       encoding='latin-1',
                       names=['movieId', 'title', 'genres'],
                       index_col='movieId',
                       dtype={'movieId': str, 'title': str, 'genres': str})\
    .join(movies25m, lsuffix='_1m', rsuffix='_25m')


# Cleanup

How many movies in `moviesm1m` have no id?

In [3]:
no_title_id_idx = movies1m["imdbId"].isna()
noid_movies1m = movies1m[no_title_id_idx]
noid_movies1m.shape


(34, 6)

Just ignore those moviews with no ids:

In [4]:
movies1m = movies1m[~no_title_id_idx]
movies1m.shape


(3849, 6)

Check if there are movies with no imdb id:

In [5]:
# find NaN in imbdId in movies1m
movies1m["imdbId"].isna().sum()


0

Are the movie titles in 1m the same as 25m?

In [6]:
# show rows where title_1m != title_25m
# show only the title columns
# assign it to diff_titles
diff_titles = movies1m[movies1m["title_1m"] !=
                       movies1m["title_25m"]][["title_1m", "title_25m"]]
diff_titles.shape


(516, 2)

In [7]:
# show random 100 random samples of diff_titles
diff_titles.sample(100)


Unnamed: 0_level_0,title_1m,title_25m
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
2938,Man Facing Southeast (Hombre Mirando al Sudest...,Man Facing Southeast (1986)
573,"Ciao, Professore! (Io speriamo che me la cavo ...","Ciao, Professore! (Io speriamo che me la cavo)..."
2869,"Separation, The (La Séparation) (1994)","Separation, The (Séparation, La) (1994)"
2309,"Inheritors, The (Die Siebtelbauern) (1998)","Inheritors, The (Siebtelbauern, Die) (1998)"
2985,Robocop (1987),RoboCop (1987)
...,...,...
773,Touki Bouki (Journey of the Hyena) (1973),Touki Bouki (1973)
1340,Bride of Frankenstein (1935),"Bride of Frankenstein, The (Bride of Frankenst..."
1807,"Cool Dry Place, A (1998)","Cool, Dry Place, A (1998)"
1177,Enchanted April (1991),Enchanted April (1992)


Looks like there are just minor differences in the title and/or year so we keep all of them.

Drop unneded columns:

In [8]:
# drop columns with _25m suffix and tmdbId
movies1m = movies1m.drop(
    columns=[col for col in movies1m.columns if col.endswith("_25m")])
movies1m = movies1m.drop(columns=["tmdbId"])

# rename columns with _1m suffix
movies1m = movies1m.rename(
    columns={col: col[:-3] for col in movies1m.columns if col.endswith("_1m")})

movies1m


Unnamed: 0_level_0,title,genres,imdbId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),Animation|Children's|Comedy,0114709
2,Jumanji (1995),Adventure|Children's|Fantasy,0113497
3,Grumpier Old Men (1995),Comedy|Romance,0113228
4,Waiting to Exhale (1995),Comedy|Drama,0114885
5,Father of the Bride Part II (1995),Comedy,0113041
...,...,...,...
3948,Meet the Parents (2000),Comedy,0212338
3949,Requiem for a Dream (2000),Drama,0180093
3950,Tigerland (2000),Drama,0170691
3951,Two Family House (2000),Drama,0202641


Add URL to look up the moview in imbd:

In [9]:
# create a new column called imbd_url
# set imdb_url to https://www.imdb.com/title/tt + the value of imdbId + /plotsummary
movies1m["imdb_url"] = "https://www.imdb.com/title/tt" + \
    movies1m["imdbId"] + "/plotsummary"
movies1m


Unnamed: 0_level_0,title,genres,imdbId,imdb_url
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Toy Story (1995),Animation|Children's|Comedy,0114709,https://www.imdb.com/title/tt0114709/plotsummary
2,Jumanji (1995),Adventure|Children's|Fantasy,0113497,https://www.imdb.com/title/tt0113497/plotsummary
3,Grumpier Old Men (1995),Comedy|Romance,0113228,https://www.imdb.com/title/tt0113228/plotsummary
4,Waiting to Exhale (1995),Comedy|Drama,0114885,https://www.imdb.com/title/tt0114885/plotsummary
5,Father of the Bride Part II (1995),Comedy,0113041,https://www.imdb.com/title/tt0113041/plotsummary
...,...,...,...,...
3948,Meet the Parents (2000),Comedy,0212338,https://www.imdb.com/title/tt0212338/plotsummary
3949,Requiem for a Dream (2000),Drama,0180093,https://www.imdb.com/title/tt0180093/plotsummary
3950,Tigerland (2000),Drama,0170691,https://www.imdb.com/title/tt0170691/plotsummary
3951,Two Family House (2000),Drama,0202641,https://www.imdb.com/title/tt0202641/plotsummary


Make the genres into a list:

In [10]:
# split genres column into a list of genres
movies1m["genres"] = movies1m["genres"].str.split("|")
movies1m


Unnamed: 0_level_0,title,genres,imdbId,imdb_url
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Toy Story (1995),"[Animation, Children's, Comedy]",0114709,https://www.imdb.com/title/tt0114709/plotsummary
2,Jumanji (1995),"[Adventure, Children's, Fantasy]",0113497,https://www.imdb.com/title/tt0113497/plotsummary
3,Grumpier Old Men (1995),"[Comedy, Romance]",0113228,https://www.imdb.com/title/tt0113228/plotsummary
4,Waiting to Exhale (1995),"[Comedy, Drama]",0114885,https://www.imdb.com/title/tt0114885/plotsummary
5,Father of the Bride Part II (1995),[Comedy],0113041,https://www.imdb.com/title/tt0113041/plotsummary
...,...,...,...,...
3948,Meet the Parents (2000),[Comedy],0212338,https://www.imdb.com/title/tt0212338/plotsummary
3949,Requiem for a Dream (2000),[Drama],0180093,https://www.imdb.com/title/tt0180093/plotsummary
3950,Tigerland (2000),[Drama],0170691,https://www.imdb.com/title/tt0170691/plotsummary
3951,Two Family House (2000),[Drama],0202641,https://www.imdb.com/title/tt0202641/plotsummary


In [11]:
# extract imdb_url as a list
imdb_urls = movies1m["imdb_url"].tolist()
# save imdb_urls to a file
with open("../data/imdb_urls.txt", "w") as f:
    f.write("\n".join(imdb_urls))


Now you can run the `scrape-movie-medata` target!

Once that's done:

In [12]:
# read the movie_metadata.jsonl file into a dataframe named movies_metadata
movies_metadata = pd.read_json("../data/movie_metadata.jsonl", lines=True)
# rename source_url to imdb_url
movies_metadata = movies_metadata.rename(columns={"source_url": "imdb_url"})
# drop title and id columns
movies_metadata = movies_metadata.drop(columns=["title", "id"])
# if imdb_url has https://m then replace m with wwww
movies_metadata["imdb_url"] = movies_metadata["imdb_url"].str.replace(
    "https://m", "https://www")
movies_metadata.head()


Unnamed: 0,plot,summary,poster_url,imdb_url
0,When two kids find and play a magical board ga...,"Jumanji, one of the most unique--and dangerous...",https://m.media-amazon.com/images/M/MV5BZTk2Zm...,https://www.imdb.com/title/tt0113497/plotsummary
1,John and Max resolve to save their beloved bai...,Things don't seem to change much in Wabasha Co...,https://m.media-amazon.com/images/M/MV5BMjQxM2...,https://www.imdb.com/title/tt0113228/plotsummary
2,George Banks must deal not only with his daugh...,"In this sequel to ""Father of the Bride"", Georg...",https://m.media-amazon.com/images/M/MV5BOTEyNz...,https://www.imdb.com/title/tt0113041/plotsummary
3,A group of high-end professional thieves start...,Hunters and their prey--Neil and his professio...,https://m.media-amazon.com/images/M/MV5BYjZjNT...,https://www.imdb.com/title/tt0113277/plotsummary
4,An ugly duckling having undergone a remarkable...,"While she was growing up, Sabrina Fairchild sp...",https://m.media-amazon.com/images/M/MV5BYjQ5Zj...,https://www.imdb.com/title/tt0114319/plotsummary


In [13]:
# left merge movies1m and movies_metadata on imdb_url
movies1m = movies1m.merge(movies_metadata, on="imdb_url", how="left")
movies1m.head()


Unnamed: 0,title,genres,imdbId,imdb_url,plot,summary,poster_url
0,Toy Story (1995),"[Animation, Children's, Comedy]",114709,https://www.imdb.com/title/tt0114709/plotsummary,A cowboy doll is profoundly threatened and jea...,A little boy named Andy loves to be in his roo...,https://m.media-amazon.com/images/M/MV5BMDU2ZW...
1,Jumanji (1995),"[Adventure, Children's, Fantasy]",113497,https://www.imdb.com/title/tt0113497/plotsummary,When two kids find and play a magical board ga...,"Jumanji, one of the most unique--and dangerous...",https://m.media-amazon.com/images/M/MV5BZTk2Zm...
2,Grumpier Old Men (1995),"[Comedy, Romance]",113228,https://www.imdb.com/title/tt0113228/plotsummary,John and Max resolve to save their beloved bai...,Things don't seem to change much in Wabasha Co...,https://m.media-amazon.com/images/M/MV5BMjQxM2...
3,Waiting to Exhale (1995),"[Comedy, Drama]",114885,https://www.imdb.com/title/tt0114885/plotsummary,"Based on Terry McMillan's novel, this film fol...",This story based on the best selling novel by ...,https://m.media-amazon.com/images/M/MV5BYzcyMD...
4,Father of the Bride Part II (1995),[Comedy],113041,https://www.imdb.com/title/tt0113041/plotsummary,George Banks must deal not only with his daugh...,"In this sequel to ""Father of the Bride"", Georg...",https://m.media-amazon.com/images/M/MV5BOTEyNz...


Deal with movies without a plot:

In [20]:
# create a df containing movies without a plot
movies_without_plot = movies1m[movies1m["plot"].isna()]
movies_without_plot.shape


(26, 7)

In [21]:
# display the movies without a plot
movies_without_plot


Unnamed: 0,title,genres,imdbId,imdb_url,plot,summary,poster_url
634,Happy Weekend (1996),[Comedy],116485,https://www.imdb.com/title/tt0116485/plotsummary,,,
705,Wallace & Gromit: The Best of Aardman Animatio...,[Animation],118114,https://www.imdb.com/title/tt0118114/plotsummary,,,
715,"Low Life, The (1994)",[Drama],125877,https://www.imdb.com/title/tt0125877/plotsummary,,,
724,Honigmond (1996),[Comedy],116559,https://www.imdb.com/title/tt0116559/plotsummary,,,
736,Vermont Is For Lovers (1992),"[Comedy, Romance]",105737,https://www.imdb.com/title/tt0105737/plotsummary,,,
753,Marlene Dietrich: Shadow and Light (1996),[Documentary],116992,https://www.imdb.com/title/tt0116992/plotsummary,,,
754,Costa Brava (1946),[Drama],38426,https://www.imdb.com/title/tt0038426/plotsummary,,,
775,"Last Klezmer: Leopold Kozlowski, His Life and ...",[Documentary],113610,https://www.imdb.com/title/tt0113610/plotsummary,,,
1080,Loser (1991),[Comedy],102336,https://www.imdb.com/title/tt0102336/plotsummary,,,
1114,Get Over It (1996),[Drama],116403,https://www.imdb.com/title/tt0116403/plotsummary,,,


In [23]:
# since only a small number of movies have no plot, remove all movies without a plot from movies1m
movies1m = movies1m[~movies1m["plot"].isna()]
# save movies1m to a parquet file named movies_postprocessed.parquet
movies1m.to_parquet("../data/movies_postprocessed.parquet")
