In [1]:
import pandas as pd
from pathlib import Path 

pd.set_option('display.max_columns', 30)

In [19]:
data_path = Path("../data/movies")

In [4]:
credits = pd.read_csv(data_path / "credits.csv")
keywords = pd.read_csv(data_path / "keywords.csv")
links_small = pd.read_csv(data_path / "links_small.csv")
links = pd.read_csv(data_path / "links.csv")
movies_metadata = pd.read_csv(data_path / "movies_metadata.csv")
ratings_small = pd.read_csv(data_path / "ratings_small.csv")
ratings = pd.read_csv(data_path / "ratings.csv")

  movies_metadata = pd.read_csv(data_path / "movies_metadata.csv")


In [13]:
def is_float(string):
    try:
        float(string)
        return True
    except ValueError:
        return False

[x for x in movies_metadata.iloc[:, 10].astype(str) if not is_float(x)]

['Beware Of Frost Bites']

## Credits

credits.csv: Consists of Cast and Crew Information for all our movies. Available in the form of a stringified JSON Object.

In [14]:
credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


## Keywords

keywords.csv: Contains the movie plot keywords for our MovieLens movies. Available in the form of a stringified JSON Object.

In [15]:
keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


## Links

links.csv: The file that contains the TMDB and IMDB IDs of all the movies featured in the Full MovieLens dataset.

In [40]:
links.isna().sum()

movieId      0
imdbId       0
tmdbId     219
dtype: int64

In [41]:
links.drop(columns=['tmdbId'], inplace=True)
links = links.astype(int)

In [42]:
links.head()

Unnamed: 0,movieId,imdbId
0,1,114709
1,2,113497
2,3,113228
3,4,114885
4,5,113041


## Movies metadata

movies_metadata.csv: The main Movies Metadata file. Contains information on 45,000 movies featured in the Full MovieLens dataset. Features include posters, backdrops, budget, revenue, release dates, languages, production countries and companies.

In [54]:
movies_metadata.rename(columns={'imdb_id': 'imdbId'}, inplace=True)
movies_metadata.rename(columns={'id': 'movieId'}, inplace=True)

In [53]:
movies_metadata.iloc[0, :]

adult                                                                False
belongs_to_collection    {'id': 10194, 'name': 'Toy Story Collection', ...
budget                                                            30000000
genres                   [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
homepage                              http://toystory.disney.com/toy-story
movieId                                                                862
imdbId                                                           tt0114709
original_language                                                       en
original_title                                                   Toy Story
overview                 Led by Woody, Andy's toys live happily in his ...
popularity                                                       21.946943
poster_path                               /rhIRbceoE9lR4veEXuwCC2wARtG.jpg
production_companies        [{'name': 'Pixar Animation Studios', 'id': 3}]
production_countries     

In [51]:
metadata_columns = [
    'movieId',
    'imdbId',
    'title',
]

metadata = movies_metadata[metadata_columns].copy()

In [52]:
metadata.dropna(inplace=True)
metadata.isna().sum()

movieId    0
imdbId     0
title      0
dtype: int64

In [55]:
metadata['imdbId'] = metadata['imdbId'].apply(lambda x: int(x.replace('tt', '')))
metadata['movieId'] = metadata['movieId'].astype(int)

In [56]:
metadata.head()

Unnamed: 0,movieId,imdbId,title
0,862,114709,Toy Story
1,8844,113497,Jumanji
2,15602,113228,Grumpier Old Men
3,31357,114885,Waiting to Exhale
4,11862,113041,Father of the Bride Part II


## Ratings

ratings_small.csv: The subset of 100,000 ratings from 700 users on 9,000 movies.

In [61]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26024289 entries, 0 to 26024288
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 794.2 MB


In [62]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [23]:
len(ratings), len(ratings_small)

(26024289, 100004)

## Create dataset

In [72]:
dataset = metadata.merge(ratings, on=['movieId'], how='inner')

In [73]:
dataset

Unnamed: 0,movieId,imdbId,title,userId,rating,timestamp
0,862,114709,Toy Story,1923,3.0,858335006
1,862,114709,Toy Story,2103,5.0,946044912
2,862,114709,Toy Story,5380,1.0,878941641
3,862,114709,Toy Story,6177,4.0,859415226
4,862,114709,Toy Story,6525,4.0,857388995
...,...,...,...,...,...,...
11437585,111109,2028550,Century of Birthing,33940,2.5,1405878785
11437586,111109,2028550,Century of Birthing,172224,3.0,1399502972
11437587,111109,2028550,Century of Birthing,210792,3.0,1467090449
11437588,111109,2028550,Century of Birthing,225396,3.5,1399302912


In [74]:
dataset.isna().sum()

movieId      0
imdbId       0
title        0
userId       0
rating       0
timestamp    0
dtype: int64

In [21]:
out_path = data_path / 'training_datasets/'
out_path.mkdir()

dataset.to_csv(out_path / 'first.csv')