In [15]:
import pandas as pd
from surprise import Reader, Dataset, SVD
from surprise.model_selection import train_test_split
import numpy as np
import glob
import os

### Load Data for Content-Based Filtering

In [16]:
imdb_folder = './Datasets/imdb data'


def load_tsv_tables(folder):
    """Load every ``*.tsv`` file in ``folder`` into a dict of DataFrames.

    Parameters
    ----------
    folder : str
        Directory that is searched (non-recursively) for ``*.tsv`` files.

    Returns
    -------
    dict[str, pandas.DataFrame]
        One entry per file, keyed by the file name without its final
        extension (e.g. ``title.ratings.tsv`` -> ``'title.ratings'``).
        The dict is empty when no TSV file is found.
    """
    tables = {}
    for file_path in glob.glob(os.path.join(folder, '*.tsv')):
        # splitext removes only the trailing extension; str.replace('.tsv', '')
        # would also strip a '.tsv' that happens to occur inside the name.
        table_name, _ = os.path.splitext(os.path.basename(file_path))
        # low_memory=False makes pandas infer dtypes over the whole file
        # instead of chunk by chunk (avoids mixed-dtype columns).
        # NOTE: missing values in these files appear as the literal string
        # '\N' (see the previews below); they are kept as-is here.
        tables[table_name] = pd.read_csv(file_path, sep='\t', low_memory=False)
        print(f"Loaded {table_name}: {tables[table_name].shape}")
    return tables


# Load each TSV file of the IMDb folder into a dictionary of dataframes
imdb_data = load_tsv_tables(imdb_folder)

Loaded title.ratings: (1535537, 3)
Loaded title.principals: (90963124, 6)
Loaded title.akas: (51247422, 8)
Loaded title.basics: (11461894, 9)
Loaded title.episode: (8813357, 4)
Loaded title.crew: (11461894, 3)


In [17]:
# Preview the first ten rows of the title.akas table. Ending the cell with the
# bare expression uses the notebook's rich DataFrame display instead of the
# wrapped plain-text output that print() produces.
imdb_data['title.akas'].head(10)


     titleId  ordering                      title region language  \
0  tt0000001         1                 Carmencita     \N       \N   
1  tt0000001         2                 Carmencita     DE       \N   
2  tt0000001         3                 Carmencita     US       \N   
3  tt0000001         4  Carmencita - spanyol tánc     HU       \N   
4  tt0000001         5                 Καρμενσίτα     GR       \N   
5  tt0000001         6                 Карменсита     RU       \N   
6  tt0000001         7                 Карменсіта     UA       \N   
7  tt0000001         8                    カルメンチータ     JP       ja   
8  tt0000002         1     Le clown et ses chiens     \N       \N   
9  tt0000002         2          A bohóc és kutyái     HU       \N   

         types     attributes  isOriginalTitle  
0     original             \N                1  
1           \N  literal title                0  
2  imdbDisplay             \N                0  
3  imdbDisplay             \N               

In [18]:
# Distinct values found in the `types` column of title.akas.
akas_types = imdb_data['title.akas']["types"]
akas_types.unique()

array(['original', '\\N', 'imdbDisplay', 'alternative', 'festival', 'dvd',
       'working', 'tv', 'video', 'imdbDisplay\x02tv', 'alternative\x02tv',
       'imdbDisplay\x02working', 'imdbDisplay\x02festival',
       'working\x02tv', 'imdbDisplay\x02video', 'dvd\x02alternative',
       'tv\x02video', 'imdbDisplay\x02dvd', 'working\x02video',
       'working\x02festival', 'dvd\x02video', 'alternative\x02festival',
       'alternative\x02video', 'working\x02alternative'], dtype=object)

### Count the Loaded TSV Tables

In [19]:
# Number of tables that were loaded into the imdb_data dictionary.
table_count = len(imdb_data)
table_count

6

### Load in Collaborative Filtering Data

In [20]:

# links.csv holds the movieId / imdbId / tmdbId columns used to join the
# MovieLens data with the IMDb tables.
movies_df = pd.read_csv(
    './Datasets/ml-32m/links.csv',
)


In [21]:
# Preview the first rows of the links table. A bare last expression renders
# through the notebook's rich DataFrame display instead of print()'s plain text.
movies_df.head()

   movieId  imdbId   tmdbId
0        1  114709    862.0
1        2  113497   8844.0
2        3  113228  15602.0
3        4  114885  31357.0
4        5  113041  11862.0


### Remove Movies without cast, crew, or ratings

In [22]:
# Keep only the title.akas rows whose titleId is present in the crew, ratings
# AND principals tables.
akas_raw = imdb_data["title.akas"]
keys = ["title.crew", "title.ratings", "title.principals"]

# Accumulate one combined boolean mask instead of deep-copying the full akas
# frame up front and re-slicing it once per table.
mask = pd.Series(True, index=akas_raw.index)
for key in keys:
    mask &= akas_raw["titleId"].isin(imdb_data[key]["tconst"])

# A single explicit copy of the surviving rows gives an independent frame, so
# adding columns to copy_akas later does not trigger SettingWithCopyWarning.
copy_akas = akas_raw.loc[mask].copy()
len(copy_akas["titleId"])



5971241

### Take Intersection of Movies in Both Datasets

#### Remove Rows from MovieLens Dataframes

In [23]:
ratings = pd.read_csv('./Datasets/ml-32m/ratings.csv')

# Drop ratings for movies that have no row in the links table ...
rated_and_linked = ratings["movieId"].isin(movies_df["movieId"])
ratings = ratings[rated_and_linked]

# ... and drop links for movies that never received a rating.
linked_and_rated = movies_df["movieId"].isin(ratings["movieId"])
movies_df_cleaned = movies_df[linked_and_rated]

# Both counts of distinct movieIds should agree after the two-way filter.
print(movies_df_cleaned["movieId"].nunique(dropna=False))
ratings["movieId"].nunique(dropna=False)

84432


84432

##### Turn string id into numeric id

In [24]:
# Derive an integer id by stripping the literal "tt" from titleId so it can be
# compared with the integer imdbId column of the links table.
# - regex=False pins literal matching regardless of the pandas version default.
# - assign() returns a new frame rather than writing a column into a filtered
#   slice, which avoids SettingWithCopyWarning and keeps the cell re-runnable.
copy_akas = copy_akas.assign(
    titleIdNumeric=copy_akas["titleId"].str.replace("tt", "", regex=False).astype(int)
)

In [25]:
# Keep only the links whose IMDb id also appears among the remaining akas rows,
# then report how many distinct IMDb ids are left.
imdb_id_in_akas = movies_df_cleaned["imdbId"].isin(copy_akas["titleIdNumeric"])
movies_df_cleaned = movies_df_cleaned[imdb_id_in_akas]
movies_df_cleaned["imdbId"].nunique(dropna=False)

83955

In [26]:
# Restrict the akas rows to titles that are still present in the cleaned links
# table, then report the number of distinct numeric title ids.
title_in_links = copy_akas["titleIdNumeric"].isin(movies_df_cleaned["imdbId"])
copy_akas_cleaned = copy_akas[title_in_links]
copy_akas_cleaned["titleIdNumeric"].nunique(dropna=False)

83955

In [27]:
# Drop ratings of movies that were removed from the cleaned links table, then
# report how many distinct movies still have ratings.
movie_still_linked = ratings["movieId"].isin(movies_df_cleaned["movieId"])
ratings = ratings[movie_still_linked]
ratings["movieId"].nunique(dropna=False)

83955

In [29]:
# Names of the IMDb tables that were loaded.
loaded_table_names = imdb_data.keys()
loaded_table_names

dict_keys(['title.ratings', 'title.principals', 'title.akas', 'title.basics', 'title.episode', 'title.crew'])

### Remove rows from other IMDB Dataframes

In [30]:
# Restrict the remaining IMDb tables to the titles kept in copy_akas_cleaned.
# title.episode and title.akas are intentionally not carried over.
excluded_tables = ("title.episode", "title.akas")

# copy_akas_cleaned has many rows per title; reduce to the distinct ids once
# instead of re-hashing the full column for every table.
kept_title_ids = copy_akas_cleaned["titleId"].unique()

imdb_data_cleaned_copy = {}
for key in imdb_data.keys():
    if key in excluded_tables:
        continue
    mask = imdb_data[key]["tconst"].isin(kept_title_ids)
    # Filter first, then copy: only the surviving rows are duplicated rather
    # than the whole source table.
    imdb_data_cleaned_copy[key] = imdb_data[key].loc[mask].copy()

### Write Out Cleaned Data to CSV Files

##### Write Out MovieLens Data First

In [31]:
output_dir_movielens = './Datasets/ml-32m-cleaned'
os.makedirs(output_dir_movielens, exist_ok=True)

# Write the filtered ratings and links tables without the pandas index column.
for csv_name, frame in (('ratings.csv', ratings), ('links.csv', movies_df_cleaned)):
    frame.to_csv(f'{output_dir_movielens}/{csv_name}', index=False)


In [32]:
# Names of the IMDb tables that will be written out.
cleaned_table_names = imdb_data_cleaned_copy.keys()
cleaned_table_names

dict_keys(['title.ratings', 'title.principals', 'title.basics', 'title.crew'])

### Write Out IMDB Data

In [33]:
output_dir_imdb = './Datasets/imdb-data-cleaned'
os.makedirs(output_dir_imdb, exist_ok=True)

# Save every cleaned table as "<table name>.csv" (index column omitted).
for table_name, table in imdb_data_cleaned_copy.items():
    csv_name = f"{table_name}.csv"
    table.to_csv(os.path.join(output_dir_imdb, csv_name), index=False)
    print(f"Saved {csv_name}")

Saved title.ratings.csv
Saved title.principals.csv
Saved title.basics.csv
Saved title.crew.csv


### Remove Any Movies without (TODO: specify the missing attribute)

### Matrix Factorization Approach

### Autoencoder Approach

In [None]:
### Have input be partial user ratings instead of items

### Factorization Machines