In [None]:
# importing modules I will use
import pandas as pd
import json
from collections import defaultdict, Counter
import ast
import matplotlib.pyplot as plt
import requests
import json
from flask import Flask, request
from jinja2 import Environment
from urllib.request import Request, urlopen
import os
import tqdm
from sklearn.model_selection import train_test_split
from PIL import Image
import pickle
import numpy as np
# TMDB API key used to download the movie posters.
# SECURITY: supply the key through the TMDB_API_KEY environment variable.
# FIXME: the literal below is a credential committed to the notebook; it is kept only
# as a fallback so existing runs keep working. Rotate it and remove the fallback.
api_key = os.environ.get('TMDB_API_KEY') or '39329068bc1de1536d231b6b49c9ff50'
import re
import nltk
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from sklearn.preprocessing import MultiLabelBinarizer
from gensim.models import KeyedVectors
import string
from nltk.tokenize import RegexpTokenizer
import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# Preferred device is the second GPU ('cuda:1'). Fall back to the first GPU, or to the
# CPU, when that device does not exist so the notebook still runs on other machines.
if torch.cuda.is_available():
    device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
else:
    device = torch.device('cpu')

## Mostly Preprocessing

In [None]:
# Load the raw movie metadata CSV into `df` (the path is relative to the working directory).

df=pd.read_csv('../data/movies_metadata.csv')


In [None]:
# Keep only the columns that are relevant for our task and drop the rest.
# .copy() makes the selection an independent frame, so the column assignments done in
# later cells do not trigger pandas' SettingWithCopyWarning.

df = df[['original_title','overview','id','genres','production_companies']].copy()

In [None]:
# Names to filter out of the genre / production-company lists: entries that do not
# really make sense as genres and appear only once, so they are removed.

weird_genres = [
    'TV Movie',
    'Carousel Productions',
    'Vision View Entertainment',
    'Telescene Film Group Productions',
    'Aniplex',
    'GoHands',
    'BROSTA TV',
    'Mardock Scramble Production Committee',
    'Sentai Filmworks',
    'Odyssey Media',
    'Pulser Productions',
    'Rogue State',
    'The Cartel',
]

In [None]:
# This function cleans up the column containing the movie genres, which has a really weird formatting.

def clean_genres(df, column_name, wanted_value, excluded_names=None):
    """Turn a column of stringified lists of dicts into plain lists of values.

    Every cell of ``df[column_name]`` is parsed with ``ast.literal_eval`` and is
    expected to evaluate to an iterable of dicts. For each dict the value stored
    under ``wanted_value`` is kept unless it is one of the excluded names.

    Args:
        df: DataFrame holding the column to clean.
        column_name: Name of the column whose cells are Python-literal strings.
        wanted_value: Key to read from every dict (e.g. ``'name'``).
        excluded_names: Optional collection of values to leave out. When omitted,
            the module-level ``weird_genres`` list is used.

    Returns:
        A list with one list of extracted values per row, in row order.

    Raises:
        ValueError / SyntaxError: if a cell is not a valid Python literal.
        KeyError: if a dict has no ``wanted_value`` key.
    """
    if excluded_names is None:
        excluded_names = weird_genres
    list_of_genres = []
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for _, cell in df[column_name].items():
        entries = ast.literal_eval(cell)
        list_of_genres.append(
            [entry[wanted_value] for entry in entries
             if entry[wanted_value] not in excluded_names]
        )
    return list_of_genres
        
# Replace the raw 'genres' strings with the cleaned lists of genre names.
df['genres'] = clean_genres(df,'genres','name')


In [None]:
def clean_pro(df1, excluded_names=None):
    """Extract the production company names from ``df1['production_companies']``.

    Every cell is parsed with ``ast.literal_eval`` and is expected to evaluate to
    an iterable of dicts that carry a ``'name'`` key. Cells that cannot be parsed
    (for example missing values or malformed strings) are reported on stdout and
    yield an empty list, so the result always has exactly one entry per row.

    Args:
        df1: DataFrame with a ``'production_companies'`` column.
        excluded_names: Optional collection of names to leave out. When omitted,
            the module-level ``weird_genres`` list is used.

    Returns:
        A list with one list of company names per row, in row order.
    """
    if excluded_names is None:
        excluded_names = weird_genres
    list_of_genres = []
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for _, cell in df1['production_companies'].items():
        try:
            companies = ast.literal_eval(cell)
            row_list = [company['name'] for company in companies
                        if company['name'] not in excluded_names]
        except (ValueError, SyntaxError, TypeError, KeyError) as e:
            print('[ERROR]', str(e))
            # Start from a fresh list: previously a failing row re-used the list of
            # the preceding row (or raised NameError when the very first row failed).
            row_list = []
        list_of_genres.append(row_list)
    return list_of_genres

# Build the per-row lists of production company names from df.
companies_list = clean_pro(df)

In [None]:
# Overwrite the raw column with the cleaned lists of company names.
df['production_companies'] = companies_list

In [None]:
# Keep only the movies that have at least one production company and at least one
# genre, drop rows with missing values, and renumber the remaining rows.
# reset_index returns a new frame; its result used to be discarded, so the reset never
# took effect. It is now applied last, after all rows have been removed.
has_companies = df['production_companies'].map(len) > 0
has_genres = df['genres'].map(len) > 0
df = df[has_companies & has_genres].dropna().reset_index(drop=True)

In [None]:
# Removing all rows whose original title contains non-ASCII characters
# (ASCII-only titles are used here as the test for "English" movies).

def isEnglish(s):
    """Return True when ``s`` consists solely of ASCII characters."""
    return s.isascii()


def remove_noneng_titles(df):
    """Return ``df`` without the rows whose ``original_title`` is not pure ASCII."""
    ascii_only = df['original_title'].map(isEnglish).astype(bool)
    return df[ascii_only]
# Apply the ASCII-title filter to the working dataframe.
df = remove_noneng_titles(df)

In [None]:
# This function plots the distribution of movie genres in the dataset. 

def plot_distribution(df):
    """Show a bar chart of how often each genre occurs in ``df.genres``.

    ``df.genres`` is expected to hold one iterable of genre names per row.
    """
    # Tally every genre across all rows.
    genre_counts = Counter()
    for movie_genres in df.genres.values:
        genre_counts.update(movie_genres)

    fig, ax = plt.subplots()
    ax.bar(genre_counts.keys(), genre_counts.values())
    ax.set_title("Genre Distribution")
    ax.set_ylabel('Genre Frequency')
    ax.set_xlabel('Genres')
    # Rotate the tick labels by 45 degrees so the genre names do not overlap.
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=45, ha="right")
    # Leave room at the bottom of the figure for the rotated labels.
    fig.subplots_adjust(bottom=0.15)
    plt.show()
    
# Plot the genre frequencies of the cleaned dataframe.
plot_distribution(df)   

In [None]:
# This function uses the TMDB API to retrieve a url of every movie poster and then it downloads it locally.
def get_data(movie_id, images_dir='/home/gusmavko@GU.GU.SE/aics-project/data/images', timeout=30):
    """Download the TMDB poster of one movie and return the local file path.

    The movie's metadata is fetched from the TMDB API (using the module-level
    ``api_key``), the ``poster_path`` entry is turned into an image URL, and the
    image bytes are written to ``images_dir``.

    Args:
        movie_id: TMDB id of the movie.
        images_dir: Directory the poster file is written to.
        timeout: Timeout in seconds applied to both HTTP requests, so a stalled
            connection cannot block the download loop forever.

    Returns:
        The path of the saved poster, or the string ``'Error'`` when anything goes
        wrong (no poster in the metadata, HTTP error, write failure, ...). Callers
        filter on that sentinel, so failures never raise.
    """
    try:
        url = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}"

        # The context manager closes the HTTP response once the body has been read.
        with urlopen(Request(url), timeout=timeout) as response:
            metadata = json.loads(response.read())

        poster_path = metadata['poster_path']
        if not poster_path:
            # The movie exists but has no poster.
            return 'Error'

        image_url = 'https://image.tmdb.org/t/p/w500' + poster_path
        r = requests.get(image_url, timeout=timeout)
        # Without this check the body of an error response (e.g. a 404 page) would be
        # saved to disk as if it were an image.
        r.raise_for_status()

        name = poster_path.replace('/', '_')
        filename = f"poster{name}"
        pa = os.path.join(images_dir, filename)
        # Binary mode, since the poster is raw image bytes.
        with open(pa, 'wb') as w:
            w.write(r.content)
        return pa
    except Exception as e:
        # Best-effort download: report the problem and hand back the sentinel.
        print('[ERROR]', movie_id, str(e))
        return 'Error'

In [None]:
# This function iterates through the dataframe id column and downloads the posters for all movies that have an available poster url.
def add_poster(df):
    """Download the poster of every movie in ``df['id']``.

    Args:
        df: DataFrame with an ``'id'`` column holding the movie ids.

    Returns:
        A list with one entry per movie, in row order: the value returned by
        ``get_data`` for that id, or ``"API Error"`` when ``get_data`` raised.
    """
    poster_paths = []
    # The tqdm bar already shows the progress, so ids are no longer printed one by one.
    for movie_id in tqdm.tqdm(df['id'].tolist(), desc='Downloading posters'):
        try:
            poster_paths.append(get_data(movie_id))
        except Exception as e:
            # tqdm.write prints the message without breaking the progress bar.
            tqdm.tqdm.write(f'[ERROR] {movie_id}: {e}')
            poster_paths.append("API Error")

    return poster_paths
# NOTE: this cell takes very long to run (the last full run, probably in December, took about 4 hours, I think).
# The posters have already been downloaded locally, so they could be uploaded to Drive for anyone who does not want to run this.

list_of_posters = add_poster(df)


In [None]:
# I also dropped all rows that did not have a poster, as well as movies whose download caused an error. There is no need to run this if we already have a df with the poster_paths column. I will upload the df I used to train the model.
    
def finalize_df(df, poster_paths=None):
    """Attach the poster paths to ``df`` and drop the movies without a usable poster.

    Args:
        df: DataFrame with one row per movie. It is not modified; a new frame is
            returned.
        poster_paths: One poster path (or failure marker) per row of ``df``, in row
            order. When omitted, the module-level ``list_of_posters`` is used.

    Returns:
        A new DataFrame with a ``poster_paths`` column, without the rows marked
        ``"API Error"`` or ``"Error"`` and without the one known unreadable image.
    """
    if poster_paths is None:
        poster_paths = list_of_posters
    # There is something wrong with this image, it can't be opened, so it is dropped.
    unidentifiable_image_file = '/home/gusmavko@GU.GU.SE/aics-project/data/images/poster_b15FrCKeWVH62Sn3o69ZXZi3bBi.jpg'
    rejected = ["API Error", "Error", unidentifiable_image_file]
    # assign() builds a new frame instead of writing into the caller's dataframe.
    with_posters = df.assign(poster_paths=poster_paths)
    return with_posters[~with_posters['poster_paths'].isin(rejected)]
# Attach the poster paths and keep only the movies with a usable poster.
df = finalize_df(df)

In [None]:
# Save the dataframe to a CSV file (without the index column) to be able to load it easily later.
df.to_csv("/home/gusmavko@GU.GU.SE/aics-project/data/dataset.csv",index=False)

In [None]:
# Hold out 20% of the data as the test set, then 10% of the remaining training data as
# the validation set. The fixed random_state keeps the splits reproducible.
train, test = train_test_split(df, test_size=0.2, random_state=42)
train, val = train_test_split(train, test_size=0.1, random_state = 42)
# The tensor-building cells below refer to the splits as train_df / val_df / test_df.
# Those names were only ever assigned by a CSV reload that is commented out, so a fresh
# kernel hit a NameError. Bind them here to the splits that were just created.
# NOTE(review): the saved files are named "*_half"; confirm the full splits are intended.
train_df, val_df, test_df = train, val, test



In [None]:
#val.to_csv("/home/gusmavko@GU.GU.SE/aics-project/data/val_half.csv",index=False)
#test.to_csv("/home/gusmavko@GU.GU.SE/aics-project/data/test_half.csv",index=False)
#train.to_csv("/home/gusmavko@GU.GU.SE/aics-project/data/train_half.csv",index=False)

#val_df = pd.read_csv('/home/gusmavko@GU.GU.SE/aics-project/data/val_half.csv')
#test_df = pd.read_csv('/home/gusmavko@GU.GU.SE/aics-project/data/test_half.csv')
#train_df = pd.read_csv('/home/gusmavko@GU.GU.SE/aics-project/data/train_half.csv')
#
#train_df.shape

In [None]:
# This function makes a tensor out of an image file. The tensor stays on the CPU.
def create_img_tensor(img_path, dimensions):
    """Load an image, resize it, convert it to RGB and return it as a tensor.

    Args:
        img_path: Path of the image file to open.
        dimensions: Target size passed to ``Image.resize``, e.g. ``(100, 100)``.

    Returns:
        A ``torch.Tensor`` built from the array of the resized RGB image
        (presumably of shape (height, width, 3) -- TODO confirm), or ``None`` when
        the image cannot be opened or converted. The error is printed in that case.
    """
    try:
        # The context manager closes the image file as soon as the resized copy exists.
        with Image.open(img_path) as img:
            res_img = img.resize(dimensions).convert('RGB')
        img_array = np.array(res_img)
        # The tensor is deliberately not moved to the GPU here: doing so caused a lot
        # of memory errors when experimenting with the model.
        img_tensor = torch.tensor(img_array)

        return img_tensor
    except Exception as e:
        print(str(e))
        return None


In [None]:
# This function iterates through the dataframe and creates tensor representations of all images
def get_tensors(df, dimensions, loader=None):
    """Convert every poster listed in ``df['poster_paths']`` into a tensor.

    Rows whose image cannot be loaded are left out of the returned dataframe, so
    the rows of the returned frame and the entries of the returned tensor stay
    aligned one to one.

    Args:
        df: DataFrame with a ``'poster_paths'`` column. It is not modified.
        dimensions: Tuple ``(width, height)`` the images are resized to,
            e.g. ``(100, 100)``.
        loader: Optional callable ``loader(path, dimensions)`` returning a tensor
            or ``None``. Defaults to ``create_img_tensor``.

    Returns:
        ``(df, df_tensors)``: the dataframe restricted to the rows whose image was
        loaded, and those images stacked into one tensor along a new first axis.

    Raises:
        RuntimeError: raised by ``torch.stack`` when no image could be loaded.
    """
    if loader is None:
        loader = create_img_tensor

    list_imgs = []
    kept_positions = []
    for position, img_path in enumerate(df['poster_paths']):
        img = loader(img_path, dimensions)
        if img is not None:
            list_imgs.append(img)
            kept_positions.append(position)

    # Filter once, by position. Dropping labels one at a time inside the loop copied
    # the frame on every failure and removed every row sharing a duplicated label.
    df = df.iloc[kept_positions]
    df_tensors = torch.stack(list_imgs)
    return df, df_tensors

In [None]:
# Get the stacked image tensors for the validation set, resized to 100x100. A new df is returned as well, since
# some images could not be transformed to tensors; these movies are dropped from the df.
# NOTE: val_df (the validation split) must already exist in the kernel when this cell runs.
val_df, val_tensors = get_tensors(val_df,(100,100))

In [None]:
# Save the stacked validation image tensors to disk with torch.save.
torch.save(val_tensors, '/home/gusmavko@GU.GU.SE/aics-project/data/val_half_tensors.pt')

In [None]:
# Get the stacked image tensors for the test set, resized to 100x100. A new df is returned as well, since
# some images could not be transformed to tensors; these movies are dropped from the df.
# NOTE: test_df (the test split) must already exist in the kernel when this cell runs.
test_df, test_tensors = get_tensors(test_df,(100,100))

In [None]:
# Earlier pickle-based save, kept for reference:
#with open('/home/gusmavko@GU.GU.SE/aics-project/data/test_tensors','wb') as f: pickle.dump(test_tensors, f)
# Save the stacked test image tensors to disk with torch.save.
torch.save(test_tensors,'/home/gusmavko@GU.GU.SE/aics-project/data/test_half_tensors.pt')

In [None]:
# Get the stacked image tensors for the train set, resized to 100x100. A new df is returned as well, since
# some images could not be transformed to tensors; these movies are dropped from the df.
# NOTE: train_df (the training split) must already exist in the kernel when this cell runs.
train_df, train_tensors = get_tensors(train_df,(100,100))


In [None]:
# Earlier pickle-based save, kept for reference:
#with open('/home/gusmavko@GU.GU.SE/aics-project/data/train_tensors','wb') as f: pickle.dump(train_tensors, f)
# Save the stacked train image tensors to disk with torch.save.
torch.save(train_tensors,'/home/gusmavko@GU.GU.SE/aics-project/data/train_half_tensors.pt')