In [1]:
import numpy as np
import pandas as pd
import spacy
import nltk, re
from nltk.corpus import stopwords
nltk.download('stopwords')

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /home/ideis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df_ratings = pd.read_csv('data/movies/ratings.csv')
df_movies = pd.read_csv('data/movies/movies.csv')
df_links = pd.read_csv('data/movies/links.csv')

In [3]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

def clean_text(text, remove_stopwords=True):
    '''Clean the text, with the option to remove stopwords'''
    text = text.lower()
    tokens = tokenizer.tokenize(text)
    
    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        tokens = [w for w in tokens if not w in stops]
    
    text = " ".join(tokens)
    return tokens

def process_title(title): 
    # strip away numbers and parenthesis
    title = title.replace('(','').replace(')','')
    title = re.sub(r'\d+','',title)
    # strip away "part" word
    title = re.sub(r'[Pp]art','',title)
    #strip II and III and IV
    title = title.replace('II','').replace('III','').replace('IV','')
    return title

df_movies['title'] = df_movies['title'].apply(process_title) 

In [4]:
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner', 'textcat'])

In [5]:
def get_vector(doc):
    return doc.vector

df_movies['docs'] = df_movies['title'].apply(nlp)
df_movies['doc2vec'] = df_movies['docs'].apply(get_vector)
df_movies.drop(['docs'], axis=1, inplace=True)
df_movies.to_csv('data/movies/movies_titles_with_doc2vec.csv')

In [6]:
# Create links to imdb website
def make_urls(df_links):
    df_links['pagePath'] = 'https://www.imdb.com/title/tt00' + df_links['imdbId'].astype(str)
    df_links.drop(['imdbId'],axis=1,inplace=True)
    df_links.drop(['tmdbId'],axis=1,inplace=True)
    return df_links

df_links = make_urls(df_links)

In [9]:
# Merge by userId all csv files
def merge_df(df_ratings, df_links, df_movies):
    df_result = pd.merge(df_ratings, df_links, on='movieId', how='left')
    df_result = pd.merge(df_result, df_movies, on='movieId', how='left')
    df_result['visitStartTime'] = df_result['timestamp']
    df_result.set_index(['userId', 'movieId'], inplace=True)
    df_result = df_result[['visitStartTime', 'pagePath', 'title', 'genres', 'rating', 'doc2vec']]
    return df_result

df_result = merge_df(df_ratings, df_links, df_movies)
df_result.to_csv('data/movies/movies_final_dataset.csv')

In [10]:
display(df_result.info())
display(df_result.head(20))
display(df_result.tail(20))

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 100004 entries, (1, 31) to (671, 6565)
Data columns (total 6 columns):
visitStartTime    100004 non-null int64
pagePath          100004 non-null object
title             100004 non-null object
genres            100004 non-null object
rating            100004 non-null float64
doc2vec           100004 non-null object
dtypes: float64(1), int64(1), object(4)
memory usage: 5.0+ MB


None

Unnamed: 0_level_0,Unnamed: 1_level_0,visitStartTime,pagePath,title,genres,rating,doc2vec
userId,movieId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,31,1260759144,https://www.imdb.com/title/tt00112792,Dangerous Minds,Drama,2.5,"[-0.18981, -0.0741665, 0.076044, 0.04386, -0.1..."
1,1029,1260759179,https://www.imdb.com/title/tt0033563,Dumbo,Animation|Children|Drama|Musical,3.0,"[0.49697, -0.75673, -0.16233, -0.38816, -0.041..."
1,1061,1260759182,https://www.imdb.com/title/tt00117665,Sleepers,Thriller,3.0,"[0.49327, -0.066034, -0.82823, -0.12337, 0.104..."
1,1129,1260759185,https://www.imdb.com/title/tt0082340,Escape from New York,Action|Adventure|Sci-Fi|Thriller,2.0,"[0.204776, -0.0123488, -0.132108, -0.156406, 0..."
1,1172,1260759205,https://www.imdb.com/title/tt0095765,Cinema Paradiso Nuovo cinema Paradiso,Drama,4.0,"[0.186631, -0.162396, 0.468606, -0.111806, 0.8..."
1,1263,1260759151,https://www.imdb.com/title/tt0077416,"Deer Hunter, The",Drama|War,2.0,"[-0.17494, 0.130232, -0.220345, -0.100359, 0.0..."
1,1287,1260759187,https://www.imdb.com/title/tt0052618,Ben-Hur,Action|Adventure|Drama,2.0,"[-0.0948633, -0.0564933, 0.0937483, -0.224213,..."
1,1293,1260759148,https://www.imdb.com/title/tt0083987,Gandhi,Drama,2.0,"[0.18674, -0.12497, 0.47532, 0.026546, 0.25526..."
1,1339,1260759125,https://www.imdb.com/title/tt00103874,Dracula Bram Stoker's Dracula,Fantasy|Horror|Romance|Thriller,3.5,"[0.23877, 0.0841364, 0.141052, -0.440234, 0.13..."
1,1343,1260759131,https://www.imdb.com/title/tt00101540,Cape Fear,Thriller,2.0,"[-0.221722, -0.502961, 0.0424016, -0.157778, -..."


Unnamed: 0_level_0,Unnamed: 1_level_0,visitStartTime,pagePath,title,genres,rating,doc2vec
userId,movieId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
671,5010,1066793004,https://www.imdb.com/title/tt00265086,Black Hawk Down,Action|Drama|War,2.0,"[-0.123288, -0.216212, -0.0149837, -0.0303293,..."
671,5218,1065111990,https://www.imdb.com/title/tt00268380,Ice Age,Adventure|Animation|Children|Comedy,2.0,"[0.10703, 0.212375, 0.184704, -0.0523715, 0.33..."
671,5299,1065112004,https://www.imdb.com/title/tt00259446,My Big Fat Greek Wedding,Comedy|Romance,3.0,"[-0.243652, 0.119156, -0.110742, 0.183642, 0.2..."
671,5349,1065111863,https://www.imdb.com/title/tt00145487,Spider-Man,Action|Adventure|Sci-Fi|Thriller,4.0,"[-0.229203, 0.231103, -0.204692, -0.0497633, -..."
671,5377,1064245557,https://www.imdb.com/title/tt00276751,About a Boy,Comedy|Drama|Romance,4.0,"[-0.153434, 0.275248, -0.301813, 0.221511, 0.2..."
671,5445,1064891627,https://www.imdb.com/title/tt00181689,Minority Report,Action|Crime|Mystery|Sci-Fi|Thriller,4.5,"[-0.664475, 0.149356, -0.0536514, 0.320935, -0..."
671,5464,1064891549,https://www.imdb.com/title/tt00257044,Road to Perdition,Crime|Drama,3.0,"[0.542783, 0.14813, -0.0641, -0.08356, 0.34003..."
671,5669,1063502711,https://www.imdb.com/title/tt00310793,Bowling for Columbine,Documentary,4.0,"[-0.04742, 0.311217, 0.125351, -0.450989, 0.34..."
671,5816,1065111963,https://www.imdb.com/title/tt00295297,Harry Potter and the Chamber of Secrets,Adventure|Fantasy,4.0,"[0.109797, 0.116797, -0.0787361, -0.111342, -0..."
671,5902,1064245507,https://www.imdb.com/title/tt00268126,Adaptation,Comedy|Drama|Romance,3.5,"[0.17842, -0.10339, -0.093047, -0.45322, -0.90..."
