In [8]:
import numpy as np
import pandas as pd

import nltk, re
from nltk.corpus import stopwords
nltk.download('stopwords')

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /home/ideis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df_ratings = pd.read_csv('data/movies/ratings.csv')
df_movies = pd.read_csv('data/movies/movies.csv')
df_links = pd.read_csv('data/movies/links.csv')

In [5]:
from gensim.models import KeyedVectors
ft_model = KeyedVectors.load_word2vec_format('models/wiki-news-300d-1M-subword.vec')

In [6]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

def clean_text(text, remove_stopwords=True):
    '''Clean the text, with the option to remove stopwords'''
    text = text.lower()
    tokens = tokenizer.tokenize(text)
    
    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("russian"))
        tokens = [w for w in tokens if not w in stops]
    
#     text = " ".join(tokens)
    return tokens

def is_ascii(s):
    return all(ord(c) < 128 for c in s)

df_movies = df_movies.drop(df_movies[df_movies['title'].apply(lambda t: not is_ascii(t))].index)

def process_title(title): 
    # strip away numbers and parenthesis
    title = title.replace('(','').replace(')','')
    title = re.sub(r'\d+','',title)
    # strip away "part" word
    title = re.sub(r'[Pp]art','',title)
    #strip II and III and IV
    title = title.replace('II','').replace('III','').replace('IV','')
    return title

df_movies['title'] = df_movies['title'].apply(process_title) 
#drop empty titles
df_movies = df_movies.drop(df_movies[df_movies['title'].str.strip() ==''].index)

In [9]:
vec_dim = 300

def create_average_vec(doc):
    average = np.zeros((vec_dim,), dtype='float32')
    num_words = 0.
    for word in doc:
        if word in ft_model:
            average = np.add(average, ft_model[word])
            num_words += 1.
    if num_words != 0.:
        average = np.divide(average, num_words)
    return average


def create_doc2vec(text):
    processed_text = clean_text(text)
    vec = create_average_vec(processed_text)
    return vec

df_movies['doc2vec'] = df_movies['title'].apply(create_doc2vec)
df_movies.to_csv('data/movies/movies_titles_with_fasttext.csv')

In [10]:
# Create links to imdb website
def make_urls(df_links):
    df_links['pagePath'] = 'https://www.imdb.com/title/tt00' + df_links['imdbId'].astype(str)
    df_links.drop(['imdbId'],axis=1,inplace=True)
    df_links.drop(['tmdbId'],axis=1,inplace=True)
    return df_links
df_links = make_urls(df_links)

In [11]:
# Merge by userId all csv files
def merge_df(df_ratings, df_links, df_movies):
    df_result = pd.merge(df_ratings, df_links, on='movieId', how='left')
    df_result = pd.merge(df_result, df_movies, on='movieId', how='left')
    df_result['visitStartTime'] = df_result['timestamp']
    df_result.set_index(['userId', 'movieId'], inplace=True)
    df_result = df_result[['visitStartTime', 'pagePath', 'title', 'genres', 'rating', 'doc2vec']]
    return df_result
df_result = merge_df(df_ratings, df_links, df_movies)
df_movies.to_csv('data/movies/movies_final_dataset.csv')

In [12]:
display(df_result.info())
display(df_result.head(20))
display(df_result.tail(20))

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 100004 entries, (1, 31) to (671, 6565)
Data columns (total 6 columns):
visitStartTime    100004 non-null int64
pagePath          100004 non-null object
title             98437 non-null object
genres            98437 non-null object
rating            100004 non-null float64
doc2vec           98437 non-null object
dtypes: float64(1), int64(1), object(4)
memory usage: 5.0+ MB


None

Unnamed: 0_level_0,Unnamed: 1_level_0,visitStartTime,pagePath,title,genres,rating,doc2vec
userId,movieId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,31,1260759144,https://www.imdb.com/title/tt00112792,Dangerous Minds,Drama,2.5,"[-0.01235, -0.00235, -0.00675, 0.0034, -0.0457..."
1,1029,1260759179,https://www.imdb.com/title/tt0033563,Dumbo,Animation|Children|Drama|Musical,3.0,"[-0.0014, -0.0093, -0.0342, 0.0081, -0.0098, -..."
1,1061,1260759182,https://www.imdb.com/title/tt00117665,Sleepers,Thriller,3.0,"[0.0239, 0.0062, -0.0187, -0.0185, -0.0218, -0..."
1,1129,1260759185,https://www.imdb.com/title/tt0082340,Escape from New York,Action|Adventure|Sci-Fi|Thriller,2.0,"[0.0092, -0.001275, 0.01185, 0.002, 0.006425, ..."
1,1172,1260759205,https://www.imdb.com/title/tt0095765,Cinema Paradiso Nuovo cinema Paradiso,Drama,4.0,"[0.00364, -0.02482, 0.00342, 0.01654, 0.00184,..."
1,1263,1260759151,https://www.imdb.com/title/tt0077416,"Deer Hunter, The",Drama|War,2.0,"[-0.00696667, 0.00103333, 0.0166667, 0.0046333..."
1,1287,1260759187,https://www.imdb.com/title/tt0052618,Ben-Hur,Action|Adventure|Drama,2.0,"[-0.0548, -0.06875, -0.00815, 0.01355, -0.0053..."
1,1293,1260759148,https://www.imdb.com/title/tt0083987,Gandhi,Drama,2.0,"[0.0017, -0.0399, 0.0136, 0.0202, -0.0163, 0.0..."
1,1339,1260759125,https://www.imdb.com/title/tt00103874,Dracula Bram Stoker's Dracula,Fantasy|Horror|Romance|Thriller,3.5,"[0.007975, -0.02985, 0.0616, 0.026575, -0.0535..."
1,1343,1260759131,https://www.imdb.com/title/tt00101540,Cape Fear,Thriller,2.0,"[-0.02185, -0.0038, 0.0275, -0.01, -0.02465, -..."


Unnamed: 0_level_0,Unnamed: 1_level_0,visitStartTime,pagePath,title,genres,rating,doc2vec
userId,movieId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
671,5010,1066793004,https://www.imdb.com/title/tt00265086,Black Hawk Down,Action|Drama|War,2.0,"[0.013, -0.0096, 0.00566667, -0.006, 0.0003666..."
671,5218,1065111990,https://www.imdb.com/title/tt00268380,Ice Age,Adventure|Animation|Children|Comedy,2.0,"[-0.0155, 0.00955, -0.01785, -0.0088, -0.0031,..."
671,5299,1065112004,https://www.imdb.com/title/tt00259446,My Big Fat Greek Wedding,Comedy|Romance,3.0,"[-0.01932, -0.01156, -0.00708, -0.01224, -0.01..."
671,5349,1065111863,https://www.imdb.com/title/tt00145487,Spider-Man,Action|Adventure|Sci-Fi|Thriller,4.0,"[-0.0133, 0.00585, 0.0052, 0.00945, -0.03785, ..."
671,5377,1064245557,https://www.imdb.com/title/tt00276751,About a Boy,Comedy|Drama|Romance,4.0,"[-0.0128667, -0.0258667, 0.00526667, 0.0211667..."
671,5445,1064891627,https://www.imdb.com/title/tt00181689,Minority Report,Action|Crime|Mystery|Sci-Fi|Thriller,4.5,"[0.01375, -0.01085, 0.0049, -0.0073, -0.0091, ..."
671,5464,1064891549,https://www.imdb.com/title/tt00257044,Road to Perdition,Crime|Drama,3.0,"[0.00893333, 0.0336333, -0.0175, 0.0165, 6.666..."
671,5669,1063502711,https://www.imdb.com/title/tt00310793,Bowling for Columbine,Documentary,4.0,"[-0.0211333, -0.0271667, 0.00773333, 0.0162, -..."
671,5816,1065111963,https://www.imdb.com/title/tt00295297,Harry Potter and the Chamber of Secrets,Adventure|Fantasy,4.0,"[0.00535714, -0.0219429, 0.00657143, 0.0020428..."
671,5902,1064245507,https://www.imdb.com/title/tt00268126,Adaptation,Comedy|Drama|Romance,3.5,"[-0.002, -0.0034, -0.0017, 0.0083, -0.0165, 0...."
