In [None]:
import numpy as np
import pandas as pd
import re

In [None]:
# Read the three CSV inputs; the unpacking order matches the order of the paths.
df_ratings, df_movies, df_links = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/movies/ratings.csv',
        'data/movies/movies.csv',
        'data/movies/links.csv',
    )
)

In [1]:
from gensim.models import KeyedVectors
# Load the word vectors from the .vec file with gensim's word2vec-format
# reader; binary=False (the default) selects the plain-text variant.
ft_model = KeyedVectors.load_word2vec_format(
    'models/wiki-news-300d-1M-subword.vec',
    binary=False,
)

In [2]:
# Sanity check: print what the loaded model considers most similar to 'desk'.
desk_neighbours = ft_model.most_similar('desk')
print(desk_neighbours)

[('desks', 0.8139737844467163), ('desk-', 0.8030417561531067), ('desk.', 0.778192400932312), ('front-desk', 0.7296270132064819), ('ref-desk', 0.7272905111312866), ('deskside', 0.7197455167770386), ('help-desk', 0.715452253818512), ('writing-desk', 0.7056628465652466), ('refdesk', 0.6872211694717407), ('Desk', 0.6861226558685303)]


In [None]:
from nltk.tokenize import RegexpTokenizer
# Word tokenizer: every maximal run of \w characters becomes one token,
# so punctuation and whitespace are discarded.
tokenizer = RegexpTokenizer(r'\w+')

# Stop-word sets already loaded from NLTK, keyed by language name, so the
# corpus is read once per language instead of once per call.
_STOPWORD_CACHE = {}


def _stopword_set(language):
    '''Return the NLTK stop-word list for `language` as a set, loading it once.'''
    if language not in _STOPWORD_CACHE:
        # Imported locally so this function does not depend on a `stopwords`
        # name having been defined by some other cell.
        from nltk.corpus import stopwords
        _STOPWORD_CACHE[language] = set(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def clean_text(text, remove_stopwords=True, language='english'):
    '''Lower-case and tokenize `text`, optionally removing stop words.

    Args:
        text: string to clean.
        remove_stopwords: when True, tokens found in the stop-word list
            for `language` are dropped.
        language: name of the NLTK stop-word list to use. Defaults to
            'english' because the titles fed to this function are
            restricted to ASCII earlier in the notebook.

    Returns:
        list of lower-cased word tokens (not a joined string).
    '''
    tokens = tokenizer.tokenize(text.lower())

    # Optionally, remove stop words
    if remove_stopwords:
        stops = _stopword_set(language)
        tokens = [w for w in tokens if w not in stops]

    return tokens

def is_ascii(s):
    '''Return True when every character of `s` has a code point below 128.

    An empty `s` yields True.
    '''
    for ch in s:
        if ord(ch) >= 128:
            return False
    return True

# Drop every movie whose title contains at least one non-ASCII character.
has_non_ascii = df_movies['title'].apply(lambda t: not is_ascii(t))
df_movies = df_movies.drop(df_movies[has_non_ascii].index)

def process_title(title):
    '''Strip release years, sequel markers and parentheses from a movie title.

    Removes, in order: parentheses, every run of digits, the standalone
    word "part"/"Part", and the standalone Roman numerals II, III and IV.
    Whitespace is then collapsed to single spaces and trimmed.

    Args:
        title: raw title string, e.g. 'Rocky III (1982)'.

    Returns:
        The cleaned title; an empty string when nothing is left.
    '''
    # strip away parentheses and numbers
    title = title.replace('(', '').replace(')', '')
    title = re.sub(r'\d+', '', title)
    # strip away the word "part" -- word boundaries keep "Party",
    # "Departed", "Apartment" etc. intact
    title = re.sub(r'\b[Pp]art\b', '', title)
    # strip II, III and IV as whole words only; a plain chained replace of
    # 'II' first would turn 'III' into 'I' and never reach the 'III' case
    title = re.sub(r'\b(?:III|II|IV)\b', '', title)
    # the removals above leave doubled/trailing spaces behind
    return ' '.join(title.split())

# Normalise every title, then discard the rows whose title is blank
# once surrounding whitespace is stripped.
df_movies['title'] = df_movies['title'].apply(process_title)
blank_title_rows = df_movies[df_movies['title'].str.strip() == ''].index
df_movies = df_movies.drop(blank_title_rows)

In [None]:
# Length of the zero vector used as the accumulator below; it has to match
# the dimensionality of the vectors stored in ft_model.
vec_dim = 300

def create_average_vec(doc):
    '''Average the ft_model vectors of the words in `doc`.

    Args:
        doc: iterable of word tokens.

    Returns:
        float32 array of length `vec_dim` holding the mean vector of the
        tokens present in ft_model; all zeros when no token is known.
    '''
    average = np.zeros((vec_dim,), dtype='float32')
    num_words = 0.
    for word in doc:
        # Membership is tested on the KeyedVectors object itself: this works
        # on gensim 3.x and 4.x, whereas `.wv.vocab` is gone in gensim 4.
        if word in ft_model:
            average = np.add(average, ft_model[word])
            num_words += 1.
    # Leave the zero vector untouched when nothing matched (avoids 0/0).
    if num_words != 0.:
        average = np.divide(average, num_words)
    return average


def create_doc2vec(text):
    '''Turn raw `text` into a single vector.

    The text is tokenized with clean_text and the resulting tokens are
    averaged with create_average_vec.
    '''
    return create_average_vec(clean_text(text))

# Attach one averaged vector per cleaned title and persist the result.
# The column is written to, and the CSV saved from, the same frame
# (df_movies) -- `data` and `urls` are not defined anywhere in this notebook.
df_movies['doc2vec'] = df_movies['title'].apply(create_doc2vec)
df_movies.to_csv('data/movies/movies_titles_with_fasttext.csv')

In [None]:
# Create links to imdb website
def make_urls(df_links):
    '''Return a copy of `df_links` with an IMDb URL column instead of raw ids.

    Args:
        df_links: DataFrame with at least the columns 'imdbId' and 'tmdbId'.

    Returns:
        New DataFrame with a 'pagePath' column added and the 'imdbId' and
        'tmdbId' columns removed. The input frame is left unmodified.
    '''
    # IMDb title ids are "tt" + the numeric id zero-padded to seven digits;
    # read_csv parses imdbId as an integer, which drops the leading zeros.
    # A fixed 'tt00' prefix is only correct for five-digit ids.
    imdb_ids = df_links['imdbId'].astype(str).str.zfill(7)
    return (
        df_links
        .assign(pagePath='https://www.imdb.com/title/tt' + imdb_ids)
        .drop(['imdbId', 'tmdbId'], axis=1)
    )
# Rebind df_links to the URL-bearing frame. Not re-runnable as-is: after the
# first run df_links no longer has an 'imdbId' column, so calling make_urls
# on it again raises KeyError -- reload links.csv before re-running.
df_links = make_urls(df_links)

In [None]:
# Merge ratings, links and movies on movieId; the result is indexed by (userId, movieId)
def merge_df(df_ratings, df_links, df_movies):
    '''Join ratings with their link and movie rows on 'movieId'.

    Both joins are left joins, so every rating row is kept; a rating whose
    movieId is missing from `df_links` or `df_movies` gets NaN in the
    corresponding columns. 'timestamp' is copied into 'visitStartTime'.

    Returns:
        DataFrame indexed by ('userId', 'movieId') with the columns
        visitStartTime, pagePath, title, genres and rating, in that order.
    '''
    wanted_columns = ['visitStartTime', 'pagePath', 'title', 'genres', 'rating']
    merged = (
        df_ratings
        .merge(df_links, on='movieId', how='left')
        .merge(df_movies, on='movieId', how='left')
        .assign(visitStartTime=lambda frame: frame['timestamp'])
        .set_index(['userId', 'movieId'])
    )
    return merged[wanted_columns]
# Final table: ratings joined with their link and movie rows,
# indexed by (userId, movieId).
df_result = merge_df(df_ratings, df_links, df_movies)