In [1]:
import string

import pandas as pd
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from rake_nltk import Rake
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from unidecode import unidecode

### Tratamento dos dados

In [2]:
movies_df = pd.read_csv('../data/movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
overview_df = pd.read_csv('../data/overviews.csv')
overview_df.head()

Unnamed: 0,movieId,tmdbId,overview
0,1,862,"Led by Woody, Andy's toys live happily in his ..."
1,2,8844,When siblings Judy and Peter discover an encha...
2,3,15602,A family wedding reignites the ancient feud be...
3,4,31357,"Cheated on, mistreated and stepped on, the wom..."
4,5,11862,Just when George Banks has recovered from his ...


In [4]:
# Load the ratings and discard the timestamp column in a single expression.
ratings_df = pd.read_csv('../data/ratings.csv').drop(columns=['timestamp'])
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [5]:
ratings_df.shape

(100836, 3)

In [6]:
movies_df.shape

(9742, 3)

In [7]:
overview_df.shape

(9742, 3)

In [8]:
ratings_df.isna().sum()

userId     0
movieId    0
rating     0
dtype: int64

In [9]:
movies_df.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [10]:
overview_df.isna().sum()

movieId       0
tmdbId        0
overview    117
dtype: int64

In [11]:
# Ids of the movies whose overview is missing; they are filtered out of every frame below.
list_movies_missing_overview = overview_df.loc[overview_df['overview'].isna(), 'movieId'].tolist()

In [12]:
# Keep only the rows without missing values, then confirm nothing is left missing.
overview_df = overview_df.dropna()
overview_df.isna().sum()

movieId     0
tmdbId      0
overview    0
dtype: int64

In [13]:
# Discard the ratings that point at the movies dropped for lacking an overview.
has_missing_overview = ratings_df['movieId'].isin(list_movies_missing_overview)
ratings_df = ratings_df.loc[~has_missing_overview]
del has_missing_overview

In [14]:
# Discard the dropped movies from the movies frame as well.
has_missing_overview = movies_df['movieId'].isin(list_movies_missing_overview)
movies_df = movies_df.loc[~has_missing_overview]
del has_missing_overview

In [15]:
movies_df.shape

(9625, 3)

In [16]:
overview_df.shape

(9625, 3)

In [17]:
ratings_df.shape

(100525, 3)

In [18]:
# Left-join the overviews onto the movies; both frames are indexed by movieId for the join.
experiment_df = movies_df.set_index('movieId').join(overview_df.set_index('movieId'), how='left')

In [19]:
# Turn the movieId index back into a regular column.
experiment_df = experiment_df.reset_index()
experiment_df.head()

Unnamed: 0,movieId,title,genres,tmdbId,overview
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,862,"Led by Woody, Andy's toys live happily in his ..."
1,2,Jumanji (1995),Adventure|Children|Fantasy,8844,When siblings Judy and Peter discover an encha...
2,3,Grumpier Old Men (1995),Comedy|Romance,15602,A family wedding reignites the ancient feud be...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,31357,"Cheated on, mistreated and stepped on, the wom..."
4,5,Father of the Bride Part II (1995),Comedy,11862,Just when George Banks has recovered from his ...


In [20]:
# Blank out the '(no genres listed)' placeholder wherever it appears as a cell value.
experiment_df = experiment_df.replace('(no genres listed)', '')

In [21]:
# Turn the pipe-separated genres string into a list of lower-case words.
experiment_df['genres'] = experiment_df['genres'].str.lower().str.split('|')

In [22]:
# tmdbId is not used below; index the frame by movieId.
experiment_df = experiment_df.drop(columns=['tmdbId']).set_index('movieId')

In [23]:
experiment_df.head()

Unnamed: 0_level_0,title,genres,overview
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),"[adventure, animation, children, comedy, fantasy]","Led by Woody, Andy's toys live happily in his ..."
2,Jumanji (1995),"[adventure, children, fantasy]",When siblings Judy and Peter discover an encha...
3,Grumpier Old Men (1995),"[comedy, romance]",A family wedding reignites the ancient feud be...
4,Waiting to Exhale (1995),"[comedy, drama, romance]","Cheated on, mistreated and stepped on, the wom..."
5,Father of the Bride Part II (1995),[comedy],Just when George Banks has recovered from his ...


### Iniciando o processo de recomendação

#### Definindo funções

In [24]:
def nlp_pre_process(corpus, stop_words_remv = True, lemmatizetion = True, stemmization = True):
    """Normalize a text with optional stop-word removal, lemmatization and stemming.

    The text is always lower-cased and transliterated to ASCII. When at least
    one of the flags is set, it is tokenized with NLTK, the selected steps are
    applied to the tokens in order (stop words -> lemmas -> stems) and the
    tokens are joined back with single spaces. When no flag is set, the
    lower-cased text is returned without being tokenized.

    Args:
        corpus: Text to normalize.
        stop_words_remv: Drop English stop words and punctuation tokens.
        lemmatizetion: Lemmatize each token with WordNet.
        stemmization: Stem each token with the Porter stemmer.

    Returns:
        The normalized text as a string.
    """
    corpus = corpus.lower()

    if stop_words_remv or lemmatizetion or stemmization:
        tokens = word_tokenize(corpus)

        if stop_words_remv:
            # Filtering is independent of lemmatization: previously the stop
            # set was only consulted inside the lemmatization branch, so
            # stop_words_remv=True with lemmatizetion=False removed nothing.
            stopset = set(stopwords.words('english')) | set(string.punctuation)
            tokens = [token for token in tokens if token not in stopset]

        if lemmatizetion:
            lemmatizer = WordNetLemmatizer()
            tokens = [lemmatizer.lemmatize(token) for token in tokens]

        if stemmization:
            stemmer = PorterStemmer()
            tokens = [stemmer.stem(token) for token in tokens]

        corpus = " ".join(tokens)

    # transliterate non-ASCII characters to their closest ASCII form
    return unidecode(corpus)

In [25]:
def apply_rake(df):
    """Fill the 'key_words' column with the RAKE key words of each overview.

    Args:
        df: DataFrame with an 'overview' column of strings. It is modified
            in place: the 'key_words' column is created or overwritten.

    Returns:
        The same DataFrame, with a list of key words in each 'key_words' cell.
    """
    # A single Rake instance is reused for every row; by default it uses the
    # NLTK English stop words and discards all punctuation characters.
    rake = Rake()

    def _extract_key_words(overview):
        # Extract the key words of one text and return the words that
        # received a degree score.
        rake.extract_keywords_from_text(overview)
        return list(rake.get_word_degrees().keys())

    # Assign the whole column at once. Writing to the `row` yielded by
    # DataFrame.iterrows() is not guaranteed to reach the DataFrame, because
    # the row may be a copy.
    df['key_words'] = pd.Series(
        [_extract_key_words(overview) for overview in df['overview']],
        index=df.index,
        dtype=object,
    )

    return df

In [26]:
def create_bag_of_words(df):
    """Collapse every non-title column into a single 'bag_of_words' string.

    Each cell of the source columns is expected to be either a list of words
    (joined with spaces) or a plain string (used as is). The source columns
    are dropped afterwards, so only 'title' (when present) and 'bag_of_words'
    remain.

    Args:
        df: DataFrame to transform. It is modified in place.

    Returns:
        The same DataFrame, reduced to 'title' and 'bag_of_words'.
    """
    source_columns = [col for col in df.columns if col not in ('title', 'bag_of_words')]

    def _row_to_text(row):
        # Join the words of every source column of one row into a single string.
        parts = []
        for col in source_columns:
            value = row[col]
            # A plain string is kept whole; ' '.join(value) would put a space
            # between each of its characters.
            parts.append(value if isinstance(value, str) else ' '.join(value))
        return ' '.join(parts)

    # Assign the whole column at once. Writing to the `row` yielded by
    # DataFrame.iterrows() is not guaranteed to reach the DataFrame, because
    # the row may be a copy.
    df['bag_of_words'] = pd.Series(
        [_row_to_text(row) for _, row in df[source_columns].iterrows()],
        index=df.index,
        dtype=object,
    )

    df.drop(columns=source_columns, inplace=True)

    return df

In [27]:
def pre_process(movies_df, stopwords_removal=True, lemmatization=True, stemmization=True):
    """Build the 'title' / 'bag_of_words' frame used by the recommender.

    Works on a copy of ``movies_df``: normalizes the overviews, extracts their
    key words with RAKE, drops the overview text and merges the remaining
    non-title columns into one 'bag_of_words' string per movie.

    Args:
        movies_df: DataFrame with an 'overview' column of strings plus the
            columns that should end up in the bag of words.
        stopwords_removal: Forwarded to ``nlp_pre_process``.
        lemmatization: Forwarded to ``nlp_pre_process``.
        stemmization: Forwarded to ``nlp_pre_process``.

    Returns:
        A new DataFrame; ``movies_df`` itself is not modified.
    """
    df = movies_df.copy()

    # Series.apply returns a new Series, so the result has to be assigned
    # back. Previously it was discarded and the three flags had no effect.
    df['overview'] = df['overview'].apply(
        nlp_pre_process, args=(stopwords_removal, lemmatization, stemmization)
    )

    df['key_words'] = ''

    df = apply_rake(df)

    df.drop(columns=['overview'], inplace = True)

    df = create_bag_of_words(df)

    return df

In [28]:
def recommender(movies_interacteds, movies_not_interacteds, movie_id_list, cosine_similarity):
    """Recommend up to 10 candidate movies most similar to the interacted ones.

    Each candidate is scored with its mean cosine similarity to the interacted
    movies, and the best-scoring candidates are returned.

    Args:
        movies_interacteds: Ids of the movies that form the user profile.
        movies_not_interacteds: Ids of the candidate movies.
        movie_id_list: Movie ids in the row/column order of the matrix.
        cosine_similarity: Square similarity matrix aligned with
            ``movie_id_list``. Inside this function the name shadows the
            scikit-learn function of the same name.

    Returns:
        A list with at most 10 candidate ids taken from
        ``movies_not_interacteds``, ordered from most to least similar.

    Raises:
        KeyError: If an interacted or candidate id is not in ``movie_id_list``.
    """
    movie_to_index = {movie_id: index for index, movie_id in enumerate(movie_id_list)}

    candidates = list(movies_not_interacteds)
    if not candidates:
        return []

    similarity_matrix = np.asarray(cosine_similarity)
    profile_rows = [movie_to_index[movie] for movie in movies_interacteds]
    # Columns are looked up from the candidates themselves, so each score
    # stays aligned with its candidate regardless of the order of the list.
    candidate_cols = [movie_to_index[movie] for movie in candidates]

    if profile_rows:
        # mean similarity of every candidate to the interacted movies
        average_similarity = similarity_matrix[np.ix_(profile_rows, candidate_cols)].mean(axis=0)
    else:
        # no profile: every candidate scores the same
        average_similarity = np.zeros(len(candidates))

    # Positions of the 10 best scores, best first; a stable sort keeps ties
    # in candidate order so the result is deterministic.
    top_10_positions = np.argsort(-average_similarity, kind='stable')[:10]

    return [candidates[position] for position in top_10_positions]

In [29]:
def generate_similarity_matrix(df):
    """Return the pairwise cosine similarity of the rows' 'bag_of_words' texts.

    The texts are vectorized with TF-IDF and every row is compared against
    every other row.

    Args:
        df: DataFrame with a 'bag_of_words' column of strings.

    Returns:
        The matrix produced by ``cosine_similarity`` for the TF-IDF vectors,
        with one row and one column per row of ``df``.
    """
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df['bag_of_words'])

    return cosine_similarity(tfidf_matrix, tfidf_matrix)

In [30]:
def get_recomendations(movies_df, ratings_df, user_id, rating_threshold=4.0, frac=0.5, seed=5, cosine_sim=None):
    """Recommend movies to one user from a sample of the movies they liked.

    The user's ratings at or above ``rating_threshold`` form the profile. A
    random fraction of that profile is used as the interacted movies, and
    every other movie of ``movies_df`` is a candidate.

    Args:
        movies_df: DataFrame indexed by movie id with a 'bag_of_words' column.
        ratings_df: DataFrame with 'userId', 'movieId' and 'rating' columns.
        user_id: Id of the user to recommend for.
        rating_threshold: Minimum rating for a movie to enter the profile.
        frac: Fraction of the profile sampled as interacted movies.
        seed: Random state of the sampling.
        cosine_sim: Optional precomputed similarity matrix aligned with
            ``movies_df.index``. When omitted it is built from ``movies_df``,
            which is expensive; pass it in when calling repeatedly with the
            same ``movies_df``.

    Returns:
        The list of recommended movie ids returned by ``recommender``.
    """
    liked = ratings_df[(ratings_df.userId == user_id) & (ratings_df.rating >= rating_threshold)]

    # part of the liked movies that plays the role of the known interactions
    user_preference_1 = liked.sample(frac=frac, random_state=seed)

    if cosine_sim is None:
        cosine_sim = generate_similarity_matrix(movies_df)

    movie_id_list = list(movies_df.index)
    user_interact_ids = list(user_preference_1.movieId)
    user_not_interact_ids = list(movies_df.index[~movies_df.index.isin(user_preference_1.movieId)])

    return recommender(user_interact_ids, user_not_interact_ids, movie_id_list, cosine_sim)

In [31]:
def evaluation_recommendation(movies_df, ratings_df, rating_threshold=4.0, frac=0.5, seed=5):
    """Generate and store the recommendations of every user in ``ratings_df``.

    The movies are pre-processed once with every NLP step enabled. Then, for
    each user id in ascending order, one ``user_id,"id, id, ..."`` line is
    appended to '../result/recomendations.csv'. If that write fails, the same
    line is appended to '../result/fails.csv' instead.

    Args:
        movies_df: Raw movies frame accepted by ``pre_process``.
        ratings_df: DataFrame with 'userId', 'movieId' and 'rating' columns.
        rating_threshold: Forwarded to ``get_recomendations``.
        frac: Forwarded to ``get_recomendations``.
        seed: Forwarded to ``get_recomendations``.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    movies_df = pre_process(movies_df, stopwords_removal=True, lemmatization=True, stemmization=True)

    # sorted so the output file has a deterministic order
    user_ids = sorted(set(ratings_df.userId.tolist()))

    for user_id in user_ids:

        # NOTE: get_recomendations builds the similarity matrix itself on each
        # call when it is not given one, which dominates the running time.
        movies_recomended = get_recomendations(movies_df, ratings_df, user_id, rating_threshold, frac, seed)

        # Join the ids one by one. str(list)[1:-1] would write the repr of
        # each element, which for numpy scalars on NumPy 2 looks like
        # "np.int64(1)".
        line = '{},"{}"\n'.format(user_id, ', '.join(str(movie_id) for movie_id in movies_recomended))

        try:
            with open('../result/recomendations.csv', 'a') as recomendations:
                recomendations.write(line)
        except OSError:
            print('Falha ao gravar as recomendações do id {}'.format(user_id))
            with open('../result/fails.csv', 'a') as fails:
                fails.write(line)
        else:
            print('Recomendações do id {} gravado.'.format(user_id))
    

In [38]:
# Every (stop-word removal, lemmatization, stemming) on/off combination,
# from all disabled to all enabled.
combinations_pre_process = [
    (stop_flag, lemma_flag, stem_flag)
    for stop_flag in (False, True)
    for lemma_flag in (False, True)
    for stem_flag in (False, True)
]

In [621]:
# Run the pre-processing pipeline once for every flag combination.
# NOTE(review): each iteration rebinds `movies_df`, so only the result of the
# last combination is kept and the filtered movies frame built earlier in the
# notebook is overwritten — confirm this is intended.
for combination in combinations_pre_process:
    stopword, lemma, stemm = combination
    print(combination)
    movies_df = pre_process(experiment_df, stopword, lemma, stemm)

(False, False, False)
(False, False, True)
(False, True, False)
(False, True, True)
(True, False, False)
(True, False, True)
(True, True, False)
(True, True, True)


In [None]:
evaluation_recommendation(experiment_df, ratings_df)

In [43]:
from time import sleep, perf_counter
from concurrent.futures import ThreadPoolExecutor


def task(id):
    """Announce the task, sleep for ``id`` seconds and return a completion message."""
    start_message = f'Starting the task {id}...'
    print(start_message)

    sleep(id)

    done_message = f'Done with task {id}'
    return done_message

# Start of the wall-clock measurement.
start = perf_counter()

# Run one task per duration on a thread pool with the default number of
# workers; executor.map yields the results in the order of the inputs.
with ThreadPoolExecutor() as executor:
    results = executor.map(task, [1,2,3,4,5,6,7,8])
    for result in results:
        print(result)

# End of the measurement, taken after the pool has been shut down.
finish = perf_counter()

print(f"It took {finish-start} second(s) to finish.")

Starting the task 1...
Starting the task 2...
Starting the task 3...
Starting the task 4...
Starting the task 5...
Starting the task 6...
Starting the task 7...
Starting the task 8...
Done with task 1
Done with task 2
Done with task 3
Done with task 4
Done with task 5
Done with task 6
Done with task 7
Done with task 8
It took 8.012819829000364 second(s) to finish.
