## Leitor Compulsivo

The reviews were taken from the blog Leitor Compulsivo.

In [None]:
import os
import pandas as pd
import nltk

In [None]:
# Directory that holds one .txt file per Leitor Compulsivo review.
path_lc = '../corpus/leitor_compulsivo/'
# os.listdir returns names in arbitrary order; sorting makes the review
# numbering derived from this list reproducible across runs and machines.
# Names containing '.gitkeep' are placeholders, not reviews.
files_lc = sorted(
    file for file in os.listdir(path_lc) if '.gitkeep' not in file
)

In [None]:
# Load every review file into one DataFrame with the columns
# text / review / bookname_author.
# The records are collected first and the frame is built once: calling
# pd.concat inside the loop copies the whole frame on every iteration and
# starts from an empty frame.
# NOTE(review): the files are opened with the platform default encoding --
# confirm they are UTF-8 and pass encoding= explicitly if so.
records = []
for i, file in enumerate(files_lc):
    with open(os.path.join(path_lc, file), 'r') as f:
        records.append({
            'text': f.read(),
            # 1-based id following the order of files_lc
            'review': i + 1,
            # file name without the '.txt' marker
            'bookname_author': file.replace('.txt', ''),
        })
# Explicit columns keep the schema even when no review file was found.
data_lc = pd.DataFrame(records, columns=['text', 'review', 'bookname_author'])
data_lc.shape

In [None]:
# Save the Leitor Compulsivo reviews as CSV, leaving the DataFrame index out.
data_lc.to_csv(
    path_or_buf='../corpus/preprocessed/reviews_lc.csv',
    index=False,
)

In [None]:
def reviews_to_sentences(input_path, output_path, language='english'):
    """Split every review of a CSV file into sentences, one per output line.

    The CSV at ``input_path`` is loaded with pandas, rows containing any
    missing value are dropped, and each entry of the ``text`` column is
    split with ``nltk.sent_tokenize``.

    Args:
        input_path: Path of a CSV file that has a ``text`` column.
        output_path: Path of the text file that receives the sentences.
            It is opened in append mode, so calling this function twice
            with the same path writes the sentences a second time.
        language: Name of the Punkt model handed to ``nltk.sent_tokenize``.
            Defaults to ``'english'``, the value NLTK uses when no
            language is given, so existing callers are unaffected.
            NOTE(review): the reviews handled in this file are Portuguese;
            consider passing ``'portuguese'`` if that model is installed.

    Returns:
        Always ``True``.
    """
    data = pd.read_csv(input_path).dropna()
    # Tokenize everything before touching the output file, so a tokenizer
    # failure cannot leave a partially appended file behind.
    sentences = [
        sentence
        for review in data.text
        for sentence in nltk.sent_tokenize(review, language=language)
    ]
    # UTF-8 is requested explicitly so accented characters are written the
    # same way on every platform instead of following the locale encoding.
    with open(output_path, 'a', encoding='utf-8') as f:
        f.writelines(sentence + '\n' for sentence in sentences)
    return True

In [None]:
# Write the Leitor Compulsivo reviews as one sentence per line.
reviews_to_sentences(
    input_path='../corpus/preprocessed/reviews_lc.csv',
    output_path='../corpus/preprocessed/reviews_lc.txt',
)

## Minha Vida Literária

In [None]:
# Directory that holds one .txt file per Minha Vida Literária review.
path_mvl = '../corpus/minha_vida_literaria/'
# os.listdir returns names in arbitrary order; sorting makes the review
# numbering derived from this list reproducible across runs and machines.
# Names containing '.gitkeep' are placeholders, not reviews.
files_mvl = sorted(
    file for file in os.listdir(path_mvl) if '.gitkeep' not in file
)

In [None]:
# Markers of the metadata lines in a review file; any line that contains
# one of these substrings is dropped when the reviews are loaded.
to_remove = [
    'Ficha Técnica',
    # book details
    'Título:', 'Título original:', 'Autor:', 'Tradução:', 'Editora:',
    'Número de Páginas:', 'Ano de Publicação:', 'Data de Publicação:',
    # links and section header
    'Skoob:', 'Compre:', 'Resenha:',
]

In [None]:
# Load every Minha Vida Literária review into one DataFrame with the columns
# text / review / bookname, dropping the metadata lines listed in to_remove.
# The records are collected first and the frame is built once: calling
# pd.concat inside the loop copies the whole frame on every iteration and
# starts from an empty frame.
# NOTE(review): the files are opened with the platform default encoding --
# confirm they are UTF-8 and pass encoding= explicitly if so.
records = []
for i, file in enumerate(files_mvl):
    with open(os.path.join(path_mvl, file), 'r') as f:
        # Skip empty lines and lines made of a single non-breaking space.
        curr_review = [line for line in f.read().split('\n')
                       if line and (line != '\xa0')]
    # Keep only the lines that contain none of the metadata markers.
    new_review = [line for line in curr_review
                  if not any(tr in line for tr in to_remove)]
    # Join with a space: joining with '' glued the last word of a line to
    # the first word of the next one, which hides sentence boundaries from
    # the tokenizer. Remaining non-breaking spaces are still removed.
    new_review = ' '.join(new_review).replace('\xa0', '')
    records.append({
        'text': new_review,
        # 1-based id following the order of files_mvl
        'review': i + 1,
        # file name without the '.txt' marker
        'bookname': file.replace('.txt', ''),
    })
# Explicit columns keep the schema even when no review file was found.
data_mvl = pd.DataFrame(records, columns=['text', 'review', 'bookname'])
data_mvl.review = data_mvl.review.astype(int)
data_mvl.shape

In [None]:
# Preview the first five rows of the loaded reviews.
data_mvl.head(n=5)

In [None]:
# Save the Minha Vida Literária reviews as CSV, leaving the DataFrame index out.
data_mvl.to_csv(
    path_or_buf='../corpus/preprocessed/reviews_mvl.csv',
    index=False,
)

In [None]:
# Write the Minha Vida Literária reviews as one sentence per line.
reviews_to_sentences(
    input_path='../corpus/preprocessed/reviews_mvl.csv',
    output_path='../corpus/preprocessed/reviews_mvl.txt',
)

# TV

In [None]:
import pandas as pd

from nltk.tokenize import sent_tokenize

In [None]:
# load the B2W reviews dataset
data = pd.read_csv(
    filepath_or_buffer='../corpus/tv/b2w_reviews.csv',
    low_memory=False,
)

In [None]:
# names of the two columns used below: product category and review text
cat_col, review_col = 'site_category_lv2', 'review_text'

In [None]:
# keep only the review texts of rows whose category is TV
is_tv = data[cat_col] == 'TV'
b2w_reviews = data.loc[is_tv, review_col].values.tolist()
len(b2w_reviews)

In [None]:
# load the Mercado Livre reviews, one review per line
with open('../corpus/tv/meli_tv_reviews.txt', 'r') as meli_file:
    meli = meli_file.read().split('\n')

# drop reviews shorter than 2 characters
meli_reviews = list(filter(lambda text: len(text) > 1, meli))
len(meli_reviews), meli_reviews[:3]

In [None]:
# load the Amazon reviews, one review per line
with open('../corpus/tv/amazon_reviews.txt', 'r') as f:
    amazon = f.read().split('\n')

# drop reviews shorter than 2 characters
amazon_reviews = [review for review in amazon if len(review) > 1]
# Show the size of the filtered list (the cell used to show len(amazon),
# the unfiltered count), matching the Mercado Livre cell.
len(amazon_reviews), amazon_reviews[:3]

In [None]:
# merge the reviews of the three sources, in the order B2W, Mercado Livre, Amazon
reviews = [*b2w_reviews, *meli_reviews, *amazon_reviews]
len(reviews)

In [None]:
# discard every entry that is not a str
reviews = list(filter(lambda item: isinstance(item, str), reviews))
len(reviews)

In [None]:
# Write all TV reviews to a single file, one review per line
# (no newline after the last one).
tv_corpus = '\n'.join(reviews)
with open('../corpus/preprocessed/tv_reviews.txt', 'w') as out_file:
    out_file.write(tv_corpus)

In [None]:
# total number of whitespace-separated tokens over all reviews
sentence_size = sum(len(text.split()) for text in reviews)
sentence_size