In [1]:
import pandas as pd

from src import utils
from src import pre_processing

In [2]:
# lendo e pré-processando os dados (json to dataframe)
data = pre_processing.pre_processing_reli_dataset('../datasets/reli/')
data.head(1)

Unnamed: 0,tokens,aspect_tags,author
0,"[Um, livro, muito, bom, que, retrata, a, cruel...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",amado


In [3]:
# contabilizando o número de aspectos por registro
data = utils.aspect_counter(data)
data.head(1)

Unnamed: 0,tokens,aspect_tags,author,num_aspects
0,"[Um, livro, muito, bom, que, retrata, a, cruel...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",amado,1


In [4]:
# definindo se um registro possui aspecto ou não
data = utils.has_aspect(data)
data.head(1)

Unnamed: 0,tokens,aspect_tags,author,num_aspects,has_aspect
0,"[Um, livro, muito, bom, que, retrata, a, cruel...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",amado,1,1


In [5]:
# contabilizando reviews por author do dataset original
author_summary = data.author.value_counts(normalize=True).reset_index().rename(
    columns={'index': 'author', 'author': 'original_dataset'})
author_summary

Unnamed: 0,author,original_dataset
0,meyer,0.258018
1,orwell,0.187399
2,saramago,0.158656
3,sheldon,0.124488
4,amado,0.107071
5,salinger,0.094508
6,reboucas,0.069858


In [6]:
# contabilizando a quantidade de registros por número de aspectos na frase do dataset original
num_aspects_summary = data.num_aspects.value_counts(normalize=True).reset_index().rename(
    columns={'num_aspects': 'original_dataset', 'index': 'num_aspects'})
num_aspects_summary

Unnamed: 0,num_aspects,original_dataset
0,0,0.838489
1,1,0.140192
2,2,0.017227
3,3,0.002855
4,4,0.001047
5,5,0.00019


In [7]:
# contabilizando a quantidade de registros com e sem aspectos do dataset original
aspects_summary = data.has_aspect.value_counts(normalize=True).reset_index().rename(
    columns={'index': 'has_aspect', 'has_aspect': 'original_dataset'})
aspects_summary

Unnamed: 0,has_aspect,original_dataset
0,0,0.838489
1,1,0.161511


In [8]:
# nome das colunas para estraficacão
y_col = 'author'
X_cols = ['tokens', 'aspect_tags', 'num_aspects', 'has_aspect']

In [9]:
# gerando os folds com estratificação
data = utils.stratified_k_fold(data, X_cols, y_col, k=10)

In [10]:
# número de registros por fold
data.fold.value_counts().reset_index()

Unnamed: 0,index,fold
0,1,1051
1,2,1051
2,3,1051
3,4,1051
4,5,1051
5,6,1051
6,7,1051
7,8,1050
8,9,1050
9,10,1050


In [14]:
# lendo a base salva
data = pd.read_csv('../datasets/processed/reli_stratified.csv')

In [15]:
# gerando estatísticas dos folds
for fold in data.fold.unique():

    # filtrando
    curr = data[data.fold == fold]

    # contabilizando reviews por author
    curr_author_summary = curr.author.value_counts(normalize=True).reset_index().rename(
        columns={'index': 'author', 'author': f'fold_{fold}'})
    author_summary = author_summary.merge(curr_author_summary, how='outer')

    # contabilizando a quantidade de registros por número de aspectos na frase
    curr_num_aspects_summary = curr.num_aspects.value_counts(normalize=True).reset_index().rename(
        columns={'num_aspects': f'fold_{fold}', 'index': 'num_aspects'})
    num_aspects_summary = num_aspects_summary.merge(curr_num_aspects_summary, how='outer')

    # contabilizando a quantidade de registros com e sem aspectos
    curr_aspects_summary = curr.has_aspect.value_counts(normalize=True).reset_index().rename(
        columns={'has_aspect': f'fold_{fold}', 'index': 'has_aspect'})
    aspects_summary = aspects_summary.merge(curr_aspects_summary, how='outer')

In [16]:
# reviews por author
for col in author_summary.iloc[:, 1:].columns:
    author_summary[col] = author_summary[col] * 100
author_summary['fold_avg'] = author_summary.iloc[:, 2:].mean(axis=1)
author_summary['fold_std'] = author_summary.iloc[:, 2:].std(axis=1)
author_summary = round(author_summary, 2)
author_summary

Unnamed: 0,author,original_dataset,fold_1,fold_2,fold_3,fold_4,fold_5,fold_6,fold_7,fold_8,fold_9,fold_10,fold_avg,fold_std
0,meyer,25.8,25.78,25.78,25.78,25.78,25.78,25.88,25.78,25.81,25.81,25.81,25.8,0.03
1,orwell,18.74,18.74,18.74,18.74,18.74,18.74,18.65,18.74,18.76,18.76,18.76,18.74,0.03
2,saramago,15.87,15.79,15.79,15.89,15.89,15.89,15.89,15.89,15.9,15.9,15.81,15.87,0.04
3,sheldon,12.45,12.46,12.46,12.46,12.46,12.46,12.46,12.46,12.38,12.38,12.48,12.45,0.03
4,amado,10.71,10.75,10.75,10.75,10.75,10.75,10.66,10.66,10.67,10.67,10.67,10.71,0.04
5,salinger,9.45,9.51,9.51,9.42,9.42,9.42,9.42,9.42,9.43,9.43,9.52,9.45,0.04
6,reboucas,6.99,6.95,6.95,6.95,6.95,6.95,7.04,7.04,7.05,7.05,6.95,6.99,0.05


In [17]:
# quantidade de registros por número de aspectos na frase
for col in num_aspects_summary.iloc[:, 1:].columns:
    num_aspects_summary[col] = num_aspects_summary[col] * 100
num_aspects_summary['fold_avg'] = num_aspects_summary.iloc[:, 2:].mean(axis=1)
num_aspects_summary['fold_std'] = num_aspects_summary.iloc[:, 2:].std(axis=1)
num_aspects_summary = round(num_aspects_summary, 2)
num_aspects_summary

Unnamed: 0,num_aspects,original_dataset,fold_1,fold_2,fold_3,fold_4,fold_5,fold_6,fold_7,fold_8,fold_9,fold_10,fold_avg,fold_std
0,0,83.85,84.11,86.87,88.87,90.1,84.21,81.92,80.59,83.24,81.71,76.86,83.85,3.77
1,1,14.02,13.99,11.23,9.51,8.56,13.89,15.51,16.94,14.57,16.29,19.71,14.02,3.26
2,2,1.72,1.43,1.71,1.43,1.05,1.43,2.28,1.81,1.52,1.81,2.76,1.72,0.47
3,3,0.29,0.19,0.19,0.19,0.19,0.29,0.19,0.57,0.57,,0.48,0.32,0.16
4,4,0.1,0.19,,,0.1,0.19,0.1,0.1,0.1,0.19,0.1,0.13,0.05
5,5,0.02,0.1,,,,,,,,,0.1,0.1,0.0


In [18]:
# tem ou não aspecto
for col in aspects_summary.iloc[:, 1:].columns:
    aspects_summary[col] = aspects_summary[col] * 100
aspects_summary['fold_avg'] = aspects_summary.iloc[:, 2:].mean(axis=1)
aspects_summary['fold_std'] = aspects_summary.iloc[:, 2:].std(axis=1)
aspects_summary = round(aspects_summary, 2)
aspects_summary

Unnamed: 0,has_aspect,original_dataset,fold_1,fold_2,fold_3,fold_4,fold_5,fold_6,fold_7,fold_8,fold_9,fold_10,fold_avg,fold_std
0,0,83.85,84.11,86.87,88.87,90.1,84.21,81.92,80.59,83.24,81.71,76.86,83.85,3.77
1,1,16.15,15.89,13.13,11.13,9.9,15.79,18.08,19.41,16.76,18.29,23.14,16.15,3.77
