In [1]:
import pandas as pd

from src import utils
from src import pre_processing

In [2]:
# Read the ReLi dataset from disk and pre-process it into a DataFrame
# (original note: "json to dataframe"). The single row displayed below shows
# one record with its token list, aspect tags, review id, author and sentence id.
data = pre_processing.pre_processing_reli_dataset('../datasets/reli/')
data.head(1)

Unnamed: 0,tokens,aspect_tags,review,author,sentence
0,"[Um, livro, muito, bom, que, retrata, a, cruel...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,amado,1


In [3]:
# Count the aspects of every record; the row displayed below shows the
# resulting `total_aspects` column.
data = data.pipe(utils.aspect_counter)
data.head(1)

Unnamed: 0,tokens,aspect_tags,review,author,sentence,total_aspects
0,"[Um, livro, muito, bom, que, retrata, a, cruel...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,amado,1,1


In [4]:
# Flag whether each record has an aspect or not; the row displayed below
# shows the resulting `has_aspect` column.
data = data.pipe(utils.has_aspect)
data.head(1)

Unnamed: 0,tokens,aspect_tags,review,author,sentence,total_aspects,has_aspect
0,"[Um, livro, muito, bom, que, retrata, a, cruel...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,amado,1,1,1


In [5]:
# Unique reviews per author in the original (pre-split) dataset. The value
# column is renamed to `dataset_original` so the per-fold columns merged in
# later sit next to this baseline.
reviews_per_author = (
    utils.summary(data, 'author', 'review', 'nunique')
    .rename(columns={'review': 'dataset_original'})
)

In [6]:
# Total number of aspects per author in the original (pre-split) dataset,
# stored under `dataset_original` as the baseline for the per-fold columns.
aspects_per_author = (
    utils.summary(data, 'author', 'total_aspects', 'sum')
    .rename(columns={'total_aspects': 'dataset_original'})
)

In [7]:
# Per-author sum of the `has_aspect` flag in the original (pre-split) dataset
# (the original note calls this "number of reviews with aspects").
# NOTE(review): the rendered total for this table (1697) is larger than the
# number of unique reviews (1435), so this presumably counts rows with an
# aspect rather than distinct reviews -- confirm the intended unit.
reviews_with_aspects = (
    utils.summary(data, 'author', 'has_aspect', 'sum')
    .rename(columns={'has_aspect': 'dataset_original'})
)

In [8]:
# Column names used for the stratification; all three are handed to
# utils.stratified_k_fold in the resampling cell below.
# `y` is the label column and `groups` the grouping column -- presumably rows
# sharing a `review` are kept in the same fold; verify in utils.
# NOTE(review): `has_aspect` appears both in `X` and as `y` -- confirm this is intended.
y = 'has_aspect'
X = ['tokens', 'aspect_tags', 'total_aspects', 'has_aspect', 'author', 'sentence']
groups = 'review'

In [9]:
def agg(original_data, fold_data, fold, colname, agg_name):
    """Attach one fold's per-author statistic to a summary table.

    Args:
        original_data: Per-author summary table with an ``author`` column; the
            new fold column is added to it.
        fold_data: Rows belonging to a single fold.
        fold: Fold identifier; the new column is named ``fold_<fold>``.
        colname: Column of ``fold_data`` to aggregate.
        agg_name: Aggregation name forwarded to ``utils.summary``
            (the notebook uses ``'nunique'`` and ``'sum'``).

    Returns:
        A new DataFrame: ``original_data`` outer-merged with the fold statistic
        on ``author``, so authors present on only one side are kept.
    """
    fold_col = f'fold_{fold}'
    fold_stats = utils.summary(fold_data, 'author', colname, agg_name)
    fold_stats = fold_stats.rename(columns={colname: fold_col})
    # If the table already has this fold's column (e.g. the cell was re-run),
    # replace it. An implicit merge would otherwise use (author, fold_<n>) as
    # the join key and duplicate rows whenever the values differ.
    base = original_data.drop(columns=[fold_col], errors='ignore')
    return base.merge(fold_stats, on='author', how='outer')

def test(data, threshold):
    return data.groupby(['fold']).agg({'review': 'nunique'}).review.std() <= threshold

In [None]:
# Keep drawing 10-fold stratified splits until one is balanced enough, i.e.
# until the standard deviation of unique reviews per fold is at most 3
# (see `test`). The accepted split is stored in `stritified_data`; the
# misspelled name is kept because the save cell below refers to it.
# NOTE(review): there is no retry cap and no seed visible here -- the split is
# presumably randomized (otherwise retrying could never change the outcome),
# so this loop may run for a long time and is not reproducible; confirm.
stritified_data = None
while stritified_data is None:
    new_data = utils.stratified_k_fold(data, X, y, groups, k=10)
    if test(new_data, 3.):
        stritified_data = new_data.copy()

In [None]:
# Save the accepted split to disk. Left commented out -- presumably so that
# re-running the notebook does not overwrite the CSV loaded by the next cell;
# TODO confirm, and uncomment only when a new split should be persisted.
# stritified_data.to_csv('../datasets/processed/reli_stratified.csv', index=False)

In [10]:
# Read the persisted stratified split. This rebinds `data`, replacing the
# in-memory frame built by the cells above.
# NOTE(review): list-valued columns such as `tokens` / `aspect_tags` presumably
# come back from CSV as plain strings -- confirm before using them downstream.
data = pd.read_csv('../datasets/processed/reli_stratified.csv')

In [11]:
# Number of unique reviews in each fold.
data.groupby('fold')['review'].nunique().reset_index()

Unnamed: 0,fold,review
0,1,145
1,2,142
2,3,145
3,4,141
4,5,140
5,6,147
6,7,147
7,8,146
8,9,141
9,10,141


In [12]:
# Dataset statistics: for every fold, add a `fold_<n>` column to each of the
# three per-author tables (unique reviews, aspect totals, `has_aspect` sums).
for fold in data['fold'].unique():
    fold_rows = data.loc[data['fold'] == fold]
    reviews_per_author = agg(reviews_per_author, fold_rows, fold, 'review', 'nunique')
    aspects_per_author = agg(aspects_per_author, fold_rows, fold, 'total_aspects', 'sum')
    reviews_with_aspects = agg(reviews_with_aspects, fold_rows, fold, 'has_aspect', 'sum')

In [13]:
# Unique reviews per author, original dataset vs. each fold. In the rendered
# output the summarized table carries `fold_avg` / `fold_std` columns and a
# final `total` row.
# NOTE(review): the table is rebound to its own summary, so re-running this
# cell would presumably summarize an already-summarized table -- confirm.
reviews_per_author = utils.fold_summary(reviews_per_author)
reviews_per_author

Unnamed: 0,author,dataset_original,fold_1,fold_2,fold_3,fold_4,fold_5,fold_6,fold_7,fold_8,fold_9,fold_10,fold_avg,fold_std
0,meyer,355,39,36,38,35,37,33,38,33,31,35,35.5,2.5
1,saramago,250,30,20,25,28,24,25,27,25,24,22,25.0,2.7
2,sheldon,206,18,24,22,19,19,22,25,20,17,20,20.6,2.5
3,orwell,184,16,22,15,20,18,18,14,25,20,16,18.4,3.2
4,amado,173,16,19,22,17,13,19,18,16,18,15,17.3,2.4
5,reboucas,140,17,13,13,10,17,15,12,10,14,19,14.0,2.9
6,salinger,127,9,8,10,12,12,15,13,17,17,14,12.7,3.0
7,total,1435,145,142,145,141,140,147,147,146,141,141,143.5,2.6


In [14]:
# Number of aspects per author, original dataset vs. each fold. In the
# rendered output the summarized table carries `fold_avg` / `fold_std`
# columns and a final `total` row.
# NOTE(review): the table is rebound to its own summary, so re-running this
# cell would presumably summarize an already-summarized table -- confirm.
aspects_per_author = utils.fold_summary(aspects_per_author)
aspects_per_author

Unnamed: 0,author,dataset_original,fold_1,fold_2,fold_3,fold_4,fold_5,fold_6,fold_7,fold_8,fold_9,fold_10,fold_avg,fold_std
0,meyer,598,62,43,72,72,74,59,65,47,51,53,59.8,10.5
1,sheldon,358,15,43,41,26,39,38,52,48,28,28,35.8,10.7
2,orwell,266,20,24,25,36,32,38,10,29,30,22,26.6,7.8
3,amado,240,14,27,40,17,21,22,18,25,31,25,24.0,7.2
4,saramago,203,26,12,23,17,11,21,26,25,21,21,20.3,5.1
5,salinger,165,14,14,15,11,9,25,19,20,21,17,16.5,4.6
6,reboucas,149,10,16,19,18,22,16,15,9,15,9,14.9,4.2
7,total,1979,161,179,235,197,208,219,205,203,197,175,197.9,20.6


In [15]:
# Per-author sum of `has_aspect`, original dataset vs. each fold (the original
# note calls this "number of reviews with an aspect per author").
# NOTE(review): the `total` row shown below (1697) exceeds the 1435 unique
# reviews, so the unit is presumably rows with an aspect, not reviews -- confirm.
# NOTE(review): the table is rebound to its own summary, so re-running this
# cell would presumably summarize an already-summarized table -- confirm.
reviews_with_aspects = utils.fold_summary(reviews_with_aspects)
reviews_with_aspects

Unnamed: 0,author,dataset_original,fold_1,fold_2,fold_3,fold_4,fold_5,fold_6,fold_7,fold_8,fold_9,fold_10,fold_avg,fold_std
0,meyer,493,54,39,60,60,61,49,46,38,43,43,49.3,8.4
1,sheldon,295,13,37,35,22,26,35,43,38,23,23,29.5,8.9
2,orwell,244,20,24,25,29,27,34,10,28,28,19,24.4,6.3
3,amado,204,12,22,30,17,21,20,15,23,25,19,20.4,4.9
4,saramago,180,21,12,19,15,10,21,22,22,19,19,18.0,4.0
5,salinger,144,10,12,15,11,8,21,17,17,20,13,14.4,4.1
6,reboucas,137,10,13,17,15,21,15,14,9,14,9,13.7,3.6
7,total,1697,140,159,201,169,174,195,167,175,172,145,169.7,18.1
