In [1]:
import pandas as pd

from src import utils
from src import pre_processing

In [2]:
# Read and pre-process the data (JSON to DataFrame)
data = pre_processing.pre_processing_reli_dataset('../datasets/reli/')
data.head(1)

Unnamed: 0,tokens,aspect_tags,review,author,sentence
0,"[Um, livro, muito, bom, que, retrata, a, cruel...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,amado,1


In [3]:
# Count the number of aspects per record
# (the output below shows the resulting `total_aspects` column)
data = utils.aspect_counter(data)
data.head(1)

Unnamed: 0,tokens,aspect_tags,review,author,sentence,total_aspects
0,"[Um, livro, muito, bom, que, retrata, a, cruel...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,amado,1,1


In [4]:
# Flag whether each record has an aspect or not
# (the output below shows the resulting `has_aspect` column)
data = utils.has_aspect(data)
data.head(1)

Unnamed: 0,tokens,aspect_tags,review,author,sentence,total_aspects,has_aspect
0,"[Um, livro, muito, bom, que, retrata, a, cruel...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,amado,1,1,1


In [5]:
# Count reviews per author in the original dataset
# ('nunique' over `review`; presumably a group-by on `author` -- confirm in utils.summary)
reviews_per_author = utils.summary(data, 'author', 'review', 'nunique')
# Label the aggregated column as the original-dataset baseline
reviews_per_author = reviews_per_author.rename(columns={'review': 'dataset_original'})

In [6]:
# Count the number of aspects per author
# ('sum' over `total_aspects`; presumably a group-by on `author` -- confirm in utils.summary)
aspects_per_author = utils.summary(data, 'author', 'total_aspects', 'sum')
# Label the aggregated column as the original-dataset baseline
aspects_per_author = aspects_per_author.rename(columns={'total_aspects': 'dataset_original'})

In [7]:
# Count the number of reviews with aspects
# NOTE(review): this sums the per-record `has_aspect` flag, so it presumably counts
# records rather than distinct reviews -- confirm against utils.summary / utils.has_aspect
reviews_with_aspects = utils.summary(data, 'author', 'has_aspect', 'sum')
# Label the aggregated column as the original-dataset baseline
reviews_with_aspects = reviews_with_aspects.rename(columns={'has_aspect': 'dataset_original'})

In [8]:
# Column names used for the stratification (passed to utils.stratified_k_fold)
y = 'has_aspect'  # target column
X = ['tokens', 'aspect_tags', 'total_aspects', 'has_aspect', 'author', 'sentence']  # feature columns
groups = 'review'  # grouping column; presumably keeps a review's rows in one fold -- TODO confirm

In [9]:
def agg(original_data, fold_data, fold, colname, agg_name):
    """Append one fold's per-author summary to an accumulated summary table.

    Args:
        original_data: DataFrame accumulated so far; the fold column is merged into it.
        fold_data: rows belonging to the fold being summarised.
        fold: fold identifier, used to build the new column name ``fold_<fold>``.
        colname: column of ``fold_data`` handed to ``utils.summary``.
        agg_name: aggregation name handed to ``utils.summary`` (e.g. 'sum').

    Returns:
        The outer merge of ``original_data`` with the renamed fold summary.
        No join key is given, so pandas joins on the columns both frames share.
    """
    fold_column = f'fold_{fold}'
    per_author = utils.summary(fold_data, 'author', colname, agg_name).rename(
        columns={colname: fold_column}
    )
    return original_data.merge(per_author, how='outer')

def test(data, threshold):
    return data.groupby(['fold']).agg({'review': 'nunique'}).review.std() <= threshold

In [10]:
# Draw 10-fold stratified splits until the per-fold review counts are balanced
# enough: `test` accepts a split when the standard deviation of distinct reviews
# per fold is <= 3. Retrying only helps if utils.stratified_k_fold yields a
# different split on each call (presumably it shuffles -- confirm in utils).
# NOTE(review): no random seed is set in this cell, so the accepted split may
# differ between runs.
MAX_ATTEMPTS = 1000  # upper bound so an unreachable threshold cannot hang the kernel
stritified_data = None
for attempt in range(MAX_ATTEMPTS):
    new_data = utils.stratified_k_fold(data, X, y, groups, k=10)
    if test(new_data, 3.):
        stritified_data = new_data.copy()
        break
else:
    # Reached only when no attempt satisfied the threshold
    raise RuntimeError(
        f'no stratified split satisfied the balance threshold after {MAX_ATTEMPTS} attempts'
    )

In [11]:
# Saving (disabled): uncomment the line below to write the accepted split to disk
# stritified_data.to_csv('../datasets/processed/reli_stratified.csv', index=False)

In [12]:
# Read the data (rebinds `data` to the stratified CSV; the cells below rely on its `fold` column)
data = pd.read_csv('../datasets/processed/reli_stratified.csv')

In [13]:
# Number of distinct reviews per fold
data.groupby(['fold']).agg({'review': 'nunique'}).reset_index()

Unnamed: 0,fold,review
0,1,145
1,2,142
2,3,145
3,4,141
4,5,140
5,6,147
6,7,147
7,8,146
8,9,141
9,10,141


In [14]:
# Dataset statistics: for every fold, merge a `fold_<n>` column into each
# per-author summary table next to its `dataset_original` baseline
for fold in data.fold.unique():
    # Rows that belong to the current fold
    curr = data[data.fold == fold]
    reviews_per_author = agg(reviews_per_author, curr, fold, 'review', 'nunique')
    aspects_per_author = agg(aspects_per_author, curr, fold, 'total_aspects', 'sum')
    reviews_with_aspects = agg(reviews_with_aspects, curr, fold, 'has_aspect', 'sum')

In [15]:
# Reviews per author
# NOTE(review): rebinds the table to the output of utils.fold_summary, so re-running
# this cell feeds the summarised table back in -- presumably not idempotent, confirm
reviews_per_author = utils.fold_summary(reviews_per_author)
reviews_per_author

Unnamed: 0,author,dataset_original,fold_1,fold_2,fold_3,fold_4,fold_5,fold_6,fold_7,fold_8,fold_9,fold_10,fold_avg,fold_std
0,meyer,24.74,26.9,25.35,26.21,24.82,26.43,22.45,25.85,22.6,21.99,24.82,24.74,1.69
1,saramago,17.42,20.69,14.08,17.24,19.86,17.14,17.01,18.37,17.12,17.02,15.6,17.41,1.8
2,sheldon,14.36,12.41,16.9,15.17,13.48,13.57,14.97,17.01,13.7,12.06,14.18,14.34,1.6
3,orwell,12.82,11.03,15.49,10.34,14.18,12.86,12.24,9.52,17.12,14.18,11.35,12.83,2.28
4,amado,12.06,11.03,13.38,15.17,12.06,9.29,12.93,12.24,10.96,12.77,10.64,12.05,1.57
5,reboucas,9.76,11.72,9.15,8.97,7.09,12.14,10.2,8.16,6.85,9.93,13.48,9.77,2.07
6,salinger,8.85,6.21,5.63,6.9,8.51,8.57,10.2,8.84,11.64,12.06,9.93,8.85,2.06


In [16]:
# Number of aspects per author
# NOTE(review): rebinds the table to the output of utils.fold_summary, so re-running
# this cell feeds the summarised table back in -- presumably not idempotent, confirm
aspects_per_author = utils.fold_summary(aspects_per_author)
aspects_per_author

Unnamed: 0,author,dataset_original,fold_1,fold_2,fold_3,fold_4,fold_5,fold_6,fold_7,fold_8,fold_9,fold_10,fold_avg,fold_std
0,meyer,30.22,38.51,24.02,30.64,36.55,35.58,26.94,31.71,23.15,25.89,30.29,30.33,5.08
1,sheldon,18.09,9.32,24.02,17.45,13.2,18.75,17.35,25.37,23.65,14.21,16.0,17.93,4.9
2,orwell,13.44,12.42,13.41,10.64,18.27,15.38,17.35,4.88,14.29,15.23,12.57,13.44,3.6
3,amado,12.13,8.7,15.08,17.02,8.63,10.1,10.05,8.78,12.32,15.74,14.29,12.07,3.07
4,saramago,10.26,16.15,6.7,9.79,8.63,5.29,9.59,12.68,12.32,10.66,12.0,10.38,2.98
5,salinger,8.34,8.7,7.82,6.38,5.58,4.33,11.42,9.27,9.85,10.66,9.71,8.37,2.19
6,reboucas,7.53,6.21,8.94,8.09,9.14,10.58,7.31,7.32,4.43,7.61,5.14,7.48,1.77


In [18]:
# Number of reviews with an aspect, per author
# NOTE(review): rebinds the table to the output of utils.fold_summary, so re-running
# this cell feeds the summarised table back in -- presumably not idempotent, confirm
reviews_with_aspects = utils.fold_summary(reviews_with_aspects)
reviews_with_aspects

Unnamed: 0,author,dataset_original,fold_1,fold_2,fold_3,fold_4,fold_5,fold_6,fold_7,fold_8,fold_9,fold_10,fold_avg,fold_std
0,meyer,29.05,38.57,24.53,29.85,35.5,35.06,25.13,27.54,21.71,25.0,29.66,29.26,5.27
1,sheldon,17.38,9.29,23.27,17.41,13.02,14.94,17.95,25.75,21.71,13.37,15.86,17.26,4.82
2,orwell,14.38,14.29,15.09,12.44,17.16,15.52,17.44,5.99,16.0,16.28,13.1,14.33,3.18
3,amado,12.02,8.57,13.84,14.93,10.06,12.07,10.26,8.98,13.14,14.53,13.1,11.95,2.2
4,saramago,10.61,15.0,7.55,9.45,8.88,5.75,10.77,13.17,12.57,11.05,13.1,10.73,2.71
5,salinger,8.49,7.14,7.55,7.46,6.51,4.6,10.77,10.18,9.71,11.63,8.97,8.45,2.06
6,reboucas,8.07,7.14,8.18,8.46,8.88,12.07,7.69,8.38,5.14,8.14,6.21,8.03,1.73
