In [1]:
import pandas as pd

from src import utils
from src import pre_processing

In [2]:
# Read and pre-process the ReLi data (JSON converted to a DataFrame).
data = pre_processing.pre_processing_reli_dataset('../datasets/reli/')
data.head(1)

Unnamed: 0,tokens,aspect_tags,review,author,sentence
0,"[Um, livro, muito, bom, que, retrata, a, cruel...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,amado,1


In [3]:
# Count the number of aspects in each record (the result gains a `total_aspects` column).
data = utils.aspect_counter(data)
data.head(1)

Unnamed: 0,tokens,aspect_tags,review,author,sentence,total_aspects
0,"[Um, livro, muito, bom, que, retrata, a, cruel...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,amado,1,1


In [4]:
# Flag whether each record has an aspect or not (the result gains a `has_aspect` column).
data = utils.has_aspect(data)
data.head(1)

Unnamed: 0,tokens,aspect_tags,review,author,sentence,total_aspects,has_aspect
0,"[Um, livro, muito, bom, que, retrata, a, cruel...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,amado,1,1,1


In [5]:
# Distinct reviews per author in the original dataset; the value column is
# renamed so that per-fold columns can later be merged next to it.
reviews_per_author = (
    utils.summary(data, 'author', 'review', 'nunique')
    .rename(columns={'review': 'dataset_original'})
)

In [6]:
# Total number of aspects per author in the original dataset; the value column
# is renamed so that per-fold columns can later be merged next to it.
aspects_per_author = (
    utils.summary(data, 'author', 'total_aspects', 'sum')
    .rename(columns={'total_aspects': 'dataset_original'})
)

In [7]:
# Per-author 'sum' of `has_aspect` in the original dataset.
# NOTE(review): the name says "reviews", but the totals shown further down (1697)
# exceed the number of distinct reviews (1435), so this presumably counts records
# (sentences) that have an aspect rather than reviews — confirm and relabel.
reviews_with_aspects = (
    utils.summary(data, 'author', 'has_aspect', 'sum')
    .rename(columns={'has_aspect': 'dataset_original'})
)

In [8]:
# Column names for the stratification step.
# NOTE(review): none of these three names is used by the cells visible in this
# notebook, and 'has_aspect' is both the target `y` and a member of `X` —
# confirm that this is intended.
y = 'has_aspect'
X = ['tokens', 'aspect_tags', 'total_aspects', 'has_aspect', 'author', 'sentence']
groups = 'review'

In [9]:
def agg(original_data, fold_data, fold, colname, agg_name):
    """Append one fold's per-author summary as a new column of a running table.

    Parameters
    ----------
    original_data : DataFrame
        Running table that the fold column is merged into.
    fold_data : DataFrame
        Rows belonging to a single fold.
    fold : hashable
        Fold identifier; the new column is named ``fold_<fold>``.
    colname : str
        Column of ``fold_data`` handed to ``utils.summary``.
    agg_name : str
        Aggregation name handed to ``utils.summary`` (e.g. 'nunique', 'sum').

    Returns
    -------
    DataFrame
        Outer merge of ``original_data`` with the fold summary. No join key is
        given, so pandas joins on whatever columns the two frames share.
    """
    fold_column = f'fold_{fold}'
    per_author = utils.summary(fold_data, 'author', colname, agg_name)
    per_author = per_author.rename(columns={colname: fold_column})
    merged = original_data.merge(per_author, how='outer')
    return merged

def test(data, threshold):
    return data.groupby(['fold']).agg({'review': 'nunique'}).review.std() <= threshold

In [10]:
# salvando
# stritified_data.to_csv('../datasets/processed/reli_stratified.csv', index=False)

In [11]:
# Load the stratified dataset from disk. This rebinds `data`, so the cells below
# work on the frame read from the CSV (which carries the `fold` column).
data = pd.read_csv('../datasets/processed/reli_stratified.csv')

In [12]:
# Number of distinct reviews in each fold.
data.groupby('fold')['review'].nunique().reset_index()

Unnamed: 0,fold,review
0,1,145
1,2,142
2,3,145
3,4,141
4,5,140
5,6,147
6,7,147
7,8,146
8,9,141
9,10,141


In [13]:
# Dataset statistics, per author and per fold.
def gen_stats():
    """Build the three per-author summary tables with one column per fold.

    Reads the notebook globals ``data`` (must have a ``fold`` column),
    ``reviews_per_author``, ``aspects_per_author`` and ``reviews_with_aspects``.
    The three base tables are copied, so the globals are left untouched.

    Returns
    -------
    tuple of DataFrame
        ``(stats_a, stats_b, stats_c)``: the copies of ``reviews_per_author``,
        ``aspects_per_author`` and ``reviews_with_aspects``, each extended with
        a ``fold_<k>`` column per fold holding, respectively, the 'nunique' of
        ``review``, the 'sum' of ``total_aspects`` and the 'sum' of
        ``has_aspect`` for that fold.
    """
    stats_a = reviews_per_author.copy()
    stats_b = aspects_per_author.copy()
    stats_c = reviews_with_aspects.copy()
    # Visit folds in ascending order: `unique()` follows row order, which made
    # the fold_<k> columns come out shuffled once rows had been moved around.
    for fold in sorted(data.fold.unique()):
        curr = data[data.fold == fold]
        stats_a = agg(stats_a, curr, fold, 'review', 'nunique')
        stats_b = agg(stats_b, curr, fold, 'total_aspects', 'sum')
        stats_c = agg(stats_c, curr, fold, 'has_aspect', 'sum')
    return stats_a, stats_b, stats_c

In [14]:
# Generate the three statistics tables (one per base summary) across folds.
rpa_final, apa_final, rwa_final = gen_stats()

In [15]:
# Reviews per author.
# NOTE(review): the cell rebinds `rpa_final` to its own summary, so running it
# twice feeds an already summarized table back into fold_summary — confirm that
# is harmless, or re-run the gen_stats() cell first.
rpa_final = utils.fold_summary(rpa_final)
rpa_final

Unnamed: 0,author,dataset_original,fold_1,fold_2,fold_3,fold_4,fold_5,fold_6,fold_7,fold_8,fold_9,fold_10,fold_avg,fold_std
0,meyer,355,39,36,38,35,37,33,38,33,31,35,35.5,2.5
1,saramago,250,30,20,25,28,24,25,27,25,24,22,25.0,2.7
2,sheldon,206,18,24,22,19,19,22,25,20,17,20,20.6,2.5
3,orwell,184,16,22,15,20,18,18,14,25,20,16,18.4,3.2
4,amado,173,16,19,22,17,13,19,18,16,18,15,17.3,2.4
5,reboucas,140,17,13,13,10,17,15,12,10,14,19,14.0,2.9
6,salinger,127,9,8,10,12,12,15,13,17,17,14,12.7,3.0
7,total,1435,145,142,145,141,140,147,147,146,141,141,143.5,2.6


In [16]:
# Number of aspects per author.
# NOTE(review): the cell rebinds `apa_final` to its own summary, so running it
# twice feeds an already summarized table back into fold_summary — confirm that
# is harmless, or re-run the gen_stats() cell first.
apa_final = utils.fold_summary(apa_final)
apa_final

Unnamed: 0,author,dataset_original,fold_1,fold_2,fold_3,fold_4,fold_5,fold_6,fold_7,fold_8,fold_9,fold_10,fold_avg,fold_std
0,meyer,598,62,43,72,72,74,59,65,47,51,53,59.8,10.5
1,sheldon,358,15,43,41,26,39,38,52,48,28,28,35.8,10.7
2,orwell,266,20,24,25,36,32,38,10,29,30,22,26.6,7.8
3,amado,240,14,27,40,17,21,22,18,25,31,25,24.0,7.2
4,saramago,203,26,12,23,17,11,21,26,25,21,21,20.3,5.1
5,salinger,165,14,14,15,11,9,25,19,20,21,17,16.5,4.6
6,reboucas,149,10,16,19,18,22,16,15,9,15,9,14.9,4.2
7,total,1979,161,179,235,197,208,219,205,203,197,175,197.9,20.6


In [17]:
# Per-author 'sum' of `has_aspect` (labelled "reviews with aspects").
# NOTE(review): the total shown here is larger than the number of distinct
# reviews, so this presumably counts records with an aspect — confirm the label.
rwa_final = utils.fold_summary(rwa_final)
rwa_final

Unnamed: 0,author,dataset_original,fold_1,fold_2,fold_3,fold_4,fold_5,fold_6,fold_7,fold_8,fold_9,fold_10,fold_avg,fold_std
0,meyer,493,54,39,60,60,61,49,46,38,43,43,49.3,8.4
1,sheldon,295,13,37,35,22,26,35,43,38,23,23,29.5,8.9
2,orwell,244,20,24,25,29,27,34,10,28,28,19,24.4,6.3
3,amado,204,12,22,30,17,21,20,15,23,25,19,20.4,4.9
4,saramago,180,21,12,19,15,10,21,22,22,19,19,18.0,4.0
5,salinger,144,10,12,15,11,8,21,17,17,20,13,14.4,4.1
6,reboucas,137,10,13,17,15,21,15,14,9,14,9,13.7,3.6
7,total,1697,140,159,201,169,174,195,167,175,172,145,169.7,18.1


Balancear manualmente o número de aspectos por partição (aspects_per_author).

In [18]:
def rebalance(df, n, fold_from, fold_to, random_state=None):
    """Move randomly chosen reviews between folds until ``n`` aspects have moved.

    On each pass the aspects of every review still in ``fold_from`` are summed,
    one review is drawn at random among those holding between 1 and the number
    of aspects still to be moved, and all of its rows are reassigned to
    ``fold_to``.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns ``fold``, ``review`` and ``total_aspects``.
        It is modified in place.
    n : int
        Number of aspects to transfer.
    fold_from, fold_to
        Source and destination fold identifiers.
    random_state : optional
        Forwarded to ``Series.sample`` to make the draw reproducible.

    Returns
    -------
    DataFrame
        The same ``df`` object, with the ``fold`` column updated.

    Notes
    -----
    The previous version drew with ``sample(frac=0.1)``, which yields zero rows
    when there are only a handful of candidates and then failed with an
    IndexError. Exactly one candidate is drawn now, and when no review fits the
    remaining budget the function warns and stops instead of crashing.
    """
    import warnings

    remaining = n
    while remaining > 0:
        # Recomputed every pass because the previous pass moved a review away.
        per_review = df.loc[df.fold == fold_from].groupby('review')['total_aspects'].sum()
        candidates = per_review[(per_review > 0) & (per_review <= remaining)]
        if candidates.empty:
            warnings.warn(
                f'rebalance: no review in fold {fold_from} has between 1 and '
                f'{remaining} aspects; {remaining} of {n} aspects were not moved'
            )
            break
        picked = candidates.sample(n=1, random_state=random_state)
        remaining -= picked.iloc[0]
        df.loc[df.review == picked.index[0], 'fold'] = fold_to
    return df

In [69]:
# Move reviews worth 1 aspect in total from fold 9 to fold 2.
# NOTE(review): this comment used to mention folds 8 and 6, which did not match
# the arguments; the cell appears to have been re-run by hand with different
# values, and that sequence of moves is not recorded in the notebook.
data = rebalance(data, 1, 9, 2)

# Check the resulting balance of aspects per author.
_, apa_final, _ = gen_stats()
apa_final = utils.fold_summary(apa_final)
apa_final

Unnamed: 0,author,dataset_original,fold_1,fold_5,fold_3,fold_2,fold_10,fold_4,fold_6,fold_7,fold_8,fold_9,fold_avg,fold_std
0,meyer,598,74,75,59,41,50,79,57,64,44,55,59.8,12.4
1,sheldon,358,22,31,34,49,37,23,34,51,48,29,35.8,9.9
2,orwell,266,14,31,31,28,29,34,34,9,29,27,26.6,7.9
3,amado,240,37,14,21,34,26,17,16,19,25,31,24.0,7.5
4,saramago,203,25,11,24,13,23,17,21,23,25,21,20.3,4.7
5,salinger,165,14,10,15,21,18,10,20,17,19,21,16.5,3.9
6,reboucas,149,11,26,14,12,15,18,16,15,8,14,14.9,4.5
7,total,1979,197,198,198,198,198,198,198,198,198,198,197.9,0.3


In [84]:
def rebalance_reviews(df, z, n, fold_from, fold_to, random_state=None):
    """Move up to ``n`` random reviews holding exactly ``z`` aspects between folds.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns ``fold``, ``review`` and ``total_aspects``.
        It is modified in place.
    z : int
        Total number of aspects a review must have to be eligible.
    n : int
        Maximum number of reviews to move; fewer are moved when fewer match.
    fold_from, fold_to
        Source and destination fold identifiers.
    random_state : optional
        Forwarded to ``Series.sample`` to make the draw reproducible.

    Returns
    -------
    DataFrame
        The same ``df`` object, with the ``fold`` column updated.

    Notes
    -----
    Candidates are computed from ``df``. The previous version read the global
    ``data`` here, so it ignored the frame it was given.
    """
    per_review = df.loc[df.fold == fold_from].groupby('review')['total_aspects'].sum()
    matching = per_review[per_review == z]
    # Shuffle, then keep the first n: tolerates n larger than the candidate count.
    chosen = matching.sample(frac=1., random_state=random_state).head(n).index
    df.loc[df.review.isin(chosen), 'fold'] = fold_to
    return df

In [92]:
# Move 1 review that has 0 aspects from fold 6 to fold 7, then rebuild and
# display the reviews-per-author table to check the balance.
data = rebalance_reviews(data, 0, 1, 6, 7)
rpa_final, _, _ = gen_stats()
rpa_final = utils.fold_summary(rpa_final)
rpa_final

Unnamed: 0,author,dataset_original,fold_1,fold_5,fold_3,fold_2,fold_10,fold_9,fold_4,fold_6,fold_7,fold_8,fold_avg,fold_std
0,meyer,355,44,36,33,34,36,32,38,34,37,31,35.5,3.5
1,saramago,250,28,27,27,19,17,24,28,30,25,25,25.0,3.9
2,sheldon,206,19,15,21,26,25,18,17,20,24,21,20.6,3.4
3,orwell,184,12,19,19,20,21,20,19,16,13,25,18.4,3.6
4,amado,173,19,13,20,20,16,18,17,14,20,16,17.3,2.4
5,reboucas,140,13,20,13,13,17,14,12,17,12,9,14.0,3.0
6,salinger,127,8,13,11,11,13,17,12,14,12,16,12.7,2.5
7,total,1435,143,143,144,143,145,143,143,145,143,143,143.5,0.8


In [93]:
# Regenerate the three statistics tables from the current state of `data`.
rpa_final, apa_final, rwa_final = gen_stats()

In [94]:
# Reviews per author, from the tables regenerated by the preceding gen_stats() call.
rpa_final = utils.fold_summary(rpa_final)
rpa_final

Unnamed: 0,author,dataset_original,fold_1,fold_5,fold_3,fold_2,fold_10,fold_9,fold_4,fold_6,fold_7,fold_8,fold_avg,fold_std
0,meyer,355,44,36,33,34,36,32,38,34,37,31,35.5,3.5
1,saramago,250,28,27,27,19,17,24,28,30,25,25,25.0,3.9
2,sheldon,206,19,15,21,26,25,18,17,20,24,21,20.6,3.4
3,orwell,184,12,19,19,20,21,20,19,16,13,25,18.4,3.6
4,amado,173,19,13,20,20,16,18,17,14,20,16,17.3,2.4
5,reboucas,140,13,20,13,13,17,14,12,17,12,9,14.0,3.0
6,salinger,127,8,13,11,11,13,17,12,14,12,16,12.7,2.5
7,total,1435,143,143,144,143,145,143,143,145,143,143,143.5,0.8


In [95]:
# Number of aspects per author, from the tables regenerated by the preceding gen_stats() call.
apa_final = utils.fold_summary(apa_final)
apa_final

Unnamed: 0,author,dataset_original,fold_1,fold_5,fold_3,fold_2,fold_10,fold_9,fold_4,fold_6,fold_7,fold_8,fold_avg,fold_std
0,meyer,598,74,75,59,41,50,55,79,57,64,44,59.8,12.4
1,sheldon,358,22,31,34,49,37,29,23,34,51,48,35.8,9.9
2,orwell,266,14,31,31,28,29,27,34,34,9,29,26.6,7.9
3,amado,240,37,14,21,34,26,31,17,16,19,25,24.0,7.5
4,saramago,203,25,11,24,13,23,21,17,21,23,25,20.3,4.7
5,salinger,165,14,10,15,21,18,21,10,20,17,19,16.5,3.9
6,reboucas,149,11,26,14,12,15,14,18,16,15,8,14.9,4.5
7,total,1979,197,198,198,198,198,198,198,198,198,198,197.9,0.3


In [96]:
# Per-author 'sum' of `has_aspect`, from the tables regenerated by the preceding gen_stats() call.
rwa_final = utils.fold_summary(rwa_final)
rwa_final

Unnamed: 0,author,dataset_original,fold_1,fold_5,fold_3,fold_2,fold_10,fold_9,fold_4,fold_6,fold_7,fold_8,fold_avg,fold_std
0,meyer,493,66,59,47,40,43,45,66,47,45,35,49.3,10.1
1,sheldon,295,20,20,28,42,31,24,19,31,42,38,29.5,8.4
2,orwell,244,14,26,31,28,26,25,28,29,9,28,24.4,6.7
3,amado,204,27,14,17,29,22,25,17,14,16,23,20.4,5.2
4,saramago,180,20,10,20,13,21,19,15,21,19,22,18.0,3.8
5,salinger,144,11,8,15,18,14,20,10,17,15,16,14.4,3.6
6,reboucas,137,11,24,13,10,14,13,15,15,14,8,13.7,4.1
7,total,1697,169,161,171,180,171,171,170,174,160,170,169.7,5.5


In [97]:
# Persist the current fold assignment.
# NOTE(review): this overwrites the same CSV that an earlier cell reads with
# pd.read_csv, so a later Run All starts from the rebalanced folds, not the
# original stratification.
data.to_csv('../datasets/processed/reli_stratified.csv', index=False)