In [1]:
import pandas as pd

from src import utils
from src import pre_processing

In [2]:
# Read the raw TV reviews JSON and pre-process it into a DataFrame.
data = pre_processing.pre_processing_tv_dataset('../datasets/tv/data.json')
data.head(1)

Unnamed: 0,tokens,aspect_tags
0,"[Excelente, smart, tv, ., E, foi, entregue, ve...","[0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [3]:
# Count the number of aspects in each record (adds the `total_aspects` column).
data = utils.aspect_counter(data)
data.head(1)

Unnamed: 0,tokens,aspect_tags,total_aspects
0,"[Excelente, smart, tv, ., E, foi, entregue, ve...","[0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1


In [4]:
# Flag whether each record has an aspect or not (adds the `has_aspect` column).
data = utils.has_aspect(data)
data.head(1)

Unnamed: 0,tokens,aspect_tags,total_aspects,has_aspect
0,"[Excelente, smart, tv, ., E, foi, entregue, ve...","[0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1,1


In [5]:
# Review id derived from the index (index label + 1); its column name is passed
# further down as the `groups` argument of the fold generation.
data['review'] = data.index + 1

In [6]:
# Count reviews with / without aspects in the original dataset.
# rename_axis + reset_index(name=...) names both output columns explicitly, so
# the result is the same on pandas 1.x and 2.x (which label the columns of
# value_counts().reset_index() differently).
has_aspect = (
    data.has_aspect.value_counts()
    .rename_axis('has_aspect')
    .reset_index(name='original_dataset')
)
has_aspect

Unnamed: 0,has_aspect,original_dataset
0,1,989
1,0,102


In [7]:
# Column names used for the stratification.
y_col = 'has_aspect'
X_cols = ['tokens', 'aspect_tags', 'total_aspects']
groups = 'review'

In [8]:
# Generate the folds (k=10).
# NOTE(review): `stratified_data` is not referenced by any later cell in this
# notebook; the folds analysed afterwards come from the CSV that is loaded from
# disk — confirm how that file was produced.
stratified_data = utils.stratified_k_fold(data, X_cols, y_col, groups, k=10)

In [9]:
# Save (disabled).
# NOTE(review): this would write `data`, while the fold generation result was
# stored in `stratified_data` — confirm which frame is meant before re-enabling.
# data.to_csv('../datasets/processed/tv_stratified.csv', index=False)

In [10]:
# Load the previously saved dataset (it carries the `fold` column used below).
# NOTE(review): the final cell of this notebook writes to this same path, so a
# re-run starts from folds that were already rebalanced.
data = pd.read_csv('../datasets/processed/tv_stratified.csv')

In [11]:
# Per-fold statistics: add one `fold_<k>` column holding the has_aspect counts
# of each fold to the `has_aspect` summary table.
for fold in data.fold.unique():

    # Rows that belong to the current fold.
    curr = data[data.fold == fold]

    # Count records with / without aspects in this fold. Naming the columns via
    # rename_axis + reset_index(name=...) gives ['has_aspect', 'fold_<k>'] on
    # both pandas 1.x and 2.x.
    curr_has_aspect = (
        curr.has_aspect.value_counts()
        .rename_axis('has_aspect')
        .reset_index(name=f'fold_{fold}')
    )
    # Outer merge on the shared column(s) so a class missing from a fold is kept.
    has_aspect = has_aspect.merge(curr_has_aspect, how='outer')

In [12]:
# Summarise the per-fold has_aspect counts (the displayed result gains a
# `total` row and `fold_avg` / `fold_std` columns).
has_aspect = utils.fold_summary(has_aspect)
has_aspect

Unnamed: 0,has_aspect,original_dataset,fold_1,fold_2,fold_3,fold_4,fold_5,fold_6,fold_7,fold_8,fold_9,fold_10,fold_avg,fold_std
0,1,989,99,99,99,99,99,99,99,99,99,98,98.9,0.3
1,0,102,11,10,10,10,10,10,10,10,10,11,10.2,0.4
2,total,1091,110,109,109,109,109,109,109,109,109,109,109.1,0.3


In [13]:
# Number of reviews per fold, ordered by fold.
# rename_axis + reset_index(name=...) keeps the ['fold', 'reviews'] column names
# stable across pandas 1.x and 2.x.
reviews_per_fold = (
    data.fold.value_counts()
    .rename_axis('fold')
    .reset_index(name='reviews')
    .sort_values('fold')
)
reviews_per_fold

Unnamed: 0,fold,reviews
0,1,110
1,2,109
2,3,109
3,4,109
4,5,109
5,6,109
6,7,109
7,8,109
8,9,109
9,10,109


In [14]:
# Number of aspects per fold.
def aspect_per_fold(df):
    """Return the summed `total_aspects` of every fold plus two statistic rows.

    The per-fold sums come from ``utils.summary(df, 'fold', 'total_aspects',
    'sum')`` sorted by fold. Two extra rows are appended whose first cell is the
    label ('total', then 'std') and whose second cell is the sum / standard
    deviation of the per-fold sums. The frame is rounded to one decimal.

    Assumes ``utils.summary`` returns a two-column frame (fold, total_aspects)
    with a default 0..n-1 index — TODO confirm against the helper.
    """
    per_fold = utils.summary(df, 'fold', 'total_aspects', 'sum').sort_values('fold')

    # Both statistics are taken before any row is appended, so the 'std' row
    # is not influenced by the 'total' row.
    aspects = per_fold.total_aspects
    extra_rows = (('total', aspects.sum()), ('std', aspects.std()))

    for label, stat in extra_rows:
        per_fold.loc[len(per_fold)] = [label, stat]
    return round(per_fold, 1)

In [15]:
# Number of aspects per fold.
apf_stats = aspect_per_fold(data)
apf_stats

Unnamed: 0,fold,total_aspects
0,1,220.0
1,2,227.0
2,3,194.0
3,4,205.0
4,5,183.0
5,6,202.0
6,7,228.0
7,8,235.0
8,9,298.0
9,10,335.0


Balancear o número de aspectos por partição (fold).

In [16]:
# Review id column (index label + 1); the rebalancing helpers select rows by it.
data['review'] = data.index + 1

In [17]:
def rebalance(df, n, fold_from, fold_to, random_state=None):
    """Move randomly chosen reviews between folds until ``n`` aspects have moved.

    On every iteration the aspects of each review in ``fold_from`` are summed,
    one review whose sum is positive and does not exceed the remaining budget
    is drawn at random, and all of its rows are re-assigned to ``fold_to``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``fold``, ``review`` and ``total_aspects``.
        It is modified in place.
    n : int
        Number of aspects to transfer.
    fold_from, fold_to
        Source and destination fold labels.
    random_state : int, numpy.random.RandomState or None, optional
        Seed / generator for the random draws; ``None`` keeps the draws
        unseeded.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the updated ``fold`` column.

    Notes
    -----
    The loop stops early when no review in ``fold_from`` fits the remaining
    budget, so fewer than ``n`` aspects may be moved.
    """
    import numpy as np  # local import: numpy is not imported at notebook level

    # One generator for the whole call, so an int seed does not restart the
    # random stream on every iteration.
    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    remaining = n
    while remaining > 0:
        per_review = (
            df[df.fold == fold_from]
            .groupby(['review'])
            .agg({'total_aspects': 'sum'})
            .reset_index()
        )
        candidates = per_review[
            (per_review.total_aspects > 0) & (per_review.total_aspects <= remaining)
        ]
        if candidates.empty:
            # Nothing fits the remaining budget: stop instead of failing.
            break

        # Draw exactly one review. A fractional sample (10%) rounds down to zero
        # rows once fewer than five candidates are left.
        picked = candidates.sample(n=1, random_state=rng).iloc[0]
        df.loc[df.review == picked.review, 'fold'] = fold_to
        remaining -= int(picked.total_aspects)
    return df

In [18]:
# Move reviews worth 49 aspects from fold 10 to fold 5.
data = rebalance(data, n=49, fold_from=10, fold_to=5)

# Aspect totals per fold after the move.
apf_stats = aspect_per_fold(data)
apf_stats

Unnamed: 0,fold,total_aspects
0,1,220.0
1,2,227.0
2,3,194.0
3,4,205.0
4,5,232.0
5,6,202.0
6,7,228.0
7,8,235.0
8,9,298.0
9,10,286.0


In [19]:
# Move reviews worth 38 aspects from fold 9 to fold 3.
data = rebalance(data, n=38, fold_from=9, fold_to=3)

# Aspect totals per fold after the move.
apf_stats = aspect_per_fold(data)
apf_stats

Unnamed: 0,fold,total_aspects
0,1,220.0
1,2,227.0
2,3,232.0
3,4,205.0
4,5,232.0
5,6,202.0
6,7,228.0
7,8,235.0
8,9,260.0
9,10,286.0


In [20]:
# Move reviews worth 28 aspects from fold 9 to fold 6.
data = rebalance(data, n=28, fold_from=9, fold_to=6)

# Aspect totals per fold after the move.
apf_stats = aspect_per_fold(data)
apf_stats

Unnamed: 0,fold,total_aspects
0,1,220.0
1,2,227.0
2,3,232.0
3,4,205.0
4,5,232.0
5,6,230.0
6,7,228.0
7,8,235.0
8,9,232.0
9,10,286.0


In [21]:
# Move reviews worth 27 aspects from fold 10 to fold 4.
data = rebalance(data, n=27, fold_from=10, fold_to=4)

# Aspect totals per fold after the move.
apf_stats = aspect_per_fold(data)
apf_stats

Unnamed: 0,fold,total_aspects
0,1,220.0
1,2,227.0
2,3,232.0
3,4,232.0
4,5,232.0
5,6,230.0
6,7,228.0
7,8,235.0
8,9,232.0
9,10,259.0


In [22]:
# Move reviews worth 12 aspects from fold 10 to fold 1.
data = rebalance(data, n=12, fold_from=10, fold_to=1)

# Aspect totals per fold after the move.
apf_stats = aspect_per_fold(data)
apf_stats

Unnamed: 0,fold,total_aspects
0,1,232.0
1,2,227.0
2,3,232.0
3,4,232.0
4,5,232.0
5,6,230.0
6,7,228.0
7,8,235.0
8,9,232.0
9,10,247.0


In [23]:
# Move reviews worth 5 aspects from fold 10 to fold 2.
data = rebalance(data, n=5, fold_from=10, fold_to=2)

# Aspect totals per fold after the move.
apf_stats = aspect_per_fold(data)
apf_stats

Unnamed: 0,fold,total_aspects
0,1,232.0
1,2,232.0
2,3,232.0
3,4,232.0
4,5,232.0
5,6,230.0
6,7,228.0
7,8,235.0
8,9,232.0
9,10,242.0


In [24]:
# Move reviews worth 2 aspects from fold 10 to fold 6.
data = rebalance(data, n=2, fold_from=10, fold_to=6)

# Aspect totals per fold after the move.
apf_stats = aspect_per_fold(data)
apf_stats

Unnamed: 0,fold,total_aspects
0,1,232.0
1,2,232.0
2,3,232.0
3,4,232.0
4,5,232.0
5,6,232.0
6,7,228.0
7,8,235.0
8,9,232.0
9,10,240.0


In [26]:
# Move reviews worth 4 aspects from fold 10 to fold 7.
# NOTE(review): the recorded output shows 7 aspects moved (fold 7: 228 -> 235),
# not 4 — possibly the cell (or a since-deleted one) ran more than once; confirm.
data = rebalance(data, n=4, fold_from=10, fold_to=7)

# Aspect totals per fold after the move.
apf_stats = aspect_per_fold(data)
apf_stats

Unnamed: 0,fold,total_aspects
0,1,232.0
1,2,232.0
2,3,232.0
3,4,232.0
4,5,232.0
5,6,232.0
6,7,235.0
7,8,235.0
8,9,232.0
9,10,233.0


In [27]:
# Move a single-aspect review from fold 7 to fold 1.
data = rebalance(data, n=1, fold_from=7, fold_to=1)

# Aspect totals per fold after the move.
apf_stats = aspect_per_fold(data)
apf_stats

Unnamed: 0,fold,total_aspects
0,1,233.0
1,2,232.0
2,3,232.0
3,4,232.0
4,5,232.0
5,6,232.0
6,7,234.0
7,8,235.0
8,9,232.0
9,10,233.0


In [28]:
# Move a single-aspect review from fold 7 to fold 2.
data = rebalance(data, n=1, fold_from=7, fold_to=2)

# Aspect totals per fold after the move.
apf_stats = aspect_per_fold(data)
apf_stats

Unnamed: 0,fold,total_aspects
0,1,233.0
1,2,233.0
2,3,232.0
3,4,232.0
4,5,232.0
5,6,232.0
6,7,233.0
7,8,235.0
8,9,232.0
9,10,233.0


In [29]:
# Move a single-aspect review from fold 8 to fold 3.
data = rebalance(data, n=1, fold_from=8, fold_to=3)

# Aspect totals per fold after the move.
apf_stats = aspect_per_fold(data)
apf_stats

Unnamed: 0,fold,total_aspects
0,1,233.0
1,2,233.0
2,3,233.0
3,4,232.0
4,5,232.0
5,6,232.0
6,7,233.0
7,8,234.0
8,9,232.0
9,10,233.0


In [30]:
# Move a single-aspect review from fold 8 to fold 4.
data = rebalance(data, n=1, fold_from=8, fold_to=4)

# Aspect totals per fold after the move.
apf_stats = aspect_per_fold(data)
apf_stats

Unnamed: 0,fold,total_aspects
0,1,233.0
1,2,233.0
2,3,233.0
3,4,233.0
4,5,232.0
5,6,232.0
6,7,233.0
7,8,233.0
8,9,232.0
9,10,233.0


In [34]:
# Number of reviews per fold.
def reviews_per_fold(df):
    """Count the rows of ``df`` in each fold.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``fold`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``fold`` and ``reviews`` (row count of that fold), sorted by
        ``fold``. The index keeps the positions from the count-descending
        order produced by ``value_counts``.
    """
    # rename_axis + reset_index(name=...) names both columns explicitly; relying
    # on the default 'index' / 'fold' labels breaks on pandas >= 2.0, where
    # value_counts().reset_index() yields ['fold', 'count'] instead.
    return (
        df.fold.value_counts()
        .rename_axis('fold')
        .reset_index(name='reviews')
        .sort_values('fold')
    )
# Review counts per fold for the current fold assignment.
rpf_stats = reviews_per_fold(data)
rpf_stats

Unnamed: 0,fold,reviews
4,1,115
5,2,112
0,3,122
2,4,121
1,5,122
3,6,118
6,7,109
7,8,107
8,9,89
9,10,76


In [48]:
def rebalance_reviews(df, z, n, fold_from, fold_to, random_state=None):
    """Move up to ``n`` random reviews with exactly ``z`` aspects to another fold.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``fold``, ``review`` and ``total_aspects``.
        It is modified in place.
    z : int
        Only rows whose ``total_aspects`` equals ``z`` are candidates.
    n : int
        Maximum number of candidate rows to move; fewer are moved when
        ``fold_from`` holds fewer candidates.
    fold_from, fold_to
        Source and destination fold labels.
    random_state : int, numpy.random.RandomState or None, optional
        Seed for the shuffle, so the selection can be reproduced. ``None``
        keeps the shuffle unseeded.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the updated ``fold`` column.
    """
    candidates = df[(df.total_aspects == z) & (df.fold == fold_from)]
    # Shuffle all candidates, then keep the first n of them.
    chosen = candidates.sample(frac=1., random_state=random_state).head(n)
    # Re-assign every row that shares a review id with a chosen row.
    df.loc[df.review.isin(chosen.review.values), 'fold'] = fold_to
    return df

In [38]:
# Even out fold sizes: move up to 5 aspect-free reviews from fold 1 to fold 10.
data = rebalance_reviews(data, z=0, n=5, fold_from=1, fold_to=10)
rpf_stats = reviews_per_fold(data)
rpf_stats

Unnamed: 0,fold,reviews
5,1,110
4,2,112
0,3,122
2,4,121
1,5,122
3,6,118
6,7,109
7,8,107
8,9,89
9,10,81


In [39]:
# Even out fold sizes: move up to 3 aspect-free reviews from fold 2 to fold 10.
data = rebalance_reviews(data, z=0, n=3, fold_from=2, fold_to=10)
rpf_stats = reviews_per_fold(data)
rpf_stats

Unnamed: 0,fold,reviews
4,1,110
5,2,109
0,3,122
2,4,121
1,5,122
3,6,118
6,7,109
7,8,107
8,9,89
9,10,84


In [40]:
# Even out fold sizes: move up to 13 aspect-free reviews from fold 3 to fold 9.
# NOTE(review): the recorded output shows only 10 reviews moved — presumably
# fold 3 held just 10 aspect-free reviews; confirm.
data = rebalance_reviews(data, z=0, n=13, fold_from=3, fold_to=9)
rpf_stats = reviews_per_fold(data)
rpf_stats

Unnamed: 0,fold,reviews
4,1,110
5,2,109
3,3,112
1,4,121
0,5,122
2,6,118
6,7,109
7,8,107
8,9,99
9,10,84


In [42]:
# Even out fold sizes: move up to 10 aspect-free reviews from fold 4 to fold 9.
data = rebalance_reviews(data, z=0, n=10, fold_from=4, fold_to=9)
rpf_stats = reviews_per_fold(data)
rpf_stats

Unnamed: 0,fold,reviews
4,1,110
5,2,109
2,3,112
3,4,111
0,5,122
1,6,118
7,7,109
8,8,107
6,9,109
9,10,84


In [45]:
# Even out fold sizes: move up to 13 aspect-free reviews from fold 5 to fold 10.
# NOTE(review): the recorded output shows only 10 reviews moved — presumably
# fold 5 held just 10 aspect-free reviews; confirm.
data = rebalance_reviews(data, z=0, n=13, fold_from=5, fold_to=10)
rpf_stats = reviews_per_fold(data)
rpf_stats

Unnamed: 0,fold,reviews
4,1,110
5,2,109
1,3,112
3,4,111
2,5,112
0,6,118
7,7,109
8,8,107
6,9,109
9,10,94


In [47]:
# Even out fold sizes: move up to 9 aspect-free reviews from fold 6 to fold 10.
data = rebalance_reviews(data, z=0, n=9, fold_from=6, fold_to=10)
rpf_stats = reviews_per_fold(data)
rpf_stats

Unnamed: 0,fold,reviews
3,1,110
4,2,109
0,3,112
2,4,111
1,5,112
6,6,109
7,7,109
8,8,107
5,9,109
9,10,103


In [49]:
# Even out fold sizes: move up to 3 single-aspect reviews from fold 3 to fold 10.
data = rebalance_reviews(data, z=1, n=3, fold_from=3, fold_to=10)
rpf_stats = reviews_per_fold(data)
rpf_stats

Unnamed: 0,fold,reviews
2,1,110
3,2,109
4,3,109
1,4,111
0,5,112
6,6,109
7,7,109
8,8,107
5,9,109
9,10,106


In [50]:
# Even out fold sizes: move up to 2 single-aspect reviews from fold 4 to fold 10.
data = rebalance_reviews(data, z=1, n=2, fold_from=4, fold_to=10)
rpf_stats = reviews_per_fold(data)
rpf_stats

Unnamed: 0,fold,reviews
1,1,110
2,2,109
3,3,109
5,4,109
0,5,112
6,6,109
7,7,109
9,8,107
4,9,109
8,10,108


In [51]:
# Even out fold sizes: move up to 2 single-aspect reviews from fold 5 to fold 8.
data = rebalance_reviews(data, z=1, n=2, fold_from=5, fold_to=8)
rpf_stats = reviews_per_fold(data)
rpf_stats

Unnamed: 0,fold,reviews
0,1,110
2,2,109
3,3,109
5,4,109
1,5,110
7,6,109
8,7,109
6,8,109
4,9,109
9,10,108


In [52]:
# Even out fold sizes: move 1 single-aspect review from fold 5 to fold 10.
data = rebalance_reviews(data, z=1, n=1, fold_from=5, fold_to=10)
rpf_stats = reviews_per_fold(data)
rpf_stats

Unnamed: 0,fold,reviews
0,1,110
2,2,109
3,3,109
5,4,109
6,5,109
8,6,109
9,7,109
7,8,109
4,9,109
1,10,109


In [54]:
# Re-check the aspect totals per fold after moving reviews between folds.
apf_stats = aspect_per_fold(data)
apf_stats

Unnamed: 0,fold,total_aspects
0,1,233.0
1,2,233.0
2,3,230.0
3,4,231.0
4,5,229.0
5,6,232.0
6,7,233.0
7,8,235.0
8,9,232.0
9,10,239.0


In [55]:
# Fine-tune the aspect totals: move 1 review with exactly 4 aspects from
# fold 10 to fold 5, then show the resulting review counts.
data = rebalance_reviews(data, z=4, n=1, fold_from=10, fold_to=5)
rpf_stats = reviews_per_fold(data)
rpf_stats

Unnamed: 0,fold,reviews
0,1,110
2,2,109
3,3,109
5,4,109
1,5,110
7,6,109
8,7,109
6,8,109
4,9,109
9,10,108


In [56]:
# Final check of the aspect totals per fold.
apf_stats = aspect_per_fold(data)
apf_stats

Unnamed: 0,fold,total_aspects
0,1,233.0
1,2,233.0
2,3,230.0
3,4,231.0
4,5,233.0
5,6,232.0
6,7,233.0
7,8,235.0
8,9,232.0
9,10,235.0


In [57]:
# Persist the rebalanced fold assignment.
# NOTE(review): this overwrites the same file that is read back earlier in the
# notebook, so a Restart & Run All starts from already rebalanced data — consider
# a separate output path.
data.to_csv('../datasets/processed/tv_stratified.csv', index=False)