In [1]:
import pandas as pd

from src import utils
from src import pre_processing

In [2]:
# Read the TV dataset and pre-process it (JSON to DataFrame).
data = pre_processing.pre_processing_tv_dataset('../datasets/tv/data.json')
# Display only the first row as a quick look at the resulting columns.
data.head(1)

Unnamed: 0,tokens,aspect_tags
0,"[Excelente, smart, tv, ., E, foi, entregue, ve...","[0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [3]:
# Count the number of aspects per record; the displayed result gains a
# `total_aspects` column.
data = utils.aspect_counter(data)
data.head(1)

Unnamed: 0,tokens,aspect_tags,total_aspects
0,"[Excelente, smart, tv, ., E, foi, entregue, ve...","[0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1


In [4]:
# Flag whether each record has an aspect or not; the displayed result gains a
# `has_aspect` column.
data = utils.has_aspect(data)
data.head(1)

Unnamed: 0,tokens,aspect_tags,total_aspects,has_aspect
0,"[Excelente, smart, tv, ., E, foi, entregue, ve...","[0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1,1


In [5]:
# Review id: the row index plus one. 'review' is the column name that is later
# passed as `groups` when the folds are generated.
data['review'] = data.index + 1

In [6]:
# Count the reviews with and without aspects in the original dataset.
# The index and the counts are named explicitly rather than by renaming the
# columns that reset_index() produces: those default names changed in
# pandas 2.0, so a rename keyed on 'index'/'has_aspect' only works on older
# versions.
has_aspect = (
    data.has_aspect.value_counts()
    .rename_axis('has_aspect')
    .reset_index(name='original_dataset')
)
has_aspect

Unnamed: 0,has_aspect,original_dataset
0,1,989
1,0,102


In [7]:
# Column names used for the stratified split.
y_col = 'has_aspect'  # passed as the target column
groups = 'review'  # passed as the grouping column
X_cols = ['tokens', 'aspect_tags', 'total_aspects']  # passed as the feature columns

In [8]:
# Generate the folds (k=10) from the column roles defined above.
# NOTE(review): no seed is passed here; whether the split is reproducible
# depends on utils.stratified_k_fold — confirm.
data = utils.stratified_k_fold(data, X_cols, y_col, groups, k=10)

In [9]:
# Saving the stratified folds (currently disabled).
# NOTE(review): presumably commented out so that re-runs do not overwrite the
# persisted fold assignment — confirm.
# data.to_csv('../datasets/processed/tv_stratified.csv', index=False)

In [10]:
# Load the persisted fold assignment.
# The save step is commented out, so on a fresh checkout the CSV may not exist
# and a plain read would abort a Restart & Run All. In that case, persist the
# folds generated in this session and read them back, so `data` always goes
# through the same CSV round trip regardless of which path was taken.
stratified_path = '../datasets/processed/tv_stratified.csv'
try:
    data = pd.read_csv(stratified_path)
except FileNotFoundError:
    data.to_csv(stratified_path, index=False)
    data = pd.read_csv(stratified_path)

In [11]:
# Per-fold statistics: for every fold, count the rows with and without aspects
# and attach the counts to `has_aspect` as a new `fold_<n>` column.
for fold in data.fold.unique():

    # rows assigned to the current fold
    curr = data[data.fold == fold]

    # Name the index and the counts explicitly instead of renaming the
    # columns produced by reset_index(): those default names changed in
    # pandas 2.0, which would leave the merge key below misnamed.
    curr_has_aspect = (
        curr.has_aspect.value_counts()
        .rename_axis('has_aspect')
        .reset_index(name=f'fold_{fold}')
    )
    # No `on=` is given, so the merge joins on the columns both frames share.
    has_aspect = has_aspect.merge(curr_has_aspect, how='outer')

In [12]:
# Summarize the per-fold counts of records with and without aspects; the
# displayed result gains a 'total' row and the fold_avg / fold_std columns.
has_aspect = utils.fold_summary(has_aspect)
has_aspect

Unnamed: 0,has_aspect,original_dataset,fold_1,fold_2,fold_3,fold_4,fold_5,fold_6,fold_7,fold_8,fold_9,fold_10,fold_avg,fold_std
0,1,989,99,99,99,99,99,99,99,99,99,98,98.9,0.3
1,0,102,11,10,10,10,10,10,10,10,10,11,10.2,0.4
2,total,1091,110,109,109,109,109,109,109,109,109,109,109.1,0.3


In [13]:
# Number of reviews per fold, ordered by fold id.
# rename_axis()/reset_index(name=...) pins both column names regardless of the
# pandas version; renaming the reset_index() defaults stops working on
# pandas >= 2.0 and makes the sort on 'fold' fail.
reviews_per_fold = (
    data.fold.value_counts()
    .rename_axis('fold')
    .reset_index(name='reviews')
    .sort_values('fold')
)
reviews_per_fold

Unnamed: 0,fold,reviews
0,1,110
1,2,109
2,3,109
3,4,109
4,5,109
5,6,109
6,7,109
7,8,109
8,9,109
9,10,109


In [14]:
# Number of aspects per fold, followed by two summary rows ('total' and 'std').
aspects_per_fold = utils.summary(data, 'fold', 'total_aspects', 'sum')
aspects_per_fold = aspects_per_fold.sort_values('fold')

# Both statistics are computed before any summary row is appended, so they
# describe the per-fold values only.
per_fold = aspects_per_fold.total_aspects
total = [per_fold.sum()]
std = [per_fold.std()]

# Append each summary row under the label equal to the current row count.
summary_rows = {'total': total, 'std': std}
for name, value in summary_rows.items():
    aspects_per_fold.loc[len(aspects_per_fold)] = [name] + value

aspects_per_fold = aspects_per_fold.round(1)
aspects_per_fold

Unnamed: 0,fold,total_aspects
0,1,220.0
1,2,227.0
2,3,194.0
3,4,205.0
4,5,183.0
5,6,202.0
6,7,228.0
7,8,235.0
8,9,298.0
9,10,335.0
