In [1]:
import pandas as pd

from src import utils
from src import pre_processing

In [2]:
# lendo e pré-processando os dados (json to dataframe)
data = pre_processing.pre_processing_tv_dataset('datasets/tv/data.json')
data.head(1)

Unnamed: 0,tokens,aspect_tags
0,"[Excelente, smart, tv, ., E, foi, entregue, ve...","[0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [3]:
# contabilizando o número de aspectos por registro
data = utils.aspect_counter(data)
data.head(1)

Unnamed: 0,tokens,aspect_tags,num_aspects
0,"[Excelente, smart, tv, ., E, foi, entregue, ve...","[0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1


In [4]:
# definindo se um registro possui aspecto ou não
data = utils.has_aspect(data)
data.head(1)

Unnamed: 0,tokens,aspect_tags,num_aspects,has_aspect
0,"[Excelente, smart, tv, ., E, foi, entregue, ve...","[0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1,1


In [5]:
# contabilizando a quantidade de registros por número de aspectos na frase do dataset original
num_aspects_summary = data.num_aspects.value_counts(normalize=True).reset_index().rename(
    columns={'num_aspects': 'original_dataset', 'index': 'num_aspects'})
num_aspects_summary

Unnamed: 0,num_aspects,original_dataset
0,2,0.310724
1,1,0.256645
2,3,0.190651
3,4,0.097159
4,0,0.093492
5,5,0.028414
6,6,0.013749
7,7,0.006416
8,8,0.000917
9,9,0.000917


In [6]:
# contabilizando a quantidade de registros com e sem aspectos do dataset original
aspects_summary = data.has_aspect.value_counts(normalize=True).reset_index().rename(
    columns={'index': 'has_aspect', 'has_aspect': 'original_dataset'})
aspects_summary

Unnamed: 0,has_aspect,original_dataset
0,1,0.906508
1,0,0.093492


In [7]:
# nome das colunas para estraficacão
y_col = 'has_aspect'
X_cols = ['tokens', 'aspect_tags', 'num_aspects']

In [8]:
# gerando os folds
data = utils.stratified_k_fold(data, X_cols, y_col, k=10)

In [9]:
# número de registros por fold
data.fold.value_counts().reset_index()

Unnamed: 0,index,fold
0,1,110
1,2,109
2,3,109
3,4,109
4,5,109
5,6,109
6,7,109
7,8,109
8,9,109
9,10,109


In [10]:
# lendo a base salva
data = pd.read_csv('datasets/tv_stratified.csv')

In [11]:
# gerando estatísticas dos folds
for fold in data.fold.unique():

    # filtrando
    curr = data[data.fold == fold]

    # contabilizando a quantidade de registros por número de aspectos na frase
    curr_num_aspects_summary = curr.num_aspects.value_counts(normalize=True).reset_index().rename(
        columns={'num_aspects': f'fold_{fold}', 'index': 'num_aspects'})
    num_aspects_summary = num_aspects_summary.merge(curr_num_aspects_summary, how='outer')

    # contabilizando a quantidade de registros com e sem aspectos
    curr_aspects_summary = curr.has_aspect.value_counts(normalize=True).reset_index().rename(
        columns={'has_aspect': f'fold_{fold}', 'index': 'has_aspect'})
    aspects_summary = aspects_summary.merge(curr_aspects_summary, how='outer')

In [12]:
# quantidade de registros por número de aspectos na frase
for col in num_aspects_summary.iloc[:, 1:].columns:
    num_aspects_summary[col] = num_aspects_summary[col] * 100
num_aspects_summary['fold_avg'] = num_aspects_summary.iloc[:, 2:].mean(axis=1)
num_aspects_summary['fold_std'] = num_aspects_summary.iloc[:, 2:].std(axis=1)
num_aspects_summary = round(num_aspects_summary, 2)
num_aspects_summary

Unnamed: 0,num_aspects,original_dataset,fold_1,fold_2,fold_3,fold_4,fold_5,fold_6,fold_7,fold_8,fold_9,fold_10,fold_avg,fold_std
0,2,31.07,39.09,32.11,23.85,36.7,35.78,35.78,35.78,32.11,17.43,22.02,31.07,6.96
1,1,25.66,21.82,22.94,39.45,31.19,37.61,31.19,23.85,22.02,17.43,9.17,25.67,8.79
2,3,19.07,19.09,24.77,19.27,11.01,11.93,15.6,18.35,22.94,25.69,22.02,19.07,4.78
3,4,9.72,7.27,9.17,8.26,10.09,4.59,5.5,6.42,10.09,17.43,18.35,9.72,4.45
4,0,9.35,10.0,9.17,9.17,9.17,9.17,9.17,9.17,9.17,9.17,10.09,9.35,0.35
5,5,2.84,2.73,0.92,,0.92,0.92,2.75,5.5,2.75,5.5,6.42,3.16,2.03
6,6,1.37,,0.92,,0.92,,,0.92,,5.5,5.5,2.75,2.25
7,7,0.64,,,,,,,,0.92,0.92,4.59,2.14,1.73
8,8,0.09,,,,,,,,,0.92,,0.92,0.0
9,9,0.09,,,,,,,,,,0.92,0.92,0.0


In [13]:
# tem ou não aspecto
for col in aspects_summary.iloc[:, 1:].columns:
    aspects_summary[col] = aspects_summary[col] * 100
aspects_summary['fold_avg'] = aspects_summary.iloc[:, 2:].mean(axis=1)
aspects_summary['fold_std'] = aspects_summary.iloc[:, 2:].std(axis=1)
aspects_summary = round(aspects_summary, 2)
aspects_summary

Unnamed: 0,has_aspect,original_dataset,fold_1,fold_2,fold_3,fold_4,fold_5,fold_6,fold_7,fold_8,fold_9,fold_10,fold_avg,fold_std
0,1,90.65,90.0,90.83,90.83,90.83,90.83,90.83,90.83,90.83,90.83,89.91,90.65,0.35
1,0,9.35,10.0,9.17,9.17,9.17,9.17,9.17,9.17,9.17,9.17,10.09,9.35,0.35
