In [12]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
from utils import *
import nlpcda
import os

from typing import List, Dict

In [24]:
class DataAugmentation:
    def __init__(self, configs:Dict[str, dict]) -> None:
        self.entity_swap, self.random_del = None, None

        if 'random_entity' in configs.keys():
            self.entity_swap_p = configs['random_entity'].pop('prop')
            self.entity_swap = nlpcda.Similarword(**(configs['random_entity']))
        if 'random_delete_char' in configs.keys():
            self.random_del_p = configs['random_delete_char'].pop('prop')
            self.random_del = nlpcda.RandomDeleteChar(**(configs['random_delete_char']))
    
    def aug(self, df_full:pd.DataFrame, permute=True, seed=1117) -> pd.DataFrame:
        df = df_full[df_full.label == 1]
        df_neg = df_full[df_full.label == 0]

        L = len(df)
        if self.entity_swap:
            augmented_df = self.aug_single(df, L, self.entity_swap_p, self.entity_swap)
        if self.random_del:
            augmented_df = self.aug_single(augmented_df, L, self.random_del_p, self.random_del)

        augmented_df = pd.concat((df_neg, augmented_df))
        if permute:
            augmented_df = augmented_df.sample(frac=1, random_state=seed).reset_index(drop=True)
        return augmented_df

    def aug_single(self, df:pd.DataFrame, L:int, p:float, tool) -> pd.DataFrame:
        """input L: original df length. Avoid augmentation on newly constructed data. """
        idx = np.random.choice(range(L), size=int(L*p))
        slice_df = df.iloc[idx]
        label, text = slice_df[['label', 'text']].values.T
        transformed_slice_df = self.get_transformed_df(slice_df, tool)
        augmented_df = pd.concat((df, transformed_slice_df))
        return augmented_df

    def text_seq_transform(self, tool, texts:List[str]) -> List[str]:
        out = []
        for text in texts:
            transformed_text = tool.replace(text)[-1]
            out.append(transformed_text)
        return np.array(out)
    
    def get_transformed_df(self, slice_df:pd.DataFrame, tool) -> pd.DataFrame:
        label, text = slice_df[['label', 'text']].values.T
        transformed_text = self.text_seq_transform(self.entity_swap, text)
        transformed_slice_df = slice_df.drop(columns=['text']).copy(deep=True)
        transformed_slice_df['text'] = transformed_text
        return transformed_slice_df
        

In [25]:
np.random.seed(1117)
rnd_idx = np.random.choice(range(1, 43001), size=1000)

train_df = pd.read_csv('../data/train.csv', sep='\t')    #.iloc[rnd_idx]
test_df = pd.read_csv('../data/test.csv', sep='\t')

train_df.drop(columns=['id'], inplace=True)

In [26]:
entities_file = os.path.join("D:\Apps\Anaconda3\envs\general-torch\Lib\site-packages", "nlpcda\data\entities.txt")

da_configs = {
    'random_entity':{
        'base_file':entities_file, 
        'create_num':2, 
        'change_rate':0.5, 
        'seed':1024, 
        'prop':0.3, 
    }, 
    'random_delete_char':{
        'create_num':2, 
        'change_rate':0.05, 
        'seed':1024, 
        'prop':0.1, 
    }
}

da = DataAugmentation(da_configs)
train_df_aug = da.aug(train_df)

ntf()

load :D:\Apps\Anaconda3\envs\general-torch\Lib\site-packages\nlpcda\data\entities.txt done


In [30]:
len(train_df_aug[train_df_aug.label == 0]) / len(train_df_aug[train_df_aug.label == 1])

0.24324095620901207

In [13]:
train_df

Unnamed: 0,id,label,text
0,1,1,通过大力发展社区教育，使我省全民终身学习的教育体系已深入人心。
1,2,1,再次投入巨资的英超劲旅曼城队能否在2010-2011年度的英超联赛中夺得英超冠军，曼联、切尔...
2,3,1,广西居民纸质图书的阅读率偏低，手机阅读将成为了广西居民极倾向的阅读方式。
3,4,1,文字书写时代即将结束，预示着人与字之间最亲密的一种关系已经终结。与此同时，屏幕文化造就了另一...
4,5,1,安徽合力公司2006年叉车销售强劲，销售收入涨幅很有可能将超过40%以上。公司预计2006年...
...,...,...,...
45258,26096-1,1,他回忆说，小时候在北京，那个时候其实沙尘也很大，戴着口罩骑车去上学。
45259,26096-2,0,到学校之后，口罩上都是厚厚的黄沙子。
45260,26096-3,0,到了冬天，加上煤烟气，情况就更糟了，那个时候没有PM2．5，但是有PM250。
45261,41597-0,0,2005年11月27日，龙煤矿业集团有限责任公司东风煤矿发生一起特大煤尘爆炸事故，死亡171...


In [16]:
entities_file = os.path.join("D:\Apps\Anaconda3\envs\general-torch\Lib\site-packages", "nlpcda\data\entities.txt")

smw = nlpcda.Similarword(base_file=entities_file, create_num=2, change_rate=0.3, seed=1024)
dw = nlpcda.RandomDeleteChar(create_num=2, change_rate=0.05, seed=1024)

load :D:\Apps\Anaconda3\envs\general-torch\Lib\site-packages\nlpcda\data\entities.txt done


In [26]:
i = 93

text = train_df.iloc[i:i+5].text
print(f'label = {train_df.iloc[i].label}')
print(text)
print(smw.replace(text)[-1])
print(dw.replace(text)[-1])

label = 0
93    航天科学家曾希望这种撞击会激起六英里高的月球尘埃和碎片云，通过对它们扫描可以找到水冰的证据，...
94    建设部要求，各地要把风景名胜资源保护工作放在极其重要的位置，采取切实有效的措施，保护风景名胜...
95         萧山区青年歌手大赛成为全区百姓最关注的本土声乐大赛，每届都吸引着数百位歌唱爱好者的参与。
96         中国古代小说有其自身的特点，重故事，重描写，与西方小说和现代某些中国小说重心理刻画不同。
97    《不动产登记暂行条例》按照物权法的有关规定，把登记资料查询人限定在权利人和利害关系人，有关国...
Name: text, dtype: object


AttributeError: 'Series' object has no attribute 'strip'

In [None]:
def postprocess_ds(outputs:List[List[dict]]):
    entity_vocab = {}
    for output in outputs:
        if output:
            sentence_vocab = postprocess_sentence(output)
            for k, v in sentence_vocab.items():
                if k in entity_vocab.keys():
                    entity_vocab[k].extend(v)
                else:
                    entity_vocab[k] = v
    return entity_vocab

def postprocess_sentence(ner_outputs:List[dict]):
    entity_vocab = {}
    if ner_outputs == []:
        return

    current = ''
    for out in ner_outputs:
        if out['entity'][0] == 'B':
            if current:
                if category in entity_vocab.keys() and current not in entity_vocab[category]:
                    entity_vocab[category].append(current)
                else:
                    entity_vocab[category] = [current]
                current = ''
            category = out['entity'][2:]
            current += out['word']
        if out['entity'][0] == 'I':
            if not current:
                continue
            current += out['word']
    if current:
        if category in entity_vocab.keys() and current not in entity_vocab[category]:
            entity_vocab[category].append(current)
        else:
            entity_vocab[category] = [current]
    return entity_vocab

In [9]:
postprocess_ds(ner_outputs)['scene']

NameError: name 'postprocess_ds' is not defined