# Data Distribution

In [1]:
import csv

In [2]:
# Count how many training rows carry each toxicity label, and how many rows
# carry none of them ("normal" comments), then print the distribution.
file_path = "../data/train.csv"

class_counter = {'toxic': 0, 'severe_toxic': 0, 'obscene': 0, 'threat': 0,
                 'insult': 0, 'identity_hate': 0}
normal_comment_counter = 0
global_counter = 0

# `with` guarantees the file is closed even if a row is malformed;
# newline='' lets the csv module handle newlines embedded in quoted fields.
with open(file_path, newline='') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    for line in csv_reader:
        is_toxic = False
        for key in class_counter:
            # Labels are compared as the literal string '1'.
            if line[key] == '1':
                class_counter[key] += 1
                is_toxic = True
        if not is_toxic:
            normal_comment_counter += 1
        global_counter += 1

# Avoid ZeroDivisionError when the CSV contains only a header (or nothing).
denominator = global_counter or 1

print("total lines:{}, normal lines:{} ({:.2f}%)".format(global_counter, normal_comment_counter, normal_comment_counter / denominator * 100))
for key, value in class_counter.items():
    print("{}: {} ({:.2f}%)".format(key, value, value / denominator * 100))

total lines:159571, normal lines:143346 (89.83%)
toxic: 15294 (9.58%)
severe_toxic: 1595 (1.00%)
obscene: 8449 (5.29%)
threat: 478 (0.30%)
insult: 7877 (4.94%)
identity_hate: 1405 (0.88%)


# Data Upsampling

In [3]:
# Per-class duplication factor: a row is written as many times as the largest
# factor among the labels it carries (at least once).
upsample_classes = {
    'toxic': 1,
    'severe_toxic': 5,
    'obscene': 1,
    'threat': 20,
    'insult': 1,
    'identity_hate': 5
}
output_file_path = "../data/upsample_train.csv"

# newline='' on both files: the csv module then handles line endings itself,
# so newlines inside quoted fields are preserved on read and no extra
# '\r' is emitted on write on platforms that translate '\n' to '\r\n'.
with open(file_path, newline='') as input_csv_file, \
        open(output_file_path, 'w', newline='') as output_csv_file:
    csv_reader = csv.DictReader(input_csv_file)
    csv_writer = csv.DictWriter(output_csv_file, fieldnames=csv_reader.fieldnames)
    csv_writer.writeheader()
    for line in csv_reader:
        # Start from 1 so rows without any upsampled label are kept once.
        max_upsample_time = 1
        for cls, times in upsample_classes.items():
            if line[cls] == '1':
                max_upsample_time = max(max_upsample_time, times)
        for _ in range(max_upsample_time):
            csv_writer.writerow(line)

# Data Generation

In [13]:
from collections import Counter, defaultdict

def create_lm(data):
    """Build a trigram model from tokenized sentences.

    Args:
        data: iterable of token lists.

    Returns:
        A nested defaultdict mapping each two-token prefix ``(w1, w2)`` to a
        dict of ``{w3: probability}``, where the probabilities for one prefix
        are the relative frequencies of the words that followed it.
    """
    trigram_table = defaultdict(lambda: defaultdict(int))

    # Count 3-grams: every window of three consecutive tokens.
    for tokens in data:
        for first, second, third in zip(tokens, tokens[1:], tokens[2:]):
            trigram_table[(first, second)][third] += 1

    # Convert the raw counts into probabilities, prefix by prefix.
    for followers in trigram_table.values():
        norm = float(sum(followers.values()))
        for token in followers:
            followers[token] /= norm

    return trigram_table

In [14]:
def generate_a_sentence(model):
    """Sample one sentence from a trigram model.

    Starting from the prefix ``('<START>', '<START>')``, repeatedly draws the
    next word from the distribution stored for the last two tokens until
    ``'<END>'`` is drawn, the prefix has no known continuation, or the length
    cap is exceeded.

    Args:
        model: mapping of ``(w1, w2)`` -> ``{w3: probability}`` (for example
            the result of ``create_lm``).

    Returns:
        The generated words joined by single spaces, without the start/end
        markers. Returns an empty string if nothing could be generated.
    """
    text = ['<START>', '<START>']
    sentence_finished = False

    while not sentence_finished:
        # .get() instead of indexing: does not insert empty entries into a
        # defaultdict model and does not raise KeyError on a plain dict.
        candidates = model.get(tuple(text[-2:]))
        if not candidates:
            # Dead end (e.g. empty model): without this guard the loop would
            # never append a word, never finish and never hit the length cap.
            break

        # Random threshold (adds diversity to the generated text).
        r = random.random()
        accumulator = .0

        for word, probability in candidates.items():
            accumulator += probability
            # Pick the first word whose cumulative probability reaches r.
            if accumulator >= r:
                if word == '<END>':
                    sentence_finished = True
                else:
                    text.append(word)
                break

        # Length cap; `text` still includes the two start markers here.
        if len(text) > 64:
            break

    sentence = ' '.join([t for t in text[2:] if t])
    return sentence

In [17]:
import random 

# Tokenized comments grouped by label; a comment with several labels is
# added to every matching class.
class_collection = {
    'toxic': [],
    'severe_toxic': [],
    'obscene': [],
    'threat': [],
    'insult': [],
    'identity_hate': []
}

# Number of sentences to generate per class.
# NOTE(review): operator precedence makes this (count * factor) - 1; if the
# intent was count * (factor - 1) extra examples, parenthesize - confirm.
generated_example_size = {cls: class_counter[cls] * upsample_classes[cls] - 1 for cls in class_collection.keys()}

# Maps class name -> trigram model built from that class's comments.
cls_data_generator_dict = {}

# newline='' lets the csv module handle newlines embedded in quoted comments.
with open(file_path, newline='') as input_csv_file:
    csv_reader = csv.DictReader(input_csv_file)

    for line in csv_reader:
        for cls in class_collection:
            if line[cls] == '1':
                # Whitespace tokenization, padded with two start markers
                # (the trigram prefix length) and one end marker.
                class_collection[cls].append(['<START>', '<START>'] + line['comment_text'].strip().split() + ['<END>'])

# Build the models after the input file has been closed; only the in-memory
# token lists are needed from here on.
for cls, data in class_collection.items():
    cls_data_generator_dict[cls] = create_lm(data)


In [22]:
# Template row for the generated files (only id and text; labels come later).
test_example = {
    'id': '',
    'comment_text': '',
}

# For every class, sample the configured number of sentences from that
# class's trigram model and write them to ../data/argumented_<class>.csv.
for cls in class_collection.keys():
    output_file_path = "../data/argumented_{}.csv".format(cls)
    # newline='' so the csv module controls row terminators itself (avoids
    # '\r\r\n' line endings on platforms that translate '\n').
    with open(output_file_path, 'w', newline='') as output_csv_file:
        csv_writer = csv.DictWriter(output_csv_file, fieldnames=['id', 'comment_text'])
        csv_writer.writeheader()
        model = cls_data_generator_dict[cls]
        for i in range(generated_example_size[cls]):
            sentence = generate_a_sentence(model)
            new_example = test_example.copy()
            # Synthetic id: "<class>_<running index>".
            new_example['id'] = '{}_{}'.format(cls, i)
            new_example['comment_text'] = sentence
            csv_writer.writerow(new_example)

# Predict the Other Class Probabilities with the Original Model

# .... 

# Merge Data

In [26]:
# Classes whose generated sentences are merged into the training file.
cls_list = ['severe_toxic', 'threat', 'identity_hate']
output_file_path = "../data/argumented_train.csv"

# Template output row: all labels default to 0.
empty_example = {
    'id': '',
    'comment_text': '',
    'toxic': 0,
    'severe_toxic': 0,
    'obscene': 0,
    'threat': 0,
    'insult': 0,
    'identity_hate': 0
}

# Opened in append mode and no header is written, so rows are added to
# whatever the file already contains (re-running this cell appends again).
with open(output_file_path, 'a', newline='') as output_csv_file:
    # NOTE: `csv_reader` is the reader left over from an earlier cell; only
    # its already-parsed `fieldnames` are used here.
    csv_writer = csv.DictWriter(output_csv_file, fieldnames=csv_reader.fieldnames)
    for cls in cls_list:
        # `with` closes both inputs even if the id check or float() fails.
        with open("../data/argumented_{}.csv".format(cls), newline='') as cls_text_file, \
                open("../data/{}_submission.csv".format(cls), newline='') as cls_prediction_file:
            cls_text_file_csv_reader = csv.DictReader(cls_text_file)
            cls_pred_csv_reader = csv.DictReader(cls_prediction_file)
            # Rows of the two files are paired by position; the ids must agree.
            for text_line, pred_line in zip(cls_text_file_csv_reader, cls_pred_csv_reader):
                assert text_line['id'] == pred_line['id'], \
                    "id mismatch: {!r} != {!r}".format(text_line['id'], pred_line['id'])
                new_example = empty_example.copy()
                new_example['id'] = text_line['id']
                new_example['comment_text'] = text_line['comment_text']
                # Threshold the predicted probabilities at 0.5 for every label.
                for label in class_collection.keys():
                    if float(pred_line[label]) > 0.5:
                        new_example[label] = 1
                    else:
                        new_example[label] = 0
                # The class the sentence was generated for is always set.
                new_example[cls] = 1
                csv_writer.writerow(new_example)