In [93]:
import csv
import pandas as pd
import numpy as np

In [94]:
import re
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer
tok = WordPunctTokenizer()


pat1 = r'@[A-Za-z0-9_]+'
pat2 = r'https?://[^ ]+'
combined_pat = r'|'.join((pat1, pat2))
www_pat = r'www.[^ ]+'
negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}
neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')



def tweet_cleaner(text):
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    try:
        bom_removed = souped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        bom_removed = souped
    stripped = re.sub(combined_pat, '', bom_removed)
    stripped = re.sub(www_pat, '', stripped)
    lower_case = stripped.lower()
    neg_handled = neg_pattern.sub(lambda x: negations_dic[x.group()], lower_case)
    letters_only = re.sub("[^a-zA-Z]", " ", neg_handled)
    # During the letters_only process two lines above, it has created unnecessay white spaces,
    # I will tokenize and join together to remove unneccessary white spaces
    words = [x for x  in tok.tokenize(letters_only) if len(x) > 1]
    return (" ".join(words)).strip()


def clean_str(string, test=True):
    """
    Tokenization/string cleaning for datasets.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    words = string.replace('\"','').split(' ')
    for idx, word in enumerate(words):
        if word == '@USER' or word == 'URL':
            continue
        elif test and len(word) > 0 and word[0] == '@':
            words[idx] = '@USER'
            continue

        word = re.sub(r'^https?:\/\/.*', 'URL', word)
        word = re.sub(r"[^A-Za-z0-9()@,!?\'\`]", " ", word)
        word = re.sub(r"\'s", " \'s", word)
        word = re.sub(r"\'ve", " have", word)
        word = re.sub(r"n\'t", " not", word)
        word = re.sub(r"\'re", " are", word)
        word = re.sub(r"\'d", " \'d", word)
        word = re.sub(r"\'ll", " will", word)
        word = re.sub(r",", " , ", word)
        word = re.sub(r"!", " ! ", word)
        word = re.sub(r"\(", " \( ", word)
        word = re.sub(r"\)", " \) ", word)
        word = re.sub(r"\?", " \? ", word)
        word = re.sub(r"\s{2,}", " ", word)
        words[idx] = word.strip().lower()
    return ' '.join(words)



In [95]:
def convert_file_to_fast_format(input_name, output_name, header, tweet_index=1, task_index=2):
    labels = []
    with open(input_name) as csv_file:
        with open(output_name, 'w', newline='') as output_file:
            label_name = '__label__'
            csv_reader = csv.reader(csv_file, delimiter='\t')

            line_count = 0
            rows = []
            for row in csv_reader:
                if line_count == 0 and header:
                    print(f'Column names are {", ".join(row)}')
                    line_count += 1
                else:
                    tweet = tweet_cleaner(row[tweet_index].replace('"',''))
                    label = row[task_index].replace('"','')
                    if label != 'NULL':
                        labels.append(label)
                        to_save = f'{label_name}{label} {tweet}'
                        rows.append([to_save])


            fast_writer = csv.writer(output_file, delimiter=',')
            fast_writer.writerows(rows)
            return labels


In [96]:
import os
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import f1_score
from tqdm import tqdm
from shutil import copyfile


np.random.seed(seed=42)
def find_best_fast_params(labels, train_name, predicted_name, test_name, first_label='NOT', second_label='OFF', third_label=None, fourth_label=None, average='macro',):
    ground_truth = pd.DataFrame(np.array(labels), columns=['label'])
    mapping = {first_label: 0, second_label: 1}
    if third_label:
        mapping[third_label] = 2
    if fourth_label:
        mapping[fourth_label] = 3
    ground_truth = ground_truth.label.map(mapping)

    best_score = 0
    best_params = None
    param_grid = [{'epoch': [5, 10, 20, 30, 50], 'wordNgrams': [1,2,3,4,5], 'lr': [0.01, 0.05, 0.1, 0.5, 1], 'minCount': range(2, 10, 2),  'ws':  range(2, 10, 2), 'dim': [50,100]} ]
    params_list = ParameterGrid(param_grid)
    selected = np.random.choice(params_list, size = 300, replace=False)
    
    for param in tqdm(selected):
        epoch = param['epoch']
        wordNgrams = param['wordNgrams']
        lr = param['lr']
        minCount = param['minCount']
        ws = param['ws']
        dim = param['dim']
        os.system(f'./fastNext/fasttext supervised -input {train_name} -output not_trained -lr {lr} -ws {ws} -minCount {minCount} -epoch {epoch} -wordNgrams {wordNgrams} -dim {dim}')
        os.system(f'./fastNext/fasttext predict not_trained.bin {test_name} > {predicted_name}')
        predicted = pd.read_csv(predicted_name, header=None, names=['label'])
        predicted['label'] = predicted.label.str.replace('__label__','')
        predicted = predicted.label.map(mapping)

        score = f1_score(predicted, ground_truth, average=average)
        if score > best_score:
            best_score = score
            best_params = param
            copyfile('not_trained.bin', f'best_trained-{first_label}-{second_label}.bin')
            
    return (best_score, best_params)



In [5]:
convert_file_to_fast_format('./data/start-kit/training-v1/offenseval-training-v1.tsv', './data/train-fast-a.txt', True,1,2)
labels_1 = convert_file_to_fast_format('./data/start-kit/trial-data/offenseval-trial.txt', './data/test-fast-a.txt', False, 0, 1)

Column names are id, tweet, subtask_a, subtask_b, subtask_c


In [6]:
convert_file_to_fast_format('./data/start-kit/training-v1/offenseval-training-v1.tsv', './data/train-fast-b.txt', True,1,3)
labels_2 = convert_file_to_fast_format('./data/start-kit/trial-data/offenseval-trial.txt', './data/test-fast-b.txt', False, 0, 2)

Column names are id, tweet, subtask_a, subtask_b, subtask_c


In [7]:
convert_file_to_fast_format('./data/start-kit/training-v1/offenseval-training-v1.tsv', './data/train-fast-c.txt', True,1,4)
labels_3 = convert_file_to_fast_format('./data/start-kit/trial-data/offenseval-trial.txt', './data/test-fast-c.txt', False, 0, 3)

Column names are id, tweet, subtask_a, subtask_b, subtask_c


In [8]:
(best_score_a, best_params_a) = find_best_fast_params(labels_1, train_name='./data/train-fast-a.txt',test_name='./data/test-fast-a.txt', predicted_name='./data/predicted-fast-a.txt')

  'recall', 'true', average, warn_for)
100%|██████████| 300/300 [16:38<00:00,  2.73s/it]


In [9]:
(best_score_a, best_params_a)

(0.7849462365591398,
 {'dim': 100, 'epoch': 20, 'lr': 0.1, 'minCount': 4, 'wordNgrams': 3, 'ws': 4})

In [10]:
(best_score_b, best_params_b) = find_best_fast_params(labels_2, train_name='./data/train-fast-b.txt',test_name='./data/test-fast-b.txt', predicted_name='./data/predicted-fast-b.txt',first_label='UNT', second_label='TIN')

100%|██████████| 300/300 [14:32<00:00,  2.88s/it]


In [11]:
(best_score_b, best_params_b)

(0.6450354609929078,
 {'dim': 50, 'epoch': 10, 'lr': 1, 'minCount': 4, 'wordNgrams': 1, 'ws': 8})

In [12]:
(best_score_c, best_params_c) = find_best_fast_params(labels_3, train_name='./data/train-fast-c.txt',test_name='./data/test-fast-c.txt', predicted_name='./data/predicted-fast-c.txt',first_label='IND', second_label='OTH', third_label='GRP')

100%|██████████| 300/300 [14:07<00:00,  2.83s/it]


In [13]:
(best_score_c, best_params_c)

(0.42745098039215684,
 {'dim': 50, 'epoch': 20, 'lr': 0.01, 'minCount': 2, 'wordNgrams': 1, 'ws': 2})

In [147]:
def convert_file_to_fast_format(input_name, output_name):
    labels = []
    with open(input_name) as csv_file:
        with open(output_name, 'w', newline='') as output_file:
            label_name = '__label__'
            csv_reader = csv.reader(csv_file, delimiter='\t')
            rows = []
            for row in csv_reader:
                    tweet = tweet_cleaner(row[1].replace('"',''))
                    rows.append([tweet])


            fast_writer = csv.writer(output_file, delimiter=',')
            fast_writer.writerows(rows)
return convert_file_to_fast_format('./data/C/test_set_taskc.tsv', './formatted-c')

In [148]:
def add_ids(labels, ids):
    with open(labels, 'r') as textfile1, open(ids, 'r') as textfile2:
        rows = []
        no = 0 
#         print(textfile1.read())
        for x, y in zip(textfile1.readlines(), textfile2.readlines()):
            if no != 0:
                x = x.replace('__label__', '').strip()
                y = y.split('\t')[0].strip()
#                 combined = ', '.join([x,y]).strip('\n')
                rows.append([y, x])
            no = no + 1
    with open(labels, 'w') as output_file:
        fast_writer = csv.writer(output_file, delimiter=',')
        fast_writer.writerows(rows)
#         return rows
rows = add_ids('./tested-c.txt', './data/C/test_set_taskc.tsv')


In [90]:
rows[0]

'15923 NOT'