# OLID Clean Code


In [1]:
# Imports

import numpy as np
import pandas as pd
import csv
from tqdm import tqdm
from sklearn.utils import shuffle
import argparse

import random
from textblob import TextBlob
from nltk.tokenize import TweetTokenizer

import tensorflow as tf 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import nltk
nltk.download("stopwords")

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kcava\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
class DataReader:
    """Load the OLID CSV files and expose them as DataFrames or NumPy arrays.

    Paths are built by plain string concatenation of ``folder`` and a file
    name, so ``folder`` is expected to end with a path separator.
    """

    # Columns removed from every CSV that is loaded through this reader.
    _DROPPED_COLUMNS = ("Unnamed: 0", "id", "subtask_a")

    def __init__(self, folder="../Dataset-OLID/OLIDv1.0/", 
                 task_a="data_subtask_a.csv"):
        """Remember the dataset folder and the default task-A file name."""
        self.folder = folder
        self.task_a = task_a

    def get_df_train_data(self):
        """Return the default task-A file (``self.task_a``) as a DataFrame.

        Same processing as :meth:`get_df_data`; kept as a separate entry
        point for existing callers.
        """
        return self.get_df_data(self.task_a)

    def get_df_data(self, file="data_subtask_a.csv"):
        """Read ``folder + file`` and drop the columns in ``_DROPPED_COLUMNS``.

        Raises ``KeyError`` (from pandas) if any of those columns is missing.
        """
        data = pd.read_csv(self.folder + file)
        return data.drop(list(self._DROPPED_COLUMNS), axis=1)

    def get_np_data_and_labels(self, file="data_subtask_a.csv"):
        """Return ``(data, labels)`` as NumPy arrays.

        ``data`` is the first and ``labels`` the second column that remains
        after :meth:`get_df_data` has dropped its columns.
        """
        values = self.get_df_data(file).values
        return values[:, 0], values[:, 1]

    def shuffle_np(self, data, labels):
        """Shuffle ``data`` and ``labels`` with one shared random permutation.

        Indexing with the permutation array creates copies; the inputs are
        left untouched.

        Raises:
            ValueError: if ``data`` and ``labels`` differ in length. An
                explicit check is used because ``assert`` statements are
                removed when Python runs with ``-O``.
        """
        if len(data) != len(labels):
            raise ValueError(
                "data and labels must have the same length "
                "(got {} and {})".format(len(data), len(labels))
            )
        p = np.random.permutation(len(data))
        return data[p], labels[p]

In [5]:
class Preprocessor:
    """Text clean-up strategies for a collection of raw tweet strings.

    Every method takes an indexable, mutable collection of strings (indexed
    ``0 .. len(data) - 1``), overwrites its elements in place and returns the
    same object. ``verbose=True`` prints intermediate values for debugging.
    """

    def no_preprocessing(self, data, verbose=False):
        """Return ``data`` unchanged (baseline strategy)."""
        return data

    def remove_punctuation(self, data, verbose=False):
        """Replace each text with the space-joined words found by TextBlob."""
        for i in range(len(data)):
            if verbose:
                print(data[i])

            # Remove punctuation (TextBlob's word list is relied on for this).
            # Strings are immutable, so the joined result is stored directly;
            # the former ``sentence.copy()`` raised AttributeError.
            data[i] = " ".join(TextBlob(data[i]).words)
        return data

    def remove_stopwords_and_punctuation(self, data, verbose = False):
        """Tokenize with NLTK's ``TweetTokenizer``, lower-case, drop stop words.

        NOTE(review): TweetTokenizer appears to keep punctuation as separate
        tokens, so only punctuation present in the stop list is removed here —
        confirm this matches the intent of the method name.
        """
        tknzr = TweetTokenizer()
        return self._strip_stopwords(data, tknzr.tokenize, verbose)

    def remove_stopwords_and_punctuation_textblob(self, data, verbose = False):
        """Tokenize with TextBlob's word list, lower-case, drop stop words."""
        return self._strip_stopwords(
            data, lambda text: TextBlob(text).words, verbose
        )

    def _strip_stopwords(self, data, tokenize, verbose=False):
        """Rewrite every text in ``data`` without English stop words.

        ``tokenize`` is a callable mapping one text to an iterable of string
        tokens. The tokens are joined with spaces and split again on
        whitespace before filtering, exactly as both public methods did.
        """
        from nltk.corpus import stopwords

        stop = stopwords.words("english")
        stop.append("’")
        # Set for O(1) membership tests; the list is kept for verbose output.
        stop_lookup = frozenset(stop)

        if verbose:
            print(type(stop))
            print(stop)

        # Index-based access is kept on purpose: elements are replaced in
        # place and ``data`` may be a NumPy array rather than a list.
        for i in range(len(data)):
            if verbose:
                print(data[i])

            words = " ".join(tokenize(data[i])).split()
            if verbose:
                print(words)

            data[i] = " ".join(self._drop_stopwords(words, stop_lookup, verbose))
            if verbose:
                print(data[i])
                print("-"*20)
        return data

    @staticmethod
    def _drop_stopwords(words, stop, verbose=False):
        """Return the stripped, lower-cased ``words`` that are not in ``stop``."""
        kept = []
        for word in words:
            word = word.strip().lower()
            if verbose:
                print(word)
            if word not in stop:
                kept.append(word)
            elif verbose:
                print("Remove: ", word)
        return kept



In [6]:
def get_train_val(messages, labels, args):
    """Split messages/labels into train and validation parts and encode them.

    The first ``args.train_portion`` fraction of ``messages`` (and the
    matching ``labels``) forms the training set, the remainder the validation
    set. A Keras ``Tokenizer`` is fitted on the training messages only; both
    sets are then converted to integer sequences and padded/truncated to
    ``args.max_len``.

    ``args`` must provide ``vocab_size``, ``max_len``, ``trunc_type``,
    ``pad_type``, ``oov_tok`` and ``train_portion``.

    Returns ``(train_padded, train_labels, val_padded, val_labels)``.
    Summary sizes are printed along the way.
    """
    # Read every setting up front so a missing attribute fails immediately.
    vocab_size = args.vocab_size
    max_length = args.max_len
    trunc_type = args.trunc_type
    padding_type = args.pad_type
    oov_token = args.oov_tok
    train_portion = args.train_portion

    split_at = int(len(messages) * train_portion)
    train_msgs, val_msgs = messages[:split_at], messages[split_at:]
    train_labels, val_labels = labels[:split_at], labels[split_at:]

    # The vocabulary is learned from the training messages only.
    tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
    tokenizer.fit_on_texts(train_msgs)

    def encode(texts):
        """Turn texts into padded integer sequences of length ``max_length``."""
        return pad_sequences(
            tokenizer.texts_to_sequences(texts),
            maxlen=max_length,
            padding=padding_type,
            truncating=trunc_type,
        )

    print("len(msgs) = {}; len(labels) = {}".format(len(messages), len(labels)))
    print("TRAIN: len(x) = {}; len(y) = {}".format(len(train_msgs),len(train_labels)))
    print("TEST: len(x) = {}; len(y) = {}".format(len(val_msgs),len(val_labels)))

    print("\nlen(word_index) = {}\n".format(len(tokenizer.word_index)))

    train_padded = encode(train_msgs)
    val_padded = encode(val_msgs)

    print(type(val_padded))

    return train_padded, train_labels, val_padded, val_labels

## Models

In [8]:
def basic_model(args):
    """Build (without compiling) a small embedding-based binary classifier.

    Layer stack: Embedding -> GlobalAveragePooling1D -> Dense(24, relu) ->
    Dense(1, sigmoid).

    ``args`` must provide ``vocab_size``, ``emb_dim``, ``max_len`` and
    ``num_epochs``.
    """
    vocab_size = args.vocab_size
    embedding_dim = args.emb_dim
    max_length = args.max_len
    # Not used when building the model; still read so that a missing
    # attribute is reported here, as before.
    _ = args.num_epochs

    layers = tf.keras.layers
    stack = [
        layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
        layers.GlobalAveragePooling1D(),
        layers.Dense(24, activation="relu"),
        layers.Dense(1, activation="sigmoid"),
    ]
    return tf.keras.Sequential(stack)