# Train Deep Learning Networks on OLID Dataset - 2

In [34]:
# Imports

import numpy as np
import pandas as pd
import csv
from tqdm import tqdm
from sklearn.utils import shuffle

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

## Reading data

In [31]:
class DataReader:
    """Load OLID sub-task A CSV files as DataFrames or NumPy arrays.

    Parameters
    ----------
    folder : str
        Directory that contains the CSV files. A trailing path separator is
        optional.
    task_a : str
        File name of the sub-task A training CSV inside ``folder``.
    """

    # Columns removed from every loaded CSV before it is handed to callers.
    _DROPPED_COLUMNS = ["Unnamed: 0", "id", "subtask_a"]

    def __init__(self, folder="../Dataset-OLID/OLIDv1.0/", 
                 task_a="data_subtask_a.csv"):
        self.folder = folder
        self.task_a = task_a

    def get_df_train_data(self):
        """Return the training CSV (``self.task_a``) as a DataFrame.

        Same as ``get_df_data(self.task_a)``; kept as a convenience entry point.
        """
        return self.get_df_data(self.task_a)

    def get_df_data(self, file="data_subtask_a.csv"):
        """Read ``file`` from ``self.folder`` and drop the unused columns.

        Parameters
        ----------
        file : str
            CSV file name relative to ``self.folder``.

        Returns
        -------
        pandas.DataFrame
            The CSV content without the "Unnamed: 0", "id" and "subtask_a"
            columns.

        Raises
        ------
        KeyError
            If one of the dropped columns is missing from the CSV.
        """
        import os

        # os.path.join keeps the path valid whether or not `folder` ends
        # with a separator (plain string concatenation did not).
        data = pd.read_csv(os.path.join(self.folder, file))
        return data.drop(self._DROPPED_COLUMNS, axis=1)

    def get_np_data_and_labels(self, file="data_subtask_a.csv"):
        """Return ``(data, labels)`` as two 1-D NumPy arrays.

        ``data`` is the first remaining column and ``labels`` the second one
        of the frame returned by :meth:`get_df_data`.
        """
        values = self.get_df_data(file).values
        return values[:, 0], values[:, 1]

    def shuffle_np(self, data, labels, seed=None):
        """Shuffle ``data`` and ``labels`` with the same random permutation.

        Parameters
        ----------
        data, labels : array-like
            Sequences of equal length; they are converted with ``np.asarray``.
        seed : int, optional
            When given, the permutation is drawn from a dedicated generator
            seeded with this value, so the shuffle is reproducible. When
            omitted, NumPy's global random state is used.

        Returns
        -------
        tuple of numpy.ndarray
            Shuffled copies; the inputs are left untouched.
        """
        assert len(data) == len(labels), "data and labels must have the same length"
        data, labels = np.asarray(data), np.asarray(labels)
        if seed is None:
            p = np.random.permutation(len(data))
        else:
            p = np.random.default_rng(seed).permutation(len(data))
        # Indexing with an index array creates copies.
        return data[p], labels[p]
        

In [10]:
# Read the sub-task A training frame from the default location and preview
# its first ten rows.
dr = DataReader()
train_tweets = dr.get_df_train_data()
train_tweets.iloc[:10]


Unnamed: 0,tweet,label_a
0,@USER She should ask a few native Americans wh...,1
1,@USER @USER Go home you’re drunk!!! @USER #MAG...,1
2,Amazon is investigating Chinese employees who ...,0
3,"@USER Someone should'veTaken"" this piece of sh...",1
4,@USER @USER Obama wanted liberals &amp; illega...,0
5,@USER Liberals are all Kookoo !!!,1
6,@USER @USER Oh noes! Tough shit.,1
7,@USER was literally just talking about this lo...,1
8,@USER Buy more icecream!!!,0
9,@USER Canada doesn’t need another CUCK! We alr...,1


In [12]:
# Dimensions and container type of the loaded frame, one per line.
print(train_tweets.shape, type(train_tweets), sep="\n")

(13240, 2)
<class 'pandas.core.frame.DataFrame'>


- StackOverflow: [Convert pandas dataframe to NumPy array](https://stackoverflow.com/questions/13187778/convert-pandas-dataframe-to-numpy-array)
    - `dataframe.values` returns the frame's contents as a NumPy array (pandas now recommends the equivalent `dataframe.to_numpy()`).

In [14]:
# Same check on the underlying NumPy view of the frame.
print(train_tweets.values.shape, type(train_tweets.values), sep="\n")

(13240, 2)
<class 'numpy.ndarray'>


In [17]:
# Column 0 becomes the inputs and column 1 the labels; then report the shape
# and type of each resulting array.
train_data, train_labels = (train_tweets.values[:, column] for column in (0, 1))
print(
    train_data.shape,
    type(train_data),
    train_labels.shape,
    type(train_labels),
    sep="\n",
)


(13240,)
<class 'numpy.ndarray'>
(13240,)
<class 'numpy.ndarray'>


In [20]:
# Peek at the first ten entries of the input array (raw tweet strings).
train_data[:10]

array(['@USER She should ask a few native Americans what their take on this is.',
       '@USER @USER Go home you’re drunk!!! @USER #MAGA #Trump2020 👊🇺🇸👊 URL',
       'Amazon is investigating Chinese employees who are selling internal data to third-party sellers looking for an edge in the competitive marketplace. URL #Amazon #MAGA #KAG #CHINA #TCOT',
       '@USER Someone should\'veTaken" this piece of shit to a volcano. 😂"',
       '@USER @USER Obama wanted liberals &amp; illegals to move into red states',
       '@USER Liberals are all Kookoo !!!',
       '@USER @USER Oh noes! Tough shit.',
       '@USER was literally just talking about this lol all mass shootings like that have been set ups. it’s propaganda used to divide us on major issues like gun control and terrorism',
       '@USER Buy more icecream!!!',
       '@USER Canada doesn’t need another CUCK! We already have enough #LooneyLeft #Liberals f**king up our great country! #Qproofs #TrudeauMustGo'],
      dtype=object)

In [21]:
# Peek at the first ten labels; they line up index-by-index with train_data.
train_labels[:10]

array([1, 1, 0, 1, 0, 1, 1, 1, 0, 1], dtype=object)

- StackOverflow: [Better way to shuffle two numpy arrays in unison](https://stackoverflow.com/questions/4601373/better-way-to-shuffle-two-numpy-arrays-in-unison)
    - numpy's [array indexing](https://docs.scipy.org/doc/numpy-1.10.1/user/basics.indexing.html)
```
assert len(a) == len(b)
p = numpy.random.permutation(len(a))
return a[p], b[p]
```

In [22]:
# Labels and inputs must have the same number of entries before shuffling.
print(len(train_labels), len(train_data), sep="\n")

13240
13240


## Preprocessing

```
>>> text = word_tokenize("And now for something completely different")
>>> nltk.pos_tag(text)
[('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'),
('completely', 'RB'), ('different', 'JJ')]
```

In [25]:
import nltk

# Fetch the NLTK resources the preprocessing steps rely on: the stop-word
# list, the tokenizer models, WordNet and the POS tagger.
nltk.download([
    'stopwords',
    'punkt',
    'wordnet',
    'averaged_perceptron_tagger',
])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kcava\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kcava\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kcava\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\kcava\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [38]:
class Preprocessor:
    """NLTK-based text preprocessing steps for a collection of tweets.

    Every step rewrites the entries of ``data`` in place and also returns
    ``data`` so that steps can be chained. ``tokenize`` expects raw strings;
    the other steps expect each entry to be a list of tokens.
    """
    
    def tokenize(self, data):
        """Lower-case every tweet and split it into tokens.

        Parameters
        ----------
        data : mutable sequence of str
            Raw tweets; each entry is replaced by its list of tokens.

        Returns
        -------
        The same ``data`` object.
        """
        from nltk import word_tokenize
        # Wrapping `data` (not the enumerate object) lets tqdm see the length
        # and show a real progress bar.
        for i, tweet in enumerate(tqdm(data, desc="Tokenization")):
            data[i] = word_tokenize(tweet.lower())
        return data
    
    def remove_stopwords(self, data):
        """Drop English stop words, the ``user`` placeholder and symbol tokens.

        A token is removed when it is an NLTK English stop word, when it is
        the literal ``"user"``, or when it starts with a character that is not
        a letter, digit or whitespace.

        Parameters
        ----------
        data : mutable sequence of list of str
            Tokenized tweets; each entry is replaced by the filtered list.

        Returns
        -------
        The same ``data`` object.
        """
        from nltk.corpus import stopwords
        import re
        # Build the lookup set and compile the pattern once, not per token.
        blocked = set(stopwords.words("english")) | {"user"}
        starts_with_symbol = re.compile(r"[^a-zA-Z\d\s]+")
        for i, tweet in enumerate(tqdm(data, desc="Stopwords Removal")):
            data[i] = [
                w for w in tweet
                if w not in blocked and not starts_with_symbol.match(w)
            ]
        return data
    
    def get_pos(self, word):
        """Return the WordNet part-of-speech constant for a single word.

        The word is tagged on its own, without sentence context. Tags starting
        with V, J, N and R map to verb, adjective, noun and adverb; every
        other tag falls back to noun.
        """
        from nltk import pos_tag
        from nltk.corpus import wordnet
        tag = pos_tag([word])[0][1]

        by_first_letter = {
            "V": wordnet.VERB,
            "J": wordnet.ADJ,
            "N": wordnet.NOUN,
            "R": wordnet.ADV,
        }
        return by_first_letter.get(tag[:1], wordnet.NOUN)
    
    def lemmatize(self, data):
        """Replace every token by its WordNet lemma.

        Parameters
        ----------
        data : sequence of list of str
            Tokenized tweets; the token lists are modified in place.

        Returns
        -------
        The same ``data`` object.
        """
        from nltk.stem import WordNetLemmatizer
        wnl = WordNetLemmatizer()
        # get_pos tags each word in isolation, so its answer depends only on
        # the word itself; caching per distinct word avoids re-tagging
        # repeated tokens without changing any result.
        pos_by_word = {}
        for tweet in tqdm(data, desc="Lemmatization"):
            for j, word in enumerate(tweet):
                if word not in pos_by_word:
                    pos_by_word[word] = self.get_pos(word)
                tweet[j] = wnl.lemmatize(word, pos=pos_by_word[word])
        return data
    
    def stem(self, data):
        """Replace every token by its Porter stem.

        Parameters
        ----------
        data : sequence of list of str
            Tokenized tweets; the token lists are modified in place.

        Returns
        -------
        The same ``data`` object.
        """
        from nltk.stem import PorterStemmer
        stemmer = PorterStemmer()
        # Iterate the argument: the class has no `self.data` attribute.
        for tweet in tqdm(data, desc="Stemming"):
            for j, word in enumerate(tweet):
                tweet[j] = stemmer.stem(word)
        return data
    
    # def word_cloud
    # def clean
        

## Vectorizing

In [40]:
# Imports
from gensim.models import Word2Vec, FastText, KeyedVectors
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import gensim.downloader as api
from os import listdir

In [None]:
class Vectorizer: 
    """Turn tokenized tweets into fixed-length, zero-padded Word2Vec vectors.

    Parameters
    ----------
    type : str
        Name of the vectorization scheme; stored but not interpreted here.
    pre_trained : bool
        When False, a model stored at ``MODEL_PATH`` is trained, extended or
        loaded. When True, a new ``Word2Vec`` model is fitted on the data
        passed to :meth:`word2vec` and nothing is read from or written to disk.
        NOTE(review): the flag name suggests the opposite meaning - confirm.
    retrain : bool
        Force training a new stored model even if one already exists.
    extend_training : bool
        Continue training the stored model on the new data instead of only
        loading it.
    params : dict, optional
        Keyword arguments forwarded to ``Word2Vec``.
    """

    # Single source of truth for where the locally trained model lives.
    MODEL_PATH = "./embeddings/word2vec.model"
    # Number of epochs used when continuing training of a stored model.
    EXTEND_EPOCHS = 5000
    
    def __init__(self, type, pre_trained=False, retrain=False, 
                extend_training=False, params=None):
        
        self.type = type
        self.pre_trained = pre_trained
        # A fresh dict per instance; a `{}` default would be shared by all.
        self.params = {} if params is None else params
        self.retrain = retrain
        self.extend_training = extend_training
        self.vectorizer = None
        self.max_len = None
        self.vocab_length = None
        self.vectors = None

    def train_w2v(self, data):
        """Fit a new Word2Vec model on ``data`` and save it to ``MODEL_PATH``.

        Returns the trained model.
        """
        import os

        model = Word2Vec(data, **self.params)
        os.makedirs(os.path.dirname(self.MODEL_PATH), exist_ok=True)
        model.save(self.MODEL_PATH)
        return model
    
    def word2vec(self, data):
        """Vectorize ``data`` and store the result in ``self.vectors``.

        Each tweet becomes the concatenation of the embeddings of its
        in-vocabulary words, zero-padded to ``self.max_len``. ``max_len`` is
        taken from the longest vector on the first call and reused afterwards;
        vectors longer than it are truncated so that all have equal length.

        Parameters
        ----------
        data : sequence of list of str
            Tokenized tweets.

        Side effects
        ------------
        Sets ``self.vectorizer``, ``self.vocab_length``, ``self.max_len`` and
        ``self.vectors``; may write the model file at ``MODEL_PATH``.
        """
        import os

        if not self.pre_trained:
            # isfile also copes with a missing ./embeddings directory.
            model_exists = os.path.isfile(self.MODEL_PATH)
            if not model_exists or self.retrain: 
                print("\nTraining Word2Vec model...")
                model = self.train_w2v(data)
            elif self.extend_training:
                print("\nExtending existing Word2Vec model...")
                model = Word2Vec.load(self.MODEL_PATH)
                model.train(data, total_examples=len(data), epochs=self.EXTEND_EPOCHS)
                model.save(self.MODEL_PATH)
            else: 
                print("\nLoading existing Word2Vec model...")
                model = Word2Vec.load(self.MODEL_PATH)
        else: 
            model = Word2Vec(data, **self.params)
        
        keyed_vectors = model.wv
        self.vectorizer = keyed_vectors

        # gensim >= 4 exposes the vocabulary as `key_to_index`; older
        # releases expose it as `vocab`.
        vocab = getattr(keyed_vectors, "key_to_index", None)
        if vocab is None:
            vocab = keyed_vectors.vocab
        self.vocab_length = len(vocab)

        # Membership is tested on the keyed vectors, which works in every
        # gensim version (the model object itself is not always a container).
        vectors = [
            np.array([keyed_vectors[word] for word in tweet if word in keyed_vectors]).flatten()
            for tweet in tqdm(data, "Vectorizing")
        ]
        
        if not self.max_len:
            self.max_len = max((len(vector) for vector in vectors), default=0)

        padded_vectors = []
        for vector in tqdm(vectors, "Vectorizing"):
            padded = np.zeros(self.max_len)
            clipped = vector[:self.max_len]
            padded[:len(clipped)] = clipped
            padded_vectors.append(padded)
        self.vectors = padded_vectors
    

## Initializing objects

In [33]:
# Fresh reader; load tweets and labels as NumPy arrays and show the first ten
# of each.
dr = DataReader()
train_data, train_labels = dr.get_np_data_and_labels()
print(train_data[:10], train_labels[:10], sep="\n")

['@USER She should ask a few native Americans what their take on this is.'
 '@USER @USER Go home you’re drunk!!! @USER #MAGA #Trump2020 👊🇺🇸👊 URL'
 'Amazon is investigating Chinese employees who are selling internal data to third-party sellers looking for an edge in the competitive marketplace. URL #Amazon #MAGA #KAG #CHINA #TCOT'
 '@USER Someone should\'veTaken" this piece of shit to a volcano. 😂"'
 '@USER @USER Obama wanted liberals &amp; illegals to move into red states'
 '@USER Liberals are all Kookoo !!!' '@USER @USER Oh noes! Tough shit.'
 '@USER was literally just talking about this lol all mass shootings like that have been set ups. it’s propaganda used to divide us on major issues like gun control and terrorism'
 '@USER Buy more icecream!!!'
 '@USER Canada doesn’t need another CUCK! We already have enough #LooneyLeft #Liberals f**king up our great country! #Qproofs #TrudeauMustGo']
[1 1 0 1 0 1 1 1 0 1]


In [None]:
# Each entry is a tuple of Preprocessor method names, presumably applied in
# the listed order. Single-step pipelines need the trailing comma: without it
# ('lemmatize') is just the string 'lemmatize', and iterating over it yields
# single characters instead of one step name.
preprocessors = [('lemmatize',), 
                 ('remove_stopwords', 'lemmatize'), 
                 ('remove_stopwords', 'stem'), 
                 ('remove_stopwords', 'lemmatize'),
                 ('remove_stopwords', 'lemmatize'),
                 ('remove_stopwords',)
                ]

# NOTE(review): there are six preprocessing pipelines but only five
# vectorizers; if the two lists are meant to be paired index-by-index, one
# vectorizer is missing - confirm the intended sixth entry.
vectorizers = ['count', 'count', 'count', 'tfidf', 'glove']

