# Train Deep Learning Networks on OLID Dataset - 2

In [3]:
# Imports

import csv
import os

import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from tqdm import tqdm


## Reading data

In [23]:
class DataReader:
    """Read OLID sub-task A CSV files as DataFrames or NumPy arrays.

    Every loader drops the ``Unnamed: 0``, ``id`` and ``subtask_a`` columns
    and returns whatever columns remain (``tweet`` and ``label_a`` for the
    file used in this notebook).

    Args:
        folder: Directory that contains the CSV files. A trailing path
            separator is optional.
        task_a: File name of the training CSV inside ``folder``.
    """

    # Columns stripped from every file that is loaded. Kept as a list:
    # pandas treats a tuple passed to ``drop`` as ONE (multi-level) label.
    _DROPPED_COLUMNS = ["Unnamed: 0", "id", "subtask_a"]

    def __init__(self, folder="../Dataset-OLID/OLIDv1.0/",
                 task_a="data_subtask_a.csv"):
        self.folder = folder
        self.task_a = task_a

    def get_df_train_data(self):
        """Return the training file (``self.task_a``) as a DataFrame."""
        return self.get_df_data(self.task_a)

    def get_df_data(self, file="data_subtask_a.csv"):
        """Load ``file`` from ``self.folder`` and drop the bookkeeping columns.

        Args:
            file: CSV file name, relative to ``self.folder``.

        Returns:
            pandas.DataFrame without the ``Unnamed: 0``, ``id`` and
            ``subtask_a`` columns.

        Raises:
            KeyError: If one of the columns to drop is missing from the file
                (raised by ``DataFrame.drop``).
        """
        # os.path.join gives the same path as plain concatenation when the
        # folder already ends with a separator, and a valid one when it doesn't.
        data = pd.read_csv(os.path.join(self.folder, file))
        return data.drop(self._DROPPED_COLUMNS, axis=1)

    def get_np_data_labels(self, file="data_subtask_a.csv"):
        """Load ``file`` and split it into two 1-D NumPy arrays.

        Args:
            file: CSV file name, relative to ``self.folder``.

        Returns:
            Tuple ``(data, labels)``: the first and second remaining column
            of the loaded frame, in file order.
        """
        # Was ``self.get_data(file)``, a method that did not exist.
        values = self.get_df_data(file).values
        return values[:, 0], values[:, 1]

    # Backward-compatible aliases: the notebook (and possibly older callers)
    # use the shorter names. They delegate so subclass overrides are honoured.
    def get_train_data(self):
        """Alias of :meth:`get_df_train_data`."""
        return self.get_df_train_data()

    def get_data(self, file="data_subtask_a.csv"):
        """Alias of :meth:`get_df_data`."""
        return self.get_df_data(file)

    def shuffle_np(self, data, labels, seed=None):
        """Shuffle two equally long arrays with the same random permutation.

        Indexing with a permutation array creates copies; the inputs are
        left untouched.

        Args:
            data: Array indexable by an integer array.
            labels: Array of the same length as ``data``.
            seed: Optional seed for a reproducible shuffle. ``None`` (the
                default) draws from NumPy's global random state, as before.

        Returns:
            Tuple ``(shuffled_data, shuffled_labels)`` whose elements stay
            paired position by position.

        Raises:
            AssertionError: If the two inputs differ in length.
        """
        assert len(data) == len(labels), "data and labels must have the same length"
        if seed is None:
            p = np.random.permutation(len(data))
        else:
            # A private RandomState keeps the global RNG state unaffected.
            p = np.random.RandomState(seed).permutation(len(data))
        return data[p], labels[p]
        

In [10]:
# Load the training tweets as a DataFrame and preview the first rows.
dr = DataReader()
# The loader is named get_df_train_data; get_train_data is not defined by
# the DataReader cell and fails on a fresh kernel.
train_tweets = dr.get_df_train_data()
train_tweets.head(10)


Unnamed: 0,tweet,label_a
0,@USER She should ask a few native Americans wh...,1
1,@USER @USER Go home you’re drunk!!! @USER #MAG...,1
2,Amazon is investigating Chinese employees who ...,0
3,"@USER Someone should'veTaken"" this piece of sh...",1
4,@USER @USER Obama wanted liberals &amp; illega...,0
5,@USER Liberals are all Kookoo !!!,1
6,@USER @USER Oh noes! Tough shit.,1
7,@USER was literally just talking about this lo...,1
8,@USER Buy more icecream!!!,0
9,@USER Canada doesn’t need another CUCK! We alr...,1


In [12]:
# Sanity check: dimensions and container type of the loaded frame, one per line.
print(train_tweets.shape, type(train_tweets), sep="\n")

(13240, 2)
<class 'pandas.core.frame.DataFrame'>


- StackOverflow: [Convert pandas dataframe to NumPy array](https://stackoverflow.com/questions/13187778/convert-pandas-dataframe-to-numpy-array)
    - `dataframe.values` is a NumPy array! (Newer pandas versions recommend the equivalent `dataframe.to_numpy()`.)

In [14]:
# Same check on the NumPy view of the frame: shape first, then type.
print(train_tweets.values.shape, type(train_tweets.values), sep="\n")

(13240, 2)
<class 'numpy.ndarray'>


In [17]:
# Column 0 holds the tweets, column 1 the labels; slice each into its own 1-D array.
train_data, train_labels = (train_tweets.values[:, col] for col in (0, 1))

# Report shape and type of both arrays, one value per line.
print(
    train_data.shape,
    type(train_data),
    train_labels.shape,
    type(train_labels),
    sep="\n",
)


(13240,)
<class 'numpy.ndarray'>
(13240,)
<class 'numpy.ndarray'>


In [20]:
# Peek at the first ten tweets only; the bare last expression is rendered as the cell output.
train_data[:10]

array(['@USER She should ask a few native Americans what their take on this is.',
       '@USER @USER Go home you’re drunk!!! @USER #MAGA #Trump2020 👊🇺🇸👊 URL',
       'Amazon is investigating Chinese employees who are selling internal data to third-party sellers looking for an edge in the competitive marketplace. URL #Amazon #MAGA #KAG #CHINA #TCOT',
       '@USER Someone should\'veTaken" this piece of shit to a volcano. 😂"',
       '@USER @USER Obama wanted liberals &amp; illegals to move into red states',
       '@USER Liberals are all Kookoo !!!',
       '@USER @USER Oh noes! Tough shit.',
       '@USER was literally just talking about this lol all mass shootings like that have been set ups. it’s propaganda used to divide us on major issues like gun control and terrorism',
       '@USER Buy more icecream!!!',
       '@USER Canada doesn’t need another CUCK! We already have enough #LooneyLeft #Liberals f**king up our great country! #Qproofs #TrudeauMustGo'],
      dtype=object)

In [21]:
# Peek at the first ten labels. The output shows dtype=object because the labels
# were sliced from the same array as the tweet strings; cast (e.g. .astype(int))
# if a numeric array is needed downstream.
train_labels[:10]

array([1, 1, 0, 1, 0, 1, 1, 1, 0, 1], dtype=object)

- StackOverflow: [Better way to shuffle two numpy arrays in unison](https://stackoverflow.com/questions/4601373/better-way-to-shuffle-two-numpy-arrays-in-unison)
    - numpy's [array indexing](https://docs.scipy.org/doc/numpy-1.10.1/user/basics.indexing.html)
```
assert len(a) == len(b)
p = numpy.random.permutation(len(a))
return a[p], b[p]
```

In [22]:
# Both arrays must have the same length before they are shuffled in unison.
print(len(train_labels), len(train_data), sep="\n")

13240
13240
