# Inspect the OLID dataset

This notebook:
- inspects the data briefly
- converts the categorical labels to {0,1} labels
- saves the train and test data to .csv files

In [1]:
# Imports

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## 1. Load Training Data

In [2]:
# Read the tab-separated OLID training file into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer='../Dataset-OLID/OLIDv1.0/olid-training-v1.0.tsv',
    sep='\t',
)
# Preview the first ten rows as the cell output.
data.head(n=10)

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,@USER She should ask a few native Americans wh...,OFF,UNT,
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF,UNT,
4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,,
5,97670,@USER Liberals are all Kookoo !!!,OFF,TIN,OTH
6,77444,@USER @USER Oh noes! Tough shit.,OFF,UNT,
7,52415,@USER was literally just talking about this lo...,OFF,TIN,GRP
8,45157,@USER Buy more icecream!!!,NOT,,
9,13384,@USER Canada doesn’t need another CUCK! We alr...,OFF,TIN,IND


In [3]:
# Dimensions of the training frame as (rows, columns).
data.shape

(13240, 5)

In [4]:
# Keep only the sub-task A columns: drop the sub-task B and C label columns
# into a new frame, leaving `data` itself untouched.
dataA = data.drop(columns=["subtask_b", "subtask_c"])
dataA.head(n=10)

Unnamed: 0,id,tweet,subtask_a
0,86426,@USER She should ask a few native Americans wh...,OFF
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF
2,16820,Amazon is investigating Chinese employees who ...,NOT
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF
4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT
5,97670,@USER Liberals are all Kookoo !!!,OFF
6,77444,@USER @USER Oh noes! Tough shit.,OFF
7,52415,@USER was literally just talking about this lo...,OFF
8,45157,@USER Buy more icecream!!!,NOT
9,13384,@USER Canada doesn’t need another CUCK! We alr...,OFF


- **StackOverflow:** [Pandas: convert categories to numbers](https://stackoverflow.com/questions/38088652/pandas-convert-categories-to-numbers)

In [5]:
# Turn subtask_a into a categorical column with an explicit category order,
# so the integer codes are pinned to NOT -> 0 and OFF -> 1 rather than being
# inferred from whichever labels happen to be present in the data.
# Values outside this list become NaN (code -1).
dataA["subtask_a"] = pd.Categorical(dataA["subtask_a"], categories=["NOT", "OFF"])
dataA.head(10)

Unnamed: 0,id,tweet,subtask_a
0,86426,@USER She should ask a few native Americans wh...,OFF
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF
2,16820,Amazon is investigating Chinese employees who ...,NOT
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF
4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT
5,97670,@USER Liberals are all Kookoo !!!,OFF
6,77444,@USER @USER Oh noes! Tough shit.,OFF
7,52415,@USER was literally just talking about this lo...,OFF
8,45157,@USER Buy more icecream!!!,NOT
9,13384,@USER Canada doesn’t need another CUCK! We alr...,OFF


In [6]:
# Store the integer code of each subtask_a category in a new label_a column.
dataA["label_a"] = dataA["subtask_a"].cat.codes
dataA.head(n=10)

Unnamed: 0,id,tweet,subtask_a,label_a
0,86426,@USER She should ask a few native Americans wh...,OFF,1
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,1
2,16820,Amazon is investigating Chinese employees who ...,NOT,0
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF,1
4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,0
5,97670,@USER Liberals are all Kookoo !!!,OFF,1
6,77444,@USER @USER Oh noes! Tough shit.,OFF,1
7,52415,@USER was literally just talking about this lo...,OFF,1
8,45157,@USER Buy more icecream!!!,NOT,0
9,13384,@USER Canada doesn’t need another CUCK! We alr...,OFF,1


## 2. Collect some data info

In [7]:
# Fraction of rows whose subtask_a label is present. The denominator is taken
# from the frame itself instead of a hard-coded dataset size.
print("A not nan:", 1 - data.subtask_a.isna().sum() / len(data))

A not nan: 1.0


In [8]:
# Count the non-null entries of every remaining column per subtask_a class.
dataA_info = dataA.groupby("subtask_a").count()

In [9]:
# Display the per-class counts.
dataA_info

Unnamed: 0_level_0,id,tweet,label_a
subtask_a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NOT,8840,8840,8840
OFF,4400,4400,4400


In [10]:
# Disabled: assigning two column names fails because the grouped frame has
# three columns (id, tweet, label_a) -- see the ValueError output below.
#dataA_info.columns = ["Total", "Percentage"]

ValueError: Length mismatch: Expected axis has 3 elements, new values have 2 elements

In [None]:
# Display the summary table again.
dataA_info

In [None]:
# Share of tweets per class. The grouped frame has no "Percentage" column to
# normalise, so compute it from the per-class "id" counts. Bracket assignment
# is used so that a real column is created on the frame.
dataA_info["Percentage"] = dataA_info["id"] / dataA_info["id"].sum()

In [None]:
# Display the summary table after the Percentage step.
dataA_info

## 3. Loading Testset Level A

In [None]:
# Read the tab-separated level-A test set file.
test_tweets = pd.read_csv(
    filepath_or_buffer='../Dataset-OLID/OLIDv1.0/testset-levela.tsv',
    sep='\t',
)
test_tweets.head(n=10)

In [None]:
# Read the comma-separated level-A label file for the test set.
# NOTE(review): read_csv treats the first line as the header, and later cells
# rely on 'id' and 'label' columns -- confirm the file really has that header.
test_labels = pd.read_csv(
    filepath_or_buffer='../Dataset-OLID/OLIDv1.0/labels-levela.csv',
)
test_labels.head(n=10)

- **StackOverflow:** [JOIN two dataframes on common column in python](https://stackoverflow.com/questions/41463119/join-two-dataframes-on-common-column-in-python)

In [None]:
# Attach each test tweet's label by a left join on the shared 'id' column, so
# every row of test_tweets is kept even if it has no matching label.
test_data = test_tweets.merge(test_labels, on='id', how='left')
test_data.head(n=10)

In [None]:
# Rename 'label' to 'subtask_a', the column name used in the training frame.
test_data = test_data.rename({'label': 'subtask_a'}, axis='columns')
test_data.head(n=10)

In [None]:
# Encode the test labels with an explicit category order so the codes are
# always NOT -> 0 and OFF -> 1, independent of which labels occur in the test
# split. Values outside this list become NaN and get code -1.
test_data["subtask_a"] = pd.Categorical(test_data["subtask_a"], categories=["NOT", "OFF"])
test_data["label_a"] = test_data["subtask_a"].cat.codes
test_data.head(10)

## 4. Save data in csv

- **StackOverflow:** [Writing a pandas DataFrame to CSV file](https://stackoverflow.com/questions/16923281/writing-a-pandas-dataframe-to-csv-file)

In [None]:
# Write the labelled training and test frames as CSV files in the OLID folder.
# NOTE(review): with the default index=True the row index is also written as
# an unnamed leading column -- confirm downstream readers expect that.
dataA.to_csv(path_or_buf="../Dataset-OLID/OLIDv1.0/data_subtask_a.csv")
test_data.to_csv(path_or_buf="../Dataset-OLID/OLIDv1.0/test_data_subtask_a.csv")

## 5. Test data

In [None]:
# Read a saved CSV back in.
# NOTE(review): this rebinds `test_data` to the saved *training* file
# (data_subtask_a.csv) -- confirm whether test_data_subtask_a.csv was intended.
test_data = pd.read_csv(
    filepath_or_buffer="../Dataset-OLID/OLIDv1.0/data_subtask_a.csv",
)