# Data Merging & Splitting

In [1]:
import pandas as pd
import numpy as np
import json
import os
from sklearn.model_selection import train_test_split
import random

random.seed(210)

## Load Reference Data

In [2]:
### Load the three reference files; each one is parsed with json.load.
### The encoding is given explicitly so the read does not depend on the
### platform default (assumes the files are UTF-8, the standard JSON encoding).

### Training
with open('training_set_task3.txt', encoding='utf-8') as f:
    train = json.load(f)

### Dev
with open('dev_set_task3_labeled.txt', encoding='utf-8') as f:
    dev = json.load(f)

### Test
with open('test_set_task3.txt', encoding='utf-8') as f:
    test = json.load(f)

In [3]:
train[0:3]

[{'id': '128',
  'labels': ['Black-and-white Fallacy/Dictatorship',
   'Name calling/Labeling',
   'Smears'],
  'text': 'THERE ARE ONLY TWO GENDERS\n\nFEMALE \n\nMALE\n',
  'image': '128_image.png'},
 {'id': '189',
  'labels': ['Reductio ad hitlerum', 'Smears', 'Transfer'],
  'text': 'This is not an accident!',
  'image': '189_image.png'},
 {'id': '96',
  'labels': ['Appeal to fear/prejudice',
   'Loaded Language',
   'Name calling/Labeling',
   'Slogans',
   'Smears',
   'Transfer'],
  'text': "SO BERNIE BROS HAVEN'T COMMITTED VIOLENCE EH?\n\nPOWER COMES FROM THE BARREL OF A GUN, COMRADES.\n\nWHAT ABOUT THE ONE WHO SHOT CONGRESSMAN SCALISE OR THE DAYTON OHIO MASS SHOOTER?\n",
  'image': '96_image.png'}]

In [4]:
len(train)

687

In [5]:
dev[0:3]

[{'id': '62_batch_2',
  'labels': ['Smears', 'Doubt'],
  'text': '*President* Biden?\n\nPlease, no.\n',
  'image': '62_image_batch_2.png'},
 {'id': '111_batch_2',
  'labels': ['Smears', 'Loaded Language', 'Name calling/Labeling'],
  'text': 'JOE  VERSUS THE VOLCANIC KREMLIN DON\n\n"WILL YOU SHUT UP, MAN?"\n\nLORD OF THE LIES',
  'image': '111_image_batch_2.png'},
 {'id': '167_batch_2',
  'labels': ['Smears', 'Transfer'],
  'text': 'ANTI-VAXXERS BE LIKE... \n\nHANG ON A SEC - JUST COLLECTING MY FACTS',
  'image': '167_image_batch_2.png'}]

In [6]:
len(dev)

63

In [7]:
test[0:3]

[{'id': '705_batch_2',
  'labels': ['Name calling/Labeling', 'Slogans', 'Smears', 'Transfer'],
  'text': 'The Democrats New America\n',
  'image': '705_image_batch_2.png'},
 {'id': '706_batch_2',
  'labels': ['Appeal to (Strong) Emotions',
   'Appeal to fear/prejudice',
   'Loaded Language',
   'Name calling/Labeling'],
  'text': 'WE ARE AT WAR!\n\nThere is a complex assault on our presidential election\nwith all the multifaceted tentacles of a terrorist organization and operation\n\n2020\nTERRORISM\n',
  'image': '706_image_batch_2.png'},
 {'id': '710_batch_2',
  'labels': ['Doubt', 'Loaded Language', 'Name calling/Labeling'],
  'text': 'KILLED HIMSELF IN PRISON\n\nWON AN HONEST ELECTION\n',
  'image': '710_image_batch_2.png'}]

In [8]:
len(test)

200

In [9]:
len(train) + len(dev) + len(test)

950

## Check for empty labels

In [10]:
### Count the training records whose label list is empty,
### printing the position of each one as it is found
l1 = 0
for pos, rec in enumerate(train):
    if len(rec['labels']) == 0:
        l1 += 1
        print(pos)

125
130
136
159
241
253
260
273
295
434
455
511
512
532
559
567
573
586
611
612
613
615
617
619
662
663
664
677
678


In [11]:
### Count the dev records whose label list is empty,
### printing the position of each one as it is found
l2 = 0
for pos, rec in enumerate(dev):
    if len(rec['labels']) == 0:
        l2 += 1
        print(pos)

3
28
45


In [12]:
### Count the test records whose label list is empty,
### printing the position of each one as it is found
l3 = 0
for pos, rec in enumerate(test):
    if len(rec['labels']) == 0:
        l3 += 1
        print(pos)

172
178
189
194


In [13]:
l1+l2+l3

36

## Remove Empty Labels

In [16]:
### Keep only the records that carry at least one label.
### The original train/dev/test lists are left untouched.
newTrain = [rec for rec in train if len(rec['labels']) != 0]

newDev = [rec for rec in dev if len(rec['labels']) != 0]

newTest = [rec for rec in test if len(rec['labels']) != 0]

In [17]:
print(len(train))
print(len(newTrain))

687
658


In [18]:
print(len(dev))
print(len(newDev))

63
60


In [19]:
print(len(test))
print(len(newTest))

200
196


In [21]:
(len(train) + len(dev) + len(test)) - (len(newTrain) + len(newDev) + len(newTest))

36

## Merge Data

In [22]:
### Build the merged list as a new object.
### Concatenation (instead of `data = newTrain` followed by extend) leaves
### newTrain unchanged, so re-running this cell always yields the same result.
data = newTrain + newDev + newTest
len(data)

914

In [23]:
### Flatten the per-record label lists into one list of individual labels
### (a label appears once for every record that carries it)
labels_indv = [label for rec in data for label in rec['labels']]

In [24]:
def countFreq(mylist):
    """Count the occurrences of each item in `mylist`.

    Prints one "item : count" line per distinct item, in order of first
    appearance, and returns the counts.

    Parameters
    ----------
    mylist : iterable of hashable items

    Returns
    -------
    dict
        Maps each distinct item to the number of times it occurs.
    """
    tally = {}
    for entry in mylist:
        tally[entry] = tally.get(entry, 0) + 1

    for label in tally:
        print("% s : % d"%(label, tally[label]))
    return tally

In [25]:
labels_indv_freq = countFreq(labels_indv)

Black-and-white Fallacy/Dictatorship :  26
Name calling/Labeling :  347
Smears :  602
Reductio ad hitlerum :  23
Transfer :  95
Appeal to fear/prejudice :  91
Loaded Language :  492
Slogans :  70
Causal Oversimplification :  36
Glittering generalities (Virtue) :  112
Flag-waving :  55
Misrepresentation of Someone's Position (Straw Man) :  40
Exaggeration/Minimisation :  99
Repetition :  14
Appeal to (Strong) Emotions :  90
Doubt :  111
Obfuscation, Intentional vagueness, Confusion :  7
Whataboutism :  67
Thought-terminating cliché :  27
Presenting Irrelevant Data (Red Herring) :  7
Appeal to authority :  35
Bandwagon :  5


#### __The lowest-frequency labels are "Bandwagon" (5), "Presenting Irrelevant Data (Red Herring)" (7), and "Obfuscation, Intentional vagueness, Confusion" (7)__

In [26]:
### First, grab the positions in `data` of every record that carries
### at least one of the three lowest-frequency labels
rare_labels = ('Bandwagon',
               'Presenting Irrelevant Data (Red Herring)',
               'Obfuscation, Intentional vagueness, Confusion')
idxs = [pos for pos, rec in enumerate(data)
        if any(label in rec['labels'] for label in rare_labels)]

In [27]:
idxs

[21,
 51,
 59,
 105,
 156,
 293,
 447,
 481,
 533,
 676,
 699,
 727,
 746,
 763,
 815,
 842,
 860,
 873,
 879]

In [28]:
len(idxs)

19

In [29]:
for i in idxs:
    print(data[i])

{'id': '127', 'labels': ['Loaded Language', "Misrepresentation of Someone's Position (Straw Man)", 'Name calling/Labeling', 'Obfuscation, Intentional vagueness, Confusion', 'Smears'], 'text': 'THE PEOPLE WHO ONCE CALLED OUR SOLDIERS BABY KILLERS\n\nNOW MARCH FOR THE RIGHT TO KILL BABIES\n', 'image': '127_image.png'}
{'id': '160', 'labels': ['Presenting Irrelevant Data (Red Herring)'], 'text': 'IF WE DIVIDE 125 GENDERS BY THREE BATHROOMS\n\nHOW MUCH CLIMATE CHANGE DO WE HAVE?', 'image': '160_image.png'}
{'id': '14', 'labels': ['Appeal to (Strong) Emotions', 'Glittering generalities (Virtue)', 'Loaded Language', "Misrepresentation of Someone's Position (Straw Man)", 'Obfuscation, Intentional vagueness, Confusion', 'Smears'], 'text': '21 people running for President and only 1 stands against killing babies\n', 'image': '14_image.png'}
{'id': '107_batch_2', 'labels': ['Bandwagon', 'Exaggeration/Minimisation', 'Loaded Language', 'Smears'], 'text': "If you support this man, I won't judge you

In [30]:
### Split `data` into two separate lists:
###   obs     - the records found via `idxs` (those with a low-frequency label)
###   newData - every remaining record

rare_labels = ('Bandwagon',
               'Presenting Irrelevant Data (Red Herring)',
               'Obfuscation, Intentional vagueness, Confusion')

obs = [data[pos] for pos in idxs]

### Everything that carries none of the low-frequency labels
newData = [rec for rec in data
           if not any(label in rec['labels'] for label in rare_labels)]

In [31]:
print(len(data))
print(len(newData))

914
895


In [32]:
### Double check: report any record in newData that still carries
### one of the low-frequency labels
rare_labels = ('Bandwagon',
               'Presenting Irrelevant Data (Red Herring)',
               'Obfuscation, Intentional vagueness, Confusion')
for pos, rec in enumerate(newData):
    if any(label in rec['labels'] for label in rare_labels):
        print('Still here: ' + str(pos))
print('done')

done


In [33]:
### Create a training and testing split
### 80/20 split of newData (the records without the low-frequency labels).
### random_state is fixed so the same partition is produced on every run.
### No `stratify` argument is passed, so label proportions are not balanced by this call.
data_split = train_test_split(newData, test_size=0.2, random_state=210)

In [34]:
### Create training and testing sets
### Each half is copied so that later in-place inserts into training_data /
### testing_data do not also modify data_split; re-running this cell then
### restores the plain split.
training_data = list(data_split[0])
testing_data = list(data_split[1])

In [35]:
print(len(training_data))
print(len(testing_data))

716
179


In [36]:
for i in range(len(obs)):
    print('{}: {}'.format(i,obs[i]['labels']))

0: ['Loaded Language', "Misrepresentation of Someone's Position (Straw Man)", 'Name calling/Labeling', 'Obfuscation, Intentional vagueness, Confusion', 'Smears']
1: ['Presenting Irrelevant Data (Red Herring)']
2: ['Appeal to (Strong) Emotions', 'Glittering generalities (Virtue)', 'Loaded Language', "Misrepresentation of Someone's Position (Straw Man)", 'Obfuscation, Intentional vagueness, Confusion', 'Smears']
3: ['Bandwagon', 'Exaggeration/Minimisation', 'Loaded Language', 'Smears']
4: ['Loaded Language', "Misrepresentation of Someone's Position (Straw Man)", 'Obfuscation, Intentional vagueness, Confusion', 'Smears']
5: ['Glittering generalities (Virtue)', 'Presenting Irrelevant Data (Red Herring)']
6: ['Bandwagon', 'Loaded Language', 'Thought-terminating cliché']
7: ['Loaded Language', "Misrepresentation of Someone's Position (Straw Man)", 'Name calling/Labeling', 'Obfuscation, Intentional vagueness, Confusion', 'Whataboutism']
8: ['Obfuscation, Intentional vagueness, Confusion']
9: 

In [37]:
### Put the held-out low-frequency observations back by hand:
###   - Bandwagon: 3 into training and 2 into testing
###   - Obfuscation, Intentional vagueness, Confusion and
###     Presenting Irrelevant Data (Red Herring): 4 of each into training and 3 of each into testing
###
### There are observations where Obfuscation, Intentional vagueness, Confusion or
### Presenting Irrelevant Data (Red Herring) is the only label.
### Those go into the training data so the model can learn them.
###
### Each observation is inserted at a random position.

train_obs_idx = [0, 1, 2, 5, 6, 8, 9, 12, 13, 15, 17]
test_obs_idx = [3, 4, 7, 10, 11, 14, 16, 18]

### Every held-out observation must be assigned to exactly one side
assert sorted(train_obs_idx + test_obs_idx) == list(range(len(obs))), \
    'train/test index lists do not cover every observation exactly once'

### Guard against re-running this cell: the split was built from records without
### these labels, so finding one already present means the inserts were done before.
if any(rec in training_data or rec in testing_data for rec in obs):
    raise RuntimeError('Low-frequency observations were already inserted; '
                       're-create the train/test split before running this cell again.')

### Same order of random draws as inserting one by one: training first, then testing
for k in train_obs_idx:
    training_data.insert(random.randint(0, len(training_data)), obs[k])

for k in test_obs_idx:
    testing_data.insert(random.randint(0, len(testing_data)), obs[k])

In [38]:
print(len(training_data))
print(len(testing_data))

727
187


In [39]:
print(len(data))
print(len(training_data) + len(testing_data))

914
914


In [40]:
### Flatten the label lists of each split into one list of individual labels
training_labels_indv = [label for rec in training_data for label in rec['labels']]

testing_labels_indv = [label for rec in testing_data for label in rec['labels']]

In [41]:
training_labels_indv_freq = countFreq(training_labels_indv)

Exaggeration/Minimisation :  69
Name calling/Labeling :  280
Smears :  485
Transfer :  81
Causal Oversimplification :  33
Loaded Language :  394
Repetition :  11
Slogans :  50
Black-and-white Fallacy/Dictatorship :  16
Appeal to (Strong) Emotions :  72
Whataboutism :  57
Appeal to fear/prejudice :  77
Presenting Irrelevant Data (Red Herring) :  4
Doubt :  90
Misrepresentation of Someone's Position (Straw Man) :  34
Glittering generalities (Virtue) :  81
Appeal to authority :  25
Flag-waving :  41
Bandwagon :  3
Reductio ad hitlerum :  20
Thought-terminating cliché :  20
Obfuscation, Intentional vagueness, Confusion :  4


In [42]:
testing_labels_indv_freq = countFreq(testing_labels_indv)

Appeal to (Strong) Emotions :  18
Smears :  117
Black-and-white Fallacy/Dictatorship :  10
Flag-waving :  14
Loaded Language :  98
Name calling/Labeling :  67
Glittering generalities (Virtue) :  31
Transfer :  14
Appeal to fear/prejudice :  14
Causal Oversimplification :  3
Exaggeration/Minimisation :  30
Slogans :  20
Appeal to authority :  10
Doubt :  21
Whataboutism :  10
Misrepresentation of Someone's Position (Straw Man) :  6
Obfuscation, Intentional vagueness, Confusion :  3
Presenting Irrelevant Data (Red Herring) :  3
Thought-terminating cliché :  7
Repetition :  3
Reductio ad hitlerum :  3
Bandwagon :  2


In [43]:
len(training_labels_indv_freq)

22

In [44]:
len(testing_labels_indv_freq)

22

In [45]:
### Now save these datasets as JSON
### Each `with` block closes its file on exit, so no explicit close() is needed
with open('training_data_task3.txt', 'w') as f:
    json.dump(training_data, f)

with open('testing_data_task3.txt', 'w') as f:
    json.dump(testing_data, f)