# Data Prep for MultiModal Modeling

## Author: Ian Scarff (iscarff123)

In [1]:
import pandas as pd
import numpy as np
import json
import os
import imageio
import random
import sklearn
from sklearn.preprocessing import MultiLabelBinarizer
import time
import datetime
from hatesonar import Sonar # This is the hate speech detection library; it is based on bert
from transformers import pipeline

## Load in reference data.

In [2]:
### Load in reference data

with open('training_data_task3.txt') as f: ### Training data
    training = json.load(f)
    
with open('testing_data_task3.txt') as f: ### Testing data
    testing = json.load(f)
    
classes = ['Smears', 'Loaded Language', 'Name calling/Labeling', 'Glittering generalities (Virtue)',
               'Appeal to (Strong) Emotions', 'Appeal to fear/prejudice', 'Transfer', 'Doubt',
               'Exaggeration/Minimisation', 'Whataboutism', 'Slogans', 'Flag-waving',
               "Misrepresentation of Someone's Position (Straw Man)", 'Causal Oversimplification',
               'Thought-terminating cliché', 'Black-and-white Fallacy/Dictatorship', 'Appeal to authority',
               'Reductio ad hitlerum', 'Repetition', 'Obfuscation, Intentional vagueness, Confusion',
               'Presenting Irrelevant Data (Red Herring)', 'Bandwagon']

### Create Class Binarizer
one_hot = MultiLabelBinarizer()
one_hot.fit([classes])

MultiLabelBinarizer()

In [3]:
one_hot.classes_

array(['Appeal to (Strong) Emotions', 'Appeal to authority',
       'Appeal to fear/prejudice', 'Bandwagon',
       'Black-and-white Fallacy/Dictatorship',
       'Causal Oversimplification', 'Doubt', 'Exaggeration/Minimisation',
       'Flag-waving', 'Glittering generalities (Virtue)',
       'Loaded Language',
       "Misrepresentation of Someone's Position (Straw Man)",
       'Name calling/Labeling',
       'Obfuscation, Intentional vagueness, Confusion',
       'Presenting Irrelevant Data (Red Herring)', 'Reductio ad hitlerum',
       'Repetition', 'Slogans', 'Smears', 'Thought-terminating cliché',
       'Transfer', 'Whataboutism'], dtype=object)

In [4]:
training[0:3]

[{'id': '182',
  'labels': ['Exaggeration/Minimisation',
   'Name calling/Labeling',
   'Smears',
   'Transfer'],
  'text': 'The most costly errors in all of history\n\nWorld Health Organization (WHO)\nPreliminary investigations conducted by the Chinese authorities have found no clear evidence of human-to-human transmission of the novel #coronavirus (2019-nCoV) identified in #Wuhan, #China\n14 Jan 2020\n\nWORLD NEWS FEBRUARY 3, 2020 / 10:33 PM / 2 MONTHS AGO\nWHO chief says widespread travel bans not needed to beat China virus\n',
  'image': '182_image.png'},
 {'id': '366_batch_2',
  'labels': ['Causal Oversimplification',
   'Loaded Language',
   'Name calling/Labeling'],
  'text': 'MY PARENTS WERE KILLED AS A RESULT OF A GUN FREE ZONE.\n\nI CARRIED A GUN AT ALL TIMES TILL IT WAS MADE A FELONY TO CARRY IN CERTAIN AREAS. \nA MAN WITH A GUN STARTED SHOOTING PEOPLE...\nI HAD THE PERFECT SHOT BUT WAS DISARMED BY LAWS.\n\n',
  'image': '366_image_batch_2.png'},
 {'id': '148',
  'labels': [

In [5]:
len(training)

727

In [6]:
testing[0:3]

[{'id': '566_batch_2',
  'labels': ['Appeal to (Strong) Emotions', 'Smears'],
  'text': "A FRIENDLY REMINDER...\n\nFERGUSON\n\nBALTIMORE\n\nMILWAUKEE\n\nCHARLOTTE\n\nIN CASE Y'ALL FORGOT !\n",
  'image': '566_image_batch_2.png'},
 {'id': '738_batch_2',
  'labels': ['Black-and-white Fallacy/Dictatorship',
   'Flag-waving',
   'Loaded Language',
   'Name calling/Labeling'],
  'text': "We're fed up.\nWe're not asking for our country back.\nWe're TAKING IT BACK\nWE THE PEOPLE ARE COMING\nAmerica was founded by pissed off people.\nAnd they're pissed again!",
  'image': '738_image_batch_2.png'},
 {'id': '885_batch_2',
  'labels': ['Smears'],
  'text': "PRESIDENTS' DAY SALE\n\nEVERYONE MUST GO",
  'image': '885_image_batch_2.png'}]

In [7]:
len(testing)

187

In [8]:
len(training) + len(testing)

914

## Extract Image Names, Text, & Labels

### Training data

In [9]:
image_name_train = []
text_train = []
one_hot_labels_train = [] ### List to hold one hot labels

for obsv in training: ### Go through the training data

    ### Images ###
    image_name_train.append(obsv['image'])
    
    text_train.append(obsv['text'])
    
    ### Labels
    one_hot_labels_train.append(one_hot.transform([obsv['labels']])[0])


In [10]:
image_name_train[0:3]

['182_image.png', '366_image_batch_2.png', '148_image.png']

In [11]:
text_train[0:3]

['The most costly errors in all of history\n\nWorld Health Organization (WHO)\nPreliminary investigations conducted by the Chinese authorities have found no clear evidence of human-to-human transmission of the novel #coronavirus (2019-nCoV) identified in #Wuhan, #China\n14 Jan 2020\n\nWORLD NEWS FEBRUARY 3, 2020 / 10:33 PM / 2 MONTHS AGO\nWHO chief says widespread travel bans not needed to beat China virus\n',
 'MY PARENTS WERE KILLED AS A RESULT OF A GUN FREE ZONE.\n\nI CARRIED A GUN AT ALL TIMES TILL IT WAS MADE A FELONY TO CARRY IN CERTAIN AREAS. \nA MAN WITH A GUN STARTED SHOOTING PEOPLE...\nI HAD THE PERFECT SHOT BUT WAS DISARMED BY LAWS.\n\n',
 "VOTE IT OUT\nVOTE IT OUT\n\nHE SAID IT WASN'T REAL.\nHE SAID IT'S CONTAINED.\nHE SAID IT WOULD DISAPPEAR.\nHE BLAMED DEMOCRATS.\nHE SHARED HIS HUNCHES.\nHE OVERRULED HIS SCIENTISTS.\nHE HAS FAILED.\nHE HAS ENDANGERED\nEVERY ONE OF US.\n\nVOTE IT OUT\nVOTE IT OUT\n"]

In [12]:
one_hot_labels_train[0:3]

[array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0]),
 array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0])]

### Testing data

In [13]:
image_name_test = []
text_test = []
one_hot_labels_test = [] ### List to hold one hot labels

for obsv in testing: ### Go through the test data

    ### Images ###
    image_name_test.append(obsv['image'])
    
    text_test.append(obsv['text'])
    
    ### Labels
    one_hot_labels_test.append(one_hot.transform([obsv['labels']])[0])

In [14]:
image_name_test[0:3]

['566_image_batch_2.png', '738_image_batch_2.png', '885_image_batch_2.png']

In [15]:
text_test[0:3]

["A FRIENDLY REMINDER...\n\nFERGUSON\n\nBALTIMORE\n\nMILWAUKEE\n\nCHARLOTTE\n\nIN CASE Y'ALL FORGOT !\n",
 "We're fed up.\nWe're not asking for our country back.\nWe're TAKING IT BACK\nWE THE PEOPLE ARE COMING\nAmerica was founded by pissed off people.\nAnd they're pissed again!",
 "PRESIDENTS' DAY SALE\n\nEVERYONE MUST GO"]

In [16]:
one_hot_labels_test[0:3]

[array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]),
 array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0])]

## Text Preprocessing

In [17]:
# Create an object of Sonar Hate Speech Detection
sonar = Sonar()



In [18]:
def hate_speech_classifier(text, Class):
    for i in text:
        sonar_dict = sonar.ping(text=i)
        Class.append(list(sonar_dict.values())[1])
        
def sentiment_classifier(text):
    sent = []
    k = 1
    for i in text:
        result = sentimentanalyzer(text)[0]
        print(str(k) + ' Done')
        sent.append(result['label'])
        k+=1
    return sent

In [19]:
sentimentanalyzer = pipeline("sentiment-analysis", device = 1)

### Training Data

#### Hate Speech

In [20]:
### Training data
hate_speech_class_train = []
hate_speech_classifier(text_train, hate_speech_class_train)

In [21]:
hate_speech_class_train[0:10]

['neither',
 'neither',
 'neither',
 'offensive_language',
 'neither',
 'hate_speech',
 'neither',
 'neither',
 'neither',
 'neither']

In [22]:
# 0 = neither, 1 = hate_speech, 2 = offensive_language

hate_speech_label_train = []

for i in hate_speech_class_train:
    if i == 'neither':
        hate_speech_label_train.append(0)
        
    elif i == 'hate_speech':
        hate_speech_label_train.append(1)
        
    elif i == 'offensive_language':
        hate_speech_label_train.append(2)

In [23]:
hate_speech_label_train[0:10]

[0, 0, 0, 2, 0, 1, 0, 0, 0, 0]

#### Sentiment

In [24]:
sentiment_train = sentiment_classifier(text_train)

1 Done
2 Done
3 Done
4 Done
5 Done
6 Done
7 Done
8 Done
9 Done
10 Done
11 Done
12 Done
13 Done
14 Done
15 Done
16 Done
17 Done
18 Done
19 Done
20 Done
21 Done
22 Done
23 Done
24 Done
25 Done
26 Done
27 Done
28 Done
29 Done
30 Done
31 Done
32 Done
33 Done
34 Done
35 Done
36 Done
37 Done
38 Done
39 Done
40 Done
41 Done
42 Done
43 Done
44 Done
45 Done
46 Done
47 Done
48 Done
49 Done
50 Done
51 Done
52 Done
53 Done
54 Done
55 Done
56 Done
57 Done
58 Done
59 Done
60 Done
61 Done
62 Done
63 Done
64 Done
65 Done
66 Done
67 Done
68 Done
69 Done
70 Done
71 Done
72 Done
73 Done
74 Done
75 Done
76 Done
77 Done
78 Done
79 Done
80 Done
81 Done
82 Done
83 Done
84 Done
85 Done
86 Done
87 Done
88 Done
89 Done
90 Done
91 Done
92 Done
93 Done
94 Done
95 Done
96 Done
97 Done
98 Done
99 Done
100 Done
101 Done
102 Done
103 Done
104 Done
105 Done
106 Done
107 Done
108 Done
109 Done
110 Done
111 Done
112 Done
113 Done
114 Done
115 Done
116 Done
117 Done
118 Done
119 Done
120 Done
121 Done
122 Done
123 Done
1

In [25]:
set(sentiment_train)

{'NEGATIVE'}

### All sentiment is negative. Since there is no variation, the models will only learn negative. This won't add anything to modeling. This will be left out in model training.

### Test Data

#### Hate Speech

In [26]:
### Training data
hate_speech_class_test = []
hate_speech_classifier(text_test, hate_speech_class_test)

In [27]:
hate_speech_class_test[0:10]

['neither',
 'neither',
 'neither',
 'neither',
 'neither',
 'neither',
 'neither',
 'neither',
 'neither',
 'offensive_language']

In [28]:
# 0 = neither, 1 = hate_speech, 2 = offensive_language

hate_speech_label_test = []

for i in hate_speech_class_test:
    if i == 'neither':
        hate_speech_label_test.append(0)
        
    elif i == 'hate_speech':
        hate_speech_label_test.append(1)
        
    elif i == 'offensive_language':
        hate_speech_label_test.append(2)

In [29]:
hate_speech_label_test[0:10]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 2]

## Assemble DataFrames

### Training

In [30]:
training_df = pd.DataFrame({'Image' : image_name_train,
                            'Text' : text_train,
                            'Hate' : hate_speech_label_train,
                            'Labels' : one_hot_labels_train})

In [31]:
training_df

Unnamed: 0,Image,Text,Hate,Labels
0,182_image.png,The most costly errors in all of history\n\nWo...,0,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ..."
1,366_image_batch_2.png,MY PARENTS WERE KILLED AS A RESULT OF A GUN FR...,0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, ..."
2,148_image.png,VOTE IT OUT\nVOTE IT OUT\n\nHE SAID IT WASN'T ...,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
3,58_image_batch_2.png,IF YOU DONT LISTEN TO DESPACITO YOU AIN'T LATI...,2,"[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
4,816_image_batch_2.png,FAIR AND BALANCED\n\nFAUX NEWS\n\nWE DISTORT Y...,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
...,...,...,...,...
722,687_image_batch_2.png,"Asked what the ""D.C."" stands for in ""Washingto...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
723,516_image_batch_2.png,BREAKING NEWS: The Chicago Police Dept has rep...,0,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
724,45_image_batch_2.png,IF WE GIVE UP EVERYTHING THAT OFFENDS SOMEONE ...,0,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
725,86_image_batch_2.png,"YOU'D THINK YOU WERE IN A WHORE HOUSE, SEEING ...",2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, ..."


### Testing

In [32]:
test_df = pd.DataFrame({'Image' : image_name_test,
                            'Text' : text_test,
                            'Hate' : hate_speech_label_test,
                            'Labels' : one_hot_labels_test})

In [33]:
test_df

Unnamed: 0,Image,Text,Hate,Labels
0,566_image_batch_2.png,A FRIENDLY REMINDER...\n\nFERGUSON\n\nBALTIMOR...,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,738_image_batch_2.png,We're fed up.\nWe're not asking for our countr...,0,"[0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, ..."
2,885_image_batch_2.png,PRESIDENTS' DAY SALE\n\nEVERYONE MUST GO,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,41_image_batch_2.png,TRUDEAU'S PRIORITIES\n1.ENRICHING HIS FRIENDS ...,0,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, ..."
4,169_image.png,Our elders were called to war to save lives.\n...,0,"[0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...
182,867_image_batch_2.png,"AT THIS POINT, SHOULDN'T HIS RALLIES BE CONSID...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
183,901_image_batch_2.png,HEY YOU GUYSSSS!\n\nI CAN TRAVEL NOW\nTHE VACC...,0,"[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
184,932_image_batch_2.png,"President Trump, infected with COVID-19, retur...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ..."
185,967_image_batch_2.png,Your parades are cute!\n\nWait till you see ou...,0,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, ..."


## Save to JSON

In [34]:
training_df.to_json('MultiModal_training_data.json')
test_df.to_json('MultiModal_testing_data.json')

In [35]:
pd.read_json('MultiModal_training_data.json')

Unnamed: 0,Image,Text,Hate,Labels
0,182_image.png,The most costly errors in all of history\n\nWo...,0,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ..."
1,366_image_batch_2.png,MY PARENTS WERE KILLED AS A RESULT OF A GUN FR...,0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, ..."
2,148_image.png,VOTE IT OUT\nVOTE IT OUT\n\nHE SAID IT WASN'T ...,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
3,58_image_batch_2.png,IF YOU DONT LISTEN TO DESPACITO YOU AIN'T LATI...,2,"[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
4,816_image_batch_2.png,FAIR AND BALANCED\n\nFAUX NEWS\n\nWE DISTORT Y...,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
...,...,...,...,...
722,687_image_batch_2.png,"Asked what the ""D.C."" stands for in ""Washingto...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
723,516_image_batch_2.png,BREAKING NEWS: The Chicago Police Dept has rep...,0,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
724,45_image_batch_2.png,IF WE GIVE UP EVERYTHING THAT OFFENDS SOMEONE ...,0,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
725,86_image_batch_2.png,"YOU'D THINK YOU WERE IN A WHORE HOUSE, SEEING ...",2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, ..."


In [36]:
pd.read_json('MultiModal_training_data.json').info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 727 entries, 0 to 726
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Image   727 non-null    object
 1   Text    727 non-null    object
 2   Hate    727 non-null    int64 
 3   Labels  727 non-null    object
dtypes: int64(1), object(3)
memory usage: 28.4+ KB
