In [131]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

general = pd.read_csv("../../data/general/ad_hominem_attacks.csv", sep=";")
adHominem = pd.read_csv("../../data/ad_hominem/reddit_ad_hominem.csv")

general = general[ general["Pieter"] + general["Murilo"] + general["Eric"] >= 2]

In [132]:
adHominem.shape

general["isAdHominem"] = np.where(general["fallacies.df.Intended.Fallacy"] == "Ad Hominem",True, False)
general.head()

Unnamed: 0,fallacies.df.Topic,fallacies.df.Intended.Fallacy,fallacies.df.Text,Eric,Pieter,Murilo,isAdHominem
0,Are humans to blame for certain animal extinct...,No Fallacy,"Yes, human beings have hunted and eaten animal...",1.0,1,1,False
1,Are humans to blame for certain animal extinct...,No Fallacy,Humans are not to be blamed for animal extinct...,0.0,1,1,False
7,Are humans to blame for certain animal extinct...,No Fallacy,Humans don't care enough for living beings.,1.0,1,1,False
9,Are humans to blame for certain animal extinct...,Ad Hominem,Of course. You throw your garbage into the oce...,1.0,1,1,True
11,Are Quentin Tarantinos movies too violent?,Ad Hominem,"Oh now, I'm not going to debate with you... Ha...",1.0,1,1,True


In [232]:
df = pd.concat([adHominem['body'], adHominem['ad_hominem']], axis=1, keys=['body', 'isAdHominem'])
df = df.append(pd.concat([general['fallacies.df.Text'], general['isAdHominem']], axis=1, keys=['body', 'isAdHominem']))
print(general.shape)
print(adHominem.shape)
print(df.shape)

df["body"] = df["body"].astype(str)
df["isAdHominem"] = df["isAdHominem"].astype(bool)
df = df[~df.isin([np.nan, np.inf, -np.inf]).any(1)]

(382, 7)
(29281, 22)
(29663, 2)


In [233]:
from sklearn.model_selection import train_test_split

train, test = train_test_split( df, test_size=0.3, random_state=3)

print("In total, the train contains", sum(train["isAdHominem"] == True), "ad hominems")
print("In total, the test contains", sum(test["isAdHominem"] == True), "ad hominems")

In total, the train contains 2765 ad hominems
In total, the test contains 1228 ad hominems


In [216]:
from keras import utils
from keras.preprocessing import text, sequence

vocab_size = 100000

tokenize = text.Tokenizer(num_words=vocab_size)
#tokenize.fit_on_texts(result.headline_text)

tokenize.fit_on_texts(df["body"]) # only fit on train
x_train = tokenize.texts_to_matrix(train["body"])
x_test = tokenize.texts_to_matrix(test["body"])

In [217]:
df.dtypes

body           object
isAdHominem      bool
dtype: object

In [218]:
x_train.shape

(267, 100000)

In [219]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import Conv1D
from keras.layers import Dense, Activation, Dropout
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
import tensorflow as tf

model = Sequential()

model.add(Dense(1024, input_shape=(vocab_size,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(256))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('softmax'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [220]:
batch_size = 1024
epochs = 4

In [221]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_26 (Dense)             (None, 1024)              102401024 
_________________________________________________________________
activation_25 (Activation)   (None, 1024)              0         
_________________________________________________________________
dropout_14 (Dropout)         (None, 1024)              0         
_________________________________________________________________
dense_27 (Dense)             (None, 256)               262400    
_________________________________________________________________
activation_26 (Activation)   (None, 256)               0         
_________________________________________________________________
dropout_15 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_28 (Dense)             (None, 1)                 257       
__________

In [222]:
history = model.fit(x_train, train["isAdHominem"],
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)

Train on 240 samples, validate on 27 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
