In [14]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load both corpora: the annotated "general" fallacy set (semicolon-separated)
# and the Reddit ad hominem set (default comma separator).
general = pd.read_csv("../../data/general/ad_hominem_attacks.csv", sep=";")
adHominem = pd.read_csv("../../data/ad_hominem/reddit_ad_hominem.csv")

# Keep only the rows whose three annotator columns sum to at least 2.
# DataFrame.sum(axis=1) skips missing values, whereas chaining `+` propagates
# NaN: a single missing vote would make the whole sum NaN and silently drop
# the row even when the other two annotators agree.
# NOTE(review): "Eric" is displayed as float in the preview, which presumably
# means that column contains missing votes -- confirm against the CSV.
annotator_votes = general[["Pieter", "Murilo", "Eric"]].sum(axis=1)

# .copy() makes the filtered frame independent of the loaded one, so the
# column added to it in a later cell does not write into a slice.
general = general[annotator_votes >= 2].copy()

In [15]:
# Flag the rows whose intended fallacy is "Ad Hominem". The comparison already
# yields a boolean Series, so no np.where(..., True, False) wrapper is needed.
# assign() returns a new frame instead of writing a column into a filtered
# slice, which avoids pandas' SettingWithCopyWarning.
general = general.assign(
    isAdHominem=general["fallacies.df.Intended.Fallacy"] == "Ad Hominem"
)
general.head()

Unnamed: 0,fallacies.df.Topic,fallacies.df.Intended.Fallacy,fallacies.df.Text,Eric,Pieter,Murilo,isAdHominem
0,Are humans to blame for certain animal extinct...,No Fallacy,"Yes, human beings have hunted and eaten animal...",1.0,1,1,False
1,Are humans to blame for certain animal extinct...,No Fallacy,Humans are not to be blamed for animal extinct...,0.0,1,1,False
7,Are humans to blame for certain animal extinct...,No Fallacy,Humans don't care enough for living beings.,1.0,1,1,False
9,Are humans to blame for certain animal extinct...,Ad Hominem,Of course. You throw your garbage into the oce...,1.0,1,1,True
11,Are Quentin Tarantinos movies too violent?,Ad Hominem,"Oh now, I'm not going to debate with you... Ha...",1.0,1,1,True


In [20]:
# Bring both corpora to a common two-column layout: text in "body",
# label in "isAdHominem".
reddit_part = adHominem[["body", "ad_hominem"]].rename(
    columns={"ad_hominem": "isAdHominem"}
)
general_part = general[["fallacies.df.Text", "isAdHominem"]].rename(
    columns={"fallacies.df.Text": "body"}
)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# ignore_index=True gives the stacked frame unique row labels instead of two
# overlapping ranges.
df = pd.concat([reddit_part, general_part], ignore_index=True)

# Drop incomplete rows BEFORE casting: astype(str) turns NaN into the literal
# text "nan" and astype(bool) turns NaN into True, so filtering after the casts
# can never match anything.
invalid_rows = df.isna().any(axis=1) | df.isin([np.inf, -np.inf]).any(axis=1)
df = df[~invalid_rows]
df = df.astype({"body": str, "isAdHominem": bool})

print(general.shape)
print(adHominem.shape)
print(df.shape)

(382, 7)
(29281, 22)
(29663, 2)


In [17]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split stable.
train, test = train_test_split(df, test_size=0.3, random_state=3)

# Number of positively labelled rows in each split.
n_train_positive = train["isAdHominem"].eq(True).sum()
n_test_positive = test["isAdHominem"].eq(True).sum()

print("In total, the train contains", n_train_positive, "ad hominems")
print("In total, the test contains", n_test_positive, "ad hominems")

In total, the train contains 2765 ad hominems
In total, the test contains 1228 ad hominems


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features over 1- to 4-grams. The vocabulary is learned from the
# training split only; the test split is merely projected onto it.
v = TfidfVectorizer(ngram_range=(1, 4))

# Cast once to NumPy unicode arrays so both splits are fed as plain strings.
train_docs = train['body'].values.astype('U')
test_docs = test['body'].values.astype('U')

# fit_transform learns the vocabulary and vectorizes the training documents in
# a single pass, instead of analysing the whole training corpus twice
# (fit, then transform).
x_train = v.fit_transform(train_docs)
x_test = v.transform(test_docs)

In [19]:
# Shape of the training feature matrix.
x_train.shape # 29663 is the row count of the full corpus before the 70/30 split, not of x_train

(20764, 4323660)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
# Train a linear SVM on the TF-IDF features. random_state is pinned (same seed
# as the train/test split) so that re-running the notebook reproduces the same
# fitted model.
clf = LinearSVC(random_state=3).fit(x_train, train["isAdHominem"])

In [None]:
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    The plot is drawn on the current pyplot figure; nothing is returned.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are labelled "True label" and columns
        "Predicted label" on the plot.
    classes : sequence of str
        Tick labels, in the same order as the rows/columns of ``cm``.
    normalize : bool, default False
        If True, every row is divided by its own sum before printing and
        plotting. Rows that sum to zero are shown as all zeros instead of NaN.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.
    """
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row total is non-zero; a class without any
        # true samples would otherwise yield 0/0 = NaN and a runtime warning.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.

    # np.ndindex walks every (row, column) pair in row-major order; using it
    # keeps the function independent of an `itertools` import made elsewhere.
    for i, j in np.ndindex(*cm.shape):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
from sklearn.metrics import confusion_matrix
import itertools

predicted = clf.predict(x_test)

# Compute the confusion matrix. The label order is pinned so that rows and
# columns always line up with `class_names`, even if one class happens to be
# absent from the test labels or the predictions.
class_names = ["no ad hominem", "ad hominem"]
cnf_matrix = confusion_matrix(test["isAdHominem"], predicted, labels=[False, True])

np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix (raw counts)
fig = plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=False,
                      title="Confusion matrix for Linear SVM")

# Plot normalized confusion matrix (each row divided by its total).
# The classifier evaluated here is the LinearSVC fitted above, so the title
# names it accordingly.
fig = plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title="Normalized confusion matrix for Linear SVM")



In [None]:
# The fitted vocabulary maps every n-gram to its column index and holds
# millions of entries, so show its size and a small preview instead of dumping
# the whole dictionary into the cell output.
print(len(v.vocabulary_), "n-grams in the vocabulary")
dict(itertools.islice(v.vocabulary_.items(), 20))