# Selecting the examples for analisys
Long, medium and short text entries


In [47]:
import numpy as np
import pandas as pd
import random
from gensim.utils import simple_preprocess
import matplotlib.pyplot as plt

#### Defining a function that will be used later...

In [48]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
        
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

## Importing data

In [49]:
df = pd.read_csv("../../data/ad_hominem/ad_hominems_cleaned_Murilo.csv", sep=",", index_col=0, header=0, names=["body", "isAdHominem"])
df = df[~df.isin([np.nan, np.inf, -np.inf, 'nan']).any(1)] ## Remove rows with NaN values
print(df.shape)

(29218, 2)


In [50]:
df["length"] = df["body"].apply(lambda x: len(simple_preprocess(x, deacc=True))) # Make row for length
df = df.reset_index(drop=True)

In [51]:
pd.set_option('display.max_colwidth', 0)
df.head(n=5)

Unnamed: 0,body,isAdHominem,length
0,What makes corporations different in this case? They have interests too.,0,11
1,"I'm sorry if your smugness gets in the way. Like I said elsewhere in this thread. Somolia is not close to anything I advocate for so why on earth would I move there? Any time the Somolia ""argument"" is brought up, I instantly know I'm dealing with someone who refuses to learn the difference between a voluntary society and a third world country ravaged by warlords and foreign policies of other countries. If you want a thoughtful response to an argument, make sure you're not comparing Antarctica to the Bahamas. Otherwise, take your circlejerk, ""arguments"" elsewhere. You have contributed absolutely nothing to this thread but ad hominem Attacks and the typical liberal/conservative talking points and almost everyone in here knows it.",1,114
2,"Basically to believe a patriarchy exists, you must believe that men are maintaining a system of oppression against women, despite knowing the harm it does to both women and men.EG - Wanting to maintain a system that, among other things, condones severe anti-male bias in all facets of the legal system, simply isn't rational. Thus men, being the ones in power, want to oppress women so much they are willing to harm themselves to do it. It'd be like cutting off your own arm so you had something to club someone with.A long time ago one could say it was ignorance, but with how mainstream feminism thoughts are today this can no longer be true. So the actions of men to maintain the patriarchy must also be willful.How can a person believe this, and not hate men?",0,135
3,The punishment for heresy was being burned at the stake.,0,10
4,No it doesn't. Sex is defined by DNA. DNA cannot be changed from male to female. A sex change is putting lipstick on a pig. It may look different but it's still a pig,0,31


In [52]:
ilong_true = df.loc[(df["length"] > 300) & (df["length"] < 400) & (df['isAdHominem'] == 1)].sample(n=1).index[0]
ilong_false = df.loc[(df["length"] > 300) & (df["length"] < 400) & (df['isAdHominem'] == 0)].sample(n=1).index[0]
imed_true = df.loc[(df["length"] > 100) & (df["length"] < 150) & (df['isAdHominem'] == 1)].sample(n=1).index[0]
imed_false = df.loc[(df["length"] > 100) & (df["length"] < 150) & (df['isAdHominem'] == 0)].sample(n=1).index[0]
ishort_true = df.loc[(df["length"] > 10)  & (df["length"] < 20)  & (df['isAdHominem'] == 1)].sample(n=1).index[0]
ishort_false = df.loc[(df["length"] > 10)  & (df["length"] < 20)  & (df['isAdHominem'] == 0)].sample(n=1).index[0]
print("The indexes for the examples picked (in the original data frame) are {}, {}, {}, {}, {} and {}.".format(ilong_true, ilong_false, imed_true, imed_false, ishort_true, ishort_false))

The indexes for the examples picked (in the original data frame) are 6608, 14258, 12268, 15433, 3798 and 7773.


In [53]:
indexes = [ilong_true, ilong_false, imed_true, imed_false, ishort_true, ishort_false]
df_samples = df.loc[indexes,:]

## Filtering the dataset

In [54]:
df_samples["body"] = df_samples["body"].apply(lambda x: " ".join(simple_preprocess(str(x), deacc=True))) # Remove stop words, special characters, make everything lower case, etc.
df_samples = df_samples.reset_index(drop=True).reindex(["length", "body", "isAdHominem"], axis=1)        # Reset new indexes for data frame and reorder columns (visualization purposes)
df_samples

Unnamed: 0,length,body,isAdHominem
0,391,there no need to debate it just sent you the meriam webster definition so take it up with them interesting love for you to show me where said that but digress trump then goes on to suggest border wall stretching over the entire border not only is this inefficient but it is waste of money and there are number of arguments against wall you have to build it over rivers people homes and property rough terrain this is bad solution and it is absolutely bigoted think you should work on your reading comprehension skills posted my exact quote did never at any point say what you are wrongly attributing to me in fact in my response to you to clarify specifically noted that clearly you didn read it though so ll post it again lastly if you personally believe that trump border wall is good policy love to hear from you and ll gladly debate you on it don think you re necessarily bigot you might be for supporting the border wall but trump campaign promise for border wall was bigotry in my opinion and if you tell me you want wall to keep out those drug dealer murder rapists think you were bigot too you use strawmans over and over to argue with me kind of sad ll try and clarify again for you you re absolutely right if trump supports border wall because he is intolerant of mexicans that doesn make the border wall bad that is fallacy the border wall is bad for the many reasons posted and more but if trump supports border wall because he is intolerant yes he is being bigoted his reasons are not logical but racist that is bigotry and we know this because he said the reason for the border wall was that illegal mexican immigrants are drug dealers racists murderers that bigotry don think all trump supporters or all people who support the wall are bigoted they might be can read millions of people minds there is nothing inherent about wall that is racist or bigoted but if you support the wall because you don like mexicans that is bigotry if you support the wall because you think all illegal immigrants from mexico are drug dealers rapists murderers that is bigotry hopefully this helps you catch up and quit playing strawman,1
1,365,bullying is defined as the act of physical or emotional violence towards others at first glance that sounds bad and even looks bad when you see it between strangers teachers have campaigned to stop bullying and there have been expensive programs to curb this animalistic act here to argue that bullying is actually beneficial to not just society but to the individuals themselves the society as we all know not all humans are equal bullying is way for the fitter humans to rise above weaker humans this leads to gradual increase in fittness of the entire human race over time the fitter humans are given more power resources and mating options which benefits the entire human race over time this also lets families cut their losses early because if they have child not able to survive bullying then the child would be less likely to survive the environment and it would be waste of resources to invest in the doomed child evolutionarily speaking of course not morally the individual it may seem obvious that younger brother being bullied by his older brother is horrible and bad for the family unit as whole but what does the younger sibling do when he bullied does he cry yes does he hate his brother yes does he also go out and try to become better man to not be bullied anymore yes so even though bullying hurts the child emotionally and physicially in the short term the child actually makes physical and emotional gains in the long term because to not make those gains will just lead to more bullying the bullied child will actually become better version of his parallel self that was not bullied in parallel universe he ll be emotionally harder physicially stronger etc this is just the tip of the iceberg of course but should be enough to generilize my views to change them please prove that bullying has net negative effect on society and the bullied individual know bullying can cause negatives like longterm injuries that were meant to be short term but no system is perfect so please look at the average macro scale here maybe there are negatives that overlooking though,0
2,105,was just making sure you understood the word imply is very similar to the word suggest because that exactly what the ad does it states there are enemies it states they are violent and dangerous it states they should be fought with the clenched fist of truth it states the viewer should join that fight it follows those statements by reminding the viewer that this nice message of division comes from an organization centered around objects used to shoot people nice ad hominem though classy oh and playing that you didn understand my question to make me look dumb clever rhetorical strategies you have there,1
3,142,that kind of ridiculous thing to say by bringing guns to sit in they have threatened to shoot people they occupied federal building they are there illegally and it is the duty of the police to remove them they have forced showdown with law enforcement and they brought guns to it what non violent purpose do you have to bring guns to sit in either because want to shoot people or to make people afraid that you will shoot them if your goal is to make people afraid you will shoot them regardless of your personal subjective intent to pull the trigger you are outright threatening people you don get to use guns to defend yourself from law enforcement that is doing exactly their job if selling drugs and police officer tries to arrest me it not self defense if shoot him,0
4,14,ve been going to church all my life and you sound little grouchy too,1
5,15,number of times he actually grabbed woman by her pussy zerohe was stroking his ego,0


## Neural Network/TFIDF
From [here](../02_tfidf/neural_network.ipynb).

In [55]:
from keras import utils
from keras.preprocessing import text, sequence
from sklearn.model_selection import train_test_split
from keras.models import model_from_json

vocab_size = 3000

tokenize = text.Tokenizer(num_words=vocab_size)
#tokenize.fit_on_texts(result.headline_text)

tokenize.fit_on_texts(df_samples["body"]) # only fit on train
x_test = tokenize.texts_to_matrix(df_samples["body"])
x_test.shape

(6, 3000)

In [56]:
# load json and create model
json_file = open('../02_tfidf/model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("../02_tfidf/model.h5")
print("Loaded model from disk")
 
# evaluate loaded model on test data
loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#score = loaded_model.evaluate(x_test, df_samples['isAdHominem'], verbose=0)
ynew = loaded_model.predict_classes(x_test)

Loaded model from disk


In [57]:
df_samples["NN/TFIDF"] = ynew
df_samples

Unnamed: 0,length,body,isAdHominem,NN/TFIDF
0,391,there no need to debate it just sent you the meriam webster definition so take it up with them interesting love for you to show me where said that but digress trump then goes on to suggest border wall stretching over the entire border not only is this inefficient but it is waste of money and there are number of arguments against wall you have to build it over rivers people homes and property rough terrain this is bad solution and it is absolutely bigoted think you should work on your reading comprehension skills posted my exact quote did never at any point say what you are wrongly attributing to me in fact in my response to you to clarify specifically noted that clearly you didn read it though so ll post it again lastly if you personally believe that trump border wall is good policy love to hear from you and ll gladly debate you on it don think you re necessarily bigot you might be for supporting the border wall but trump campaign promise for border wall was bigotry in my opinion and if you tell me you want wall to keep out those drug dealer murder rapists think you were bigot too you use strawmans over and over to argue with me kind of sad ll try and clarify again for you you re absolutely right if trump supports border wall because he is intolerant of mexicans that doesn make the border wall bad that is fallacy the border wall is bad for the many reasons posted and more but if trump supports border wall because he is intolerant yes he is being bigoted his reasons are not logical but racist that is bigotry and we know this because he said the reason for the border wall was that illegal mexican immigrants are drug dealers racists murderers that bigotry don think all trump supporters or all people who support the wall are bigoted they might be can read millions of people minds there is nothing inherent about wall that is racist or bigoted but if you support the wall because you don like mexicans that is bigotry if you support the wall because you think all illegal immigrants from mexico are drug dealers rapists murderers that is bigotry hopefully this helps you catch up and quit playing strawman,1,1
1,365,bullying is defined as the act of physical or emotional violence towards others at first glance that sounds bad and even looks bad when you see it between strangers teachers have campaigned to stop bullying and there have been expensive programs to curb this animalistic act here to argue that bullying is actually beneficial to not just society but to the individuals themselves the society as we all know not all humans are equal bullying is way for the fitter humans to rise above weaker humans this leads to gradual increase in fittness of the entire human race over time the fitter humans are given more power resources and mating options which benefits the entire human race over time this also lets families cut their losses early because if they have child not able to survive bullying then the child would be less likely to survive the environment and it would be waste of resources to invest in the doomed child evolutionarily speaking of course not morally the individual it may seem obvious that younger brother being bullied by his older brother is horrible and bad for the family unit as whole but what does the younger sibling do when he bullied does he cry yes does he hate his brother yes does he also go out and try to become better man to not be bullied anymore yes so even though bullying hurts the child emotionally and physicially in the short term the child actually makes physical and emotional gains in the long term because to not make those gains will just lead to more bullying the bullied child will actually become better version of his parallel self that was not bullied in parallel universe he ll be emotionally harder physicially stronger etc this is just the tip of the iceberg of course but should be enough to generilize my views to change them please prove that bullying has net negative effect on society and the bullied individual know bullying can cause negatives like longterm injuries that were meant to be short term but no system is perfect so please look at the average macro scale here maybe there are negatives that overlooking though,0,1
2,105,was just making sure you understood the word imply is very similar to the word suggest because that exactly what the ad does it states there are enemies it states they are violent and dangerous it states they should be fought with the clenched fist of truth it states the viewer should join that fight it follows those statements by reminding the viewer that this nice message of division comes from an organization centered around objects used to shoot people nice ad hominem though classy oh and playing that you didn understand my question to make me look dumb clever rhetorical strategies you have there,1,1
3,142,that kind of ridiculous thing to say by bringing guns to sit in they have threatened to shoot people they occupied federal building they are there illegally and it is the duty of the police to remove them they have forced showdown with law enforcement and they brought guns to it what non violent purpose do you have to bring guns to sit in either because want to shoot people or to make people afraid that you will shoot them if your goal is to make people afraid you will shoot them regardless of your personal subjective intent to pull the trigger you are outright threatening people you don get to use guns to defend yourself from law enforcement that is doing exactly their job if selling drugs and police officer tries to arrest me it not self defense if shoot him,0,1
4,14,ve been going to church all my life and you sound little grouchy too,1,1
5,15,number of times he actually grabbed woman by her pussy zerohe was stroking his ego,0,1


## Neural Network/Word2Vec
From [here](../02_tfidf/tfidf.ipynb).

In [58]:
# I can't run it on my local computer due to memory limitations

## LinearSVC/Word2Vec
From [here](../02_tfidf/tfidf.ipynb).

In [59]:
# I can't run it on my local computer due to memory limitations

# SVMs/TFIDF

SVMs tested:
* `svm.NuSVC([nu, kernel, degree, gamma, …])`: Nu-Support Vector Classification.
* `svm.SVC([C, kernel, degree, gamma, coef0, …])`: C-Support Vector Classification.
* `svm.LinearSVR([epsilon, tol, C, loss, …])`: Linear Support Vector Regression.

Kernels tested:
* `linear`
* `poly`
* `sigmoid`
* `rbf`

From [here](../05_SVM/SVMs-kernels.ipynb).

In [60]:
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC, NuSVC, OneClassSVM, SVC, SVR, l1_min_c
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import gensim
import sys
sys.path.insert(0, '/home/mcunha/Documents/Classes/KW/G0B34a_knowledge_and_the_web/')
import data.ad_hominem.tokenize_df
from sklearn.metrics import confusion_matrix
import itertools
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

#### TdidfVectorizer used from [here](../02_tfidf/tfidf.ipynb).

In [66]:
v = TfidfVectorizer(ngram_range = (1, 1), max_features=3000)
desired_indices = [i for i in range(len(df.index)) if i not in indexes]
df_notInSamples = df.iloc[desired_indices]

train_data, test_data = train_test_split(df_notInSamples, test_size=0.3, random_state=3)
v.fit(train_data['body'].values.astype('U'))

x_train = v.transform(train_data['body'].values.astype('U'))
y_train = list(train_data["isAdHominem"])

x_test = v.transform(df_samples['body'].values.astype('U'))
y_test = list(df_samples["isAdHominem"])

## NuSVC/TFIDF
The kernels to be used are:
* `linear`
* `poly`
* `sigmoid`
* `rbf`

The kernel `rbf` was used in the [SVMs.ipynb](./SVMs.ipynb). Here for comparison.

In [70]:
%%time

print("Fitting NuSVC model...")
nuModel = NuSVC(nu=0.05, kernel='linear').fit(x_train, y_train)
print("Done!")

predicted = nuModel.predict(x_test)
df_samples["NuSVC-linear/TFIDF"] = predicted

Fitting NuSVC model...
Done!
CPU times: user 3min 3s, sys: 165 ms, total: 3min 3s
Wall time: 3min 3s


In [71]:
%%time

print("Fitting NuSVC model...")
nuModel = NuSVC(nu=0.05, kernel='poly').fit(x_train, y_train)
print("Done!")

predicted = nuModel.predict(x_test)

df_samples["NuSVC-poly/TFIDF"] = predicted

Fitting NuSVC model...
Done!
CPU times: user 9.91 s, sys: 4.02 ms, total: 9.92 s
Wall time: 9.91 s


In [72]:
%%time

print("Fitting NuSVC model...")
nuModel = NuSVC(nu=0.05, kernel='sigmoid').fit(x_train, y_train)
print("Done!")

predicted = nuModel.predict(x_test)

df_samples["NuSVC-sigmoid/TFIDF"] = predicted

Fitting NuSVC model...
Done!
CPU times: user 12 s, sys: 4.5 ms, total: 12 s
Wall time: 12 s


In [73]:
%%time

print("Fitting NuSVC model...")
nuModel = NuSVC(nu=0.05, kernel='rbf').fit(x_train, y_train)
print("Done!")

predicted = nuModel.predict(x_test)

df_samples["NuSVC-rbf/TFIDF"] = predicted

Fitting NuSVC model...
Done!
CPU times: user 13.5 s, sys: 164 µs, total: 13.5 s
Wall time: 13.5 s


## SVC
Theoretically equivalent to other methods (LinearSVC and NuSVC), but uses different implementations.
* `LinearSVC` is equivalent to `SVC(kernel = 'linear')`
* From documentation: *`SVC` and `NuSVC` are similar methods, but accept slightly different sets of parameters and have different mathematical formulations (see section [Mathematical formulation](https://scikit-learn.org/stable/modules/svm.html#svm-mathematical-formulation))

The kernels to be used are:
* `linear`
* `poly`
* `sigmoid`
* `rbf`

In [74]:
%%time

print("Fitting SVC model...")
svcModel = SVC(kernel='poly').fit(x_train, y_train)
print("Done!")

predicted = svcModel.predict(x_test)

df_samples["SVC-poly/TFIDF"] = predicted

Fitting SVC model...
Done!
CPU times: user 35.2 s, sys: 51.8 ms, total: 35.2 s
Wall time: 35.4 s


In [75]:
%%time

print("Fitting SVC model...")
svcModel = SVC(kernel='sigmoid').fit(x_train, y_train)
print("Done!")

predicted = svcModel.predict(x_test)

df_samples["SVC-sigmoid/TFIDF"] = predicted

Fitting SVC model...
Done!
CPU times: user 37.1 s, sys: 76.4 ms, total: 37.2 s
Wall time: 37.5 s


In [76]:
%%time

print("Fitting SVC model...")
svcModel = SVC(kernel='rbf').fit(x_train, y_train)
print("Done!")

predicted = svcModel.predict(x_test)

df_samples["SVC-rbf/TFIDF"] = predicted

Fitting SVC model...
Done!
CPU times: user 39.2 s, sys: 87.2 ms, total: 39.3 s
Wall time: 39.6 s


In [77]:
%%time

print("Fitting SVC model...")
svcModel = SVC(kernel='linear').fit(x_train, y_train)
print("Done!")

predicted = svcModel.predict(x_test)

df_samples["SVC-linear/TFIDF"] = predicted

Fitting SVC model...
Done!
CPU times: user 59.4 s, sys: 207 ms, total: 59.6 s
Wall time: 1min


## LinearSVC/TFIDF
As seen before, it is equivalent to `SVC(kernel='linear'`, with implementation differences.

In [78]:
%%time

print("Fitting linear model...")
linearModel = LinearSVC().fit(x_train, y_train)
print("Done!")

predicted = linearModel.predict(x_test)

df_samples["linearSVC/TFIDF"] = predicted

Fitting linear model...
Done!
CPU times: user 258 ms, sys: 7.81 ms, total: 266 ms
Wall time: 282 ms


In [82]:
pd.set_option('display.max_colwidth', 30)
df_samples

Unnamed: 0,length,body,isAdHominem,NN/TFIDF,NuSVC-linear/TFIDF,NuSVC-poly/TFIDF,NuSVC-sigmoid/TFIDF,NuSVC-rbf/TFIDF,SVC-poly/TFIDF,SVC-sigmoid/TFIDF,SVC-rbf/TFIDF,SVC-linear/TFIDF,linearSVC/TFIDF
0,391,there no need to debate it...,1,1,1,0,0,0,0,0,0,0,0
1,365,bullying is defined as the...,0,1,0,0,0,0,0,0,0,0,0
2,105,was just making sure you u...,1,1,0,0,0,1,0,0,0,0,0
3,142,that kind of ridiculous th...,0,1,0,0,0,1,0,0,0,0,0
4,14,ve been going to church al...,1,1,0,0,0,1,0,0,0,0,0
5,15,number of times he actuall...,0,1,0,0,0,0,0,0,0,0,0


## Mixed Neural Network/Word2Vec + POS tags + Doc2Vec

In [84]:
# Can't run locally due to memory limitations.

In [85]:
x_train #doc2vec: vector_size=500

<20448x3000 sparse matrix of type '<class 'numpy.float64'>'
	with 796531 stored elements in Compressed Sparse Row format>