In [1]:
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC, NuSVC, OneClassSVM, SVC, SVR, l1_min_c
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import gensim
import sys
sys.path.insert(0, '/home/mcunha/Documents/Classes/KW/G0B34a_knowledge_and_the_web/')
import data.ad_hominem.tokenize_df
from sklearn.metrics import confusion_matrix
import itertools
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

#### Defining a function that will be used later...

In [2]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
        
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

#### Just cleaned the data frame a bit

In [3]:
fallacies = pd.read_csv("../../data/ad_hominem/ad_hominems_cleaned_Murilo.csv")
fallacies = fallacies.drop(['Unnamed: 0'], axis=1)
fallacies = data.ad_hominem.tokenize_df.preprocess_df(fallacies)
fallacies['reddit_ad_hominem.body'].replace('', np.nan, inplace=True)
fallacies.dropna(subset=['reddit_ad_hominem.body'], inplace=True)
fallacies.reset_index()
train_data, test_data = train_test_split(fallacies, test_size=0.3, random_state=3)

fallacies.tail(n=10)

Unnamed: 0,reddit_ad_hominem.body,reddit_ad_hominem.ad_hominem
29277,we re gonna back him because we won know the d...,0
29278,gender,0
29279,which was alienating and confusing it makes yo...,0
29280,and the such my relationship was my baptize by...,0
29281,because that was label that other can understand,0
29282,it is more effective to report it than downvot...,0
29283,which was alienating and confusing it makes yo...,0
29284,because that was quot label that other,0
29285,times more iron fragments than asbestos,0
29286,ve never heard of supreme court rulings allowi...,0


#### I used the doc2vec from [here](/models/02_doc2vec/doc2vec.ipynb)

In [4]:
print("Loading doc2vec model...")
model = gensim.models.doc2vec.Doc2Vec.load("reddit-doc2vec.model")
print("Done!")

print("Preparing the train data...")
x_train = [gensim.utils.simple_preprocess(i) for i in train_data["reddit_ad_hominem.body"]] # Tokenize and remove stop words, make lower case, etc.
x_train = [model.infer_vector(i) for i in x_train]                                          # Infer vectors
y_train = list(train_data["reddit_ad_hominem.ad_hominem"])
print("Done!")

print("Preparing the test data...")
x_test = [gensim.utils.simple_preprocess(i) for i in test_data["reddit_ad_hominem.body"]] # Tokenize and remove stop words, make lower case, etc.
x_test = [model.infer_vector(i) for i in x_test]                                          # Infer vectors
y_test = list(test_data["reddit_ad_hominem.ad_hominem"])
print("Done!")

Loading doc2vec model...
Done!
Preparing the train data...
Done!
Preparing the test data...
Done!


In [5]:
fallacies.dtypes

reddit_ad_hominem.body          object
reddit_ad_hominem.ad_hominem     int64
dtype: object

In [17]:
x_train = np.array(x_train)
y_train = np.array(y_train)
y_train.shape

(20463,)

In [12]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import Conv1D
from keras.layers import Dense, Activation, Dropout
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
import tensorflow as tf

model = Sequential()

model.add(Dense(1024, input_shape=(500,))) # 500 is from the doc2vec model
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(256))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('softmax'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [13]:
batch_size = 1024
epochs = 4

In [14]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1024)              513024    
_________________________________________________________________
activation_1 (Activation)    (None, 1024)              0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               262400    
_________________________________________________________________
activation_2 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 257       
__________

In [18]:
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)

Train on 18416 samples, validate on 2047 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
