In [None]:
#import functions and libraries
import numpy as np
import pandas as pd
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import *
from keras.initializers import Constant
import re
import tensorflow as tf
from keras.models import Model
from ipynb.fs.full.datapreprocessing import preProcessDataset

In [None]:
#read the raw train/test CSV files, then pass both through preProcessDataset
#(imported from the datapreprocessing notebook; presumably it adds the
#'processedText' column used by the tokenizer cell -- confirm there)
rawTrain = pd.read_csv('dataset/train.csv')
rawTest = pd.read_csv('dataset/test.csv')
train, test = preProcessDataset(rawTrain, rawTest)

In [None]:
#turn every processedText sentence into a fixed-length sequence of integer token ids
max_features = 31000 #vocabulary cap handed to the Tokenizer as num_words
sequence_length = 256 #every sequence is padded/truncated to this many tokens
tokenizer = Tokenizer(
    num_words=max_features,
    split=' ',
    oov_token='<unw>',
    filters=' ',
)

#the vocabulary is fitted on the training sentences only
trainTexts = train['processedText'].values
tokenizer.fit_on_texts(trainTexts)

#create train matrix
trainSequences = tokenizer.texts_to_sequences(trainTexts)
xTrain = pad_sequences(trainSequences, maxlen=sequence_length)

#create test matrix (encoded with the vocabulary fitted on the training set)
testSequences = tokenizer.texts_to_sequences(test['processedText'].values)
xTest = pad_sequences(testSequences, maxlen=sequence_length)

In [None]:
#load the pre-trained GloVe vectors into a {word: float32 vector} lookup
embeddings_index = {}
#an explicit UTF-8 encoding avoids decode errors on platforms whose default
#encoding differs, and the with-block closes the file even if a line fails to parse
with open('PATH TO GloVe Text File', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            #skip blank lines instead of failing on values[0]
            continue
        #first field is the word, the remaining fields are its vector components
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
#word_index is the tokenizer's word -> integer id lookup built by fit_on_texts
word_index = tokenizer.word_index
uniqueTokenCount = len(word_index)
print('Found %s unique tokens.' % uniqueTokenCount)

In [None]:
#build the initial weight matrix for the Embedding layer, one row per token id
embedding_dim = 100 #must equal the length of the vectors stored in embeddings_index
num_words = 1 + min(max_features, len(word_index))
print(num_words)

#start from all zeros; rows that are never assigned below (e.g. row 0) stay zero
embedding_matrix = np.zeros((num_words, embedding_dim))

#fill the row of every kept token id with its pre-trained vector when one exists
for token, row in word_index.items():
    if row <= max_features:
        pretrainedVector = embeddings_index.get(token)
        #tokens without a pre-trained vector get a random standard-normal vector instead
        embedding_matrix[row] = np.random.randn(embedding_dim) if pretrainedVector is None else pretrainedVector

In [None]:
#hyperparameters consumed by the CNN model cell
filterSizes = [7, 8, 9]  #kernel heights (in tokens) of the convolution branches
numberOfFilters = 512    #number of filters in each convolution branch
dropout = 0.3            #rate handed to the Dropout layer

In [None]:
#create CNN Model: parallel convolutions of different heights over the embedded
#sentence, each max-pooled, concatenated and fed to one sigmoid unit
#(layer classes come from `from keras.layers import *`; the bare name `keras`
#is never imported in this notebook, so it must not be referenced here)
inputs = Input(shape=(sequence_length,),dtype='int32')
embedding = Embedding(num_words,embedding_dim,embeddings_initializer=Constant(embedding_matrix),input_length=sequence_length,trainable=True)(inputs)
#add a trailing channel axis so Conv2D can treat each sentence as a one-channel image
reshape = Reshape((sequence_length, embedding_dim, 1))(embedding)

#one convolution branch per entry of filterSizes; every kernel spans the full embedding width
convBranches = [
    Conv2D(numberOfFilters, kernel_size=(filterSize, embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
    for filterSize in filterSizes
]

#a 'valid' convolution leaves sequence_length - filterSize + 1 positions;
#pooling over all of them keeps a single maximum per filter
pooledBranches = [
    MaxPool2D(pool_size=(sequence_length - filterSize + 1, 1), strides=(1, 1), padding='valid')(convBranch)
    for filterSize, convBranch in zip(filterSizes, convBranches)
]

concatenatedTensor = Concatenate(axis=1)(pooledBranches)
flatten = Flatten()(concatenatedTensor)
#bind the layer output to its own name so `dropout` stays the float rate
#and this cell can be re-run without feeding a tensor in as the rate
dropped = Dropout(dropout)(flatten)
output = Dense(units=1, activation='sigmoid')(dropped)

classifier = Model(inputs=inputs, outputs=output)

In [None]:
#train one binary classifier per label and store its test-set predictions in a dictionary
preds = {}
classes = ['Text_Only_Informative','Image_Only_Informative','Directed_Hate','Generalized_Hate','Sarcasm','Allegation','Justification','Refutation','Support','Oppose']
#snapshot the weights as they are before any training so every label starts
#from the same point instead of continuing from the previous label's fit
#(re-run the model cell before re-running this one to get untrained weights)
initialWeights = classifier.get_weights()
for className in classes:
    print('Training and Prediction for the class: ', className)
    classifier.set_weights(initialWeights)
    #compiling inside the loop gives each label a fresh optimizer and AUC metric
    classifier.compile(loss=tf.keras.losses.BinaryCrossentropy(), optimizer='adam', metrics=[tf.keras.metrics.AUC()])
    classifier.fit(xTrain, train[className], batch_size=50, epochs=7, validation_split=0.2)
    predictions = classifier.predict(xTest)
    #store under `preds`, the dictionary the submission cell reads
    preds[className] = predictions

In [None]:
#create submission dataframe (one prediction column per label) and output csv file
#preds keeps the order in which the labels were trained, so the columns come out
#in that order; np.ravel flattens each prediction array into a plain 1-D column
labelColumns = {labelName: np.ravel(labelPreds) for labelName, labelPreds in preds.items()}
#NOTE(review): 'TweeId' looks like a typo for 'TweetId' -- confirm against the test.csv header
submission = pd.DataFrame(data=labelColumns, index=test['TweeId'])
submission.to_csv('dataset/submissionCNN.csv')