In [None]:
import numpy as np
import pandas as pd
import os

In [None]:
def loadCSV(filename, encoding = 'ISO-8859-1'):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    filename : str
        Path of the file to read. ``.csv`` is appended when the path does
        not already end with that extension (case-insensitive).
    encoding : str, optional
        Text encoding handed to ``pd.read_csv``. Defaults to ``'ISO-8859-1'``.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.
    """
    file = filename
    # Check the suffix, not substring membership: a path such as
    # "raw.csv_exports/tweets" contains ".csv" but still needs the extension.
    if not file.lower().endswith('.csv'):
        file += '.csv'
    return pd.read_csv(file, encoding = encoding)

# Binary Classification

Switch from Multiclass to Binary Classification

In [None]:
# Label remapping for the multiclass -> binary switch: each value listed in
# `multiClass` is replaced by the value at the same position in `binaryClass`.
multiClass, binaryClass = [2], [1]

In [None]:
# Read the stemmed tweets and rewrite the class column so that every label in
# `multiClass` becomes the matching label in `binaryClass`.
csvFile = "stemmed_hate_speech"
data = loadCSV(csvFile).assign(
    tweet_class = lambda frame: frame["tweet_class"].replace(multiClass, binaryClass)
)

In [None]:
data.head()

In [None]:
# Directory the notebook is running in; the binary-label CSV is written here.
path = os.getcwd()

In [None]:
# Output file name (with its leading separator) appended to the working directory.
new_csv_filename = "/binary_classification.csv"
filepath = f"{path}{new_csv_filename}"

In [None]:
# Persist the relabelled frame: column headers are kept, the row index is not written.
data.to_csv(filepath, header = True, index = False)

# Feature Extraction

In [None]:
# Reload the binary-label dataset from "binary_classification.csv" in the
# working directory so feature extraction starts from the file on disk.
csvFile = "binary_classification"
data = loadCSV(csvFile)

# Split Tweets into Training and Testing Data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Extract the text and label columns as plain, positionally aligned lists.
# Column-wise extraction replaces the row-by-row `data[col][row]` lookup,
# which is slow and raises KeyError unless the index is exactly 0..n-1.
# pandas reads empty CSV fields back as NaN, so missing tweets are turned into
# empty strings: the vectorizers and `str.split` used later require str input.
tweets = data["tweet_text"].fillna("").astype(str).tolist()
labels = data["tweet_class"].tolist()

In [None]:
# Shuffled 80/20 split with a fixed seed, stratified on the labels.
split_options = dict(test_size = 0.2, random_state = 42, shuffle = True, stratify = labels)
X_train, X_test, y_train, y_test = train_test_split(tweets, labels, **split_options)

In [None]:
len(X_train), len(X_test)

In [None]:
len(y_train), len(y_test)

# Encode Text as Unigram

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Learn the unigram vocabulary (capped at 10,000 terms) from the training
# tweets only, and encode them as a sparse count matrix.
unigram_train = CountVectorizer(max_features = 10000)
unigram_train_matrix = unigram_train.fit_transform(X_train)

# Encode the test tweets with the already-fitted vectorizer. This yields the
# same columns as building a second vectorizer from the training vocabulary,
# without calling `get_feature_names()`, which no longer exists in current
# scikit-learn releases.
unigram_test = unigram_train
unigram_test_matrix = unigram_test.transform(X_test)

In [None]:
unigram_train_matrix

In [None]:
unigram_test_matrix

In [None]:
# Densify both sparse unigram matrices into NumPy arrays.
unigram_train_array, unigram_test_array = (
    sparse_counts.toarray()
    for sparse_counts in (unigram_train_matrix, unigram_test_matrix)
)

# Encode Text as Bigram

In [None]:
# ngram_range = (1, 2) counts unigrams *and* bigrams; the vocabulary (capped at
# 65,000 terms) is learned from the training tweets only.
bigram_train = CountVectorizer(max_features = 65000, ngram_range = (1,2))
bigram_train_matrix = bigram_train.fit_transform(X_train)

# Encode the test tweets with the already-fitted vectorizer so both matrices
# share the same columns. This also avoids `get_feature_names()`, which no
# longer exists in current scikit-learn releases.
bigram_test = bigram_train
bigram_test_matrix = bigram_test.transform(X_test)

In [None]:
bigram_train_matrix

In [None]:
bigram_test_matrix

In [None]:
# Densify both sparse n-gram matrices into NumPy arrays.
bigram_train_array, bigram_test_array = (
    sparse_counts.toarray()
    for sparse_counts in (bigram_train_matrix, bigram_test_matrix)
)

# Encode Text as TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Learn the vocabulary (capped at 10,000 terms) and the IDF weights from the
# training tweets only.
tfidf_train = TfidfVectorizer(max_features = 10000)
tfidf_train_matrix = tfidf_train.fit_transform(X_train)

# Transform the test tweets with the fitted vectorizer. Fitting a second
# TfidfVectorizer on X_test would re-estimate the IDF weights from the test
# set, so the same term would be weighted differently in train and test.
tfidf_test = tfidf_train
tfidf_test_matrix = tfidf_test.transform(X_test)

In [None]:
tfidf_train_matrix

In [None]:
tfidf_test_matrix

In [None]:
# Densify both sparse tf-idf matrices into NumPy arrays.
tfidf_train_array, tfidf_test_array = (
    sparse_weights.toarray()
    for sparse_weights in (tfidf_train_matrix, tfidf_test_matrix)
)

# Encode Text using Word Embeddings

In [None]:
from gensim.models import Word2Vec
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Whitespace-tokenise every training tweet, then record each distinct token
# (in first-seen order) as a key of `corpus` for later membership checks.
train_sentences = [tweet.split() for tweet in X_train]
corpus = {token: True for tokens in train_sentences for token in tokens}

In [None]:
# Tokenise the test tweets, keeping only tokens that occur in the training
# corpus. NOTE: tweets left with no known token are dropped entirely, so
# `test_sentences` can be shorter than `X_test` / `y_test`.
test_sentences = []
for tweet in X_test:
    known_tokens = [token for token in tweet.split() if token in corpus]
    if known_tokens:
        test_sentences.append(known_tokens)

In [None]:
# Train 100-dimensional skip-gram (sg = 1) embeddings on the training tweets.
# gensim 4 renamed the `size` argument to `vector_size`; try the new name first
# and fall back to the old one so the cell runs on either major version.
w2v_params = dict(min_count = 1, workers = 3, window = 3, sg = 1)
try:
    model = Word2Vec(train_sentences, vector_size = 100, **w2v_params)
except TypeError:
    # gensim < 4.0 does not accept `vector_size`.
    model = Word2Vec(train_sentences, size = 100, **w2v_params)

In [None]:
def _sentence_vector(tokens, keyed_vectors):
    """Return the mean embedding of the in-vocabulary tokens.

    A zero vector of the embedding size is returned when no token is known,
    so every input sentence yields exactly one output row.
    """
    known = [token for token in tokens if token in keyed_vectors]
    if not known:
        return np.zeros(keyed_vectors.vector_size, dtype = np.float32)
    return keyed_vectors[known].mean(axis = 0)

# Word vectors are read through `model.wv`; indexing the Word2Vec model object
# itself is deprecated in gensim 3 and removed in gensim 4.
# Each tweet is represented by the average of its word vectors. The previous
# `model[sentence][0]` kept only the first word's vector and ignored the rest.
word2vec_train = [_sentence_vector(sentence, model.wv) for sentence in train_sentences]

# Build the test vectors straight from X_test so there is one row per test
# tweet. Dropping tweets without known words would leave fewer rows than
# there are entries in y_test.
word2vec_test = [_sentence_vector(tweet.split(), model.wv) for tweet in X_test]

In [None]:
# Stack the per-tweet vectors row-wise (np.stack joins along axis 0 by default).
word2vec_train_array = np.stack(word2vec_train)

In [None]:
# Stack the per-tweet vectors row-wise (np.stack joins along axis 0 by default).
word2vec_test_array = np.stack(word2vec_test)

In [None]:
# (rows, columns) of the training embeddings. The column count was previously
# read from the *test* array by mistake.
len(word2vec_train_array), len(word2vec_train_array[0])

In [None]:
len(word2vec_test_array), len(word2vec_test_array[0])

# Dimensionality Reduction for Sparse Matrices

Feature reduction for the unigram, bigram, and tf-idf sparse matrices

In [None]:
from sklearn.decomposition import TruncatedSVD

In [None]:
def reduceDim(sparse, n, random_state = None):
    """Project a feature matrix onto `n` components with truncated SVD.

    Parameters
    ----------
    sparse : array-like or sparse matrix
        Feature matrix passed to ``TruncatedSVD.fit_transform``.
    n : int
        Number of components to keep.
    random_state : int or None, optional
        Seed forwarded to ``TruncatedSVD``; pass an int for reproducible
        output. Defaults to ``None``, which leaves the result unseeded.

    Returns
    -------
    numpy.ndarray
        The reduced matrix with `n` columns.

    Notes
    -----
    Every call fits a new decomposition on its input. Calling this separately
    on a training and a test matrix therefore produces components that are
    not comparable between the two.
    """
    tsvd = TruncatedSVD(n_components = n, random_state = random_state)
    return tsvd.fit_transform(sparse)

In [None]:
# Fit the SVD on the training features only and project the test features
# with the same fitted components. Fitting a separate SVD on the test set
# would place train and test in different, incomparable bases.
unigram_svd = TruncatedSVD(n_components = 200, random_state = 42)
runigram_train_array = unigram_svd.fit_transform(unigram_train_array)
runigram_test_array = unigram_svd.transform(unigram_test_array)

In [None]:
# Fit the SVD on the training features only and project the test features
# with the same fitted components. Fitting a separate SVD on the test set
# would place train and test in different, incomparable bases.
bigram_svd = TruncatedSVD(n_components = 500, random_state = 42)
rbigram_train_array = bigram_svd.fit_transform(bigram_train_array)
rbigram_test_array = bigram_svd.transform(bigram_test_array)

In [None]:
# Fit the SVD on the training features only and project the test features
# with the same fitted components. Fitting a separate SVD on the test set
# would place train and test in different, incomparable bases.
tfidf_svd = TruncatedSVD(n_components = 200, random_state = 42)
rtfidf_train_array = tfidf_svd.fit_transform(tfidf_train_array)
rtfidf_test_array = tfidf_svd.transform(tfidf_test_array)

#### Comparison of Original Sparse Matrices to Reduced Matrices

In [None]:
# (rows, columns) before and after reduction — unigram, training set.
unigram_train_array.shape, runigram_train_array.shape

In [None]:
# (rows, columns) before and after reduction — unigram, test set.
unigram_test_array.shape, runigram_test_array.shape

In [None]:
# (rows, columns) before and after reduction — bigram, training set.
bigram_train_array.shape, rbigram_train_array.shape

In [None]:
# (rows, columns) before and after reduction — bigram, test set.
bigram_test_array.shape, rbigram_test_array.shape

In [None]:
# (rows, columns) before and after reduction — tf-idf, training set.
tfidf_train_array.shape, rtfidf_train_array.shape

In [None]:
# (rows, columns) before and after reduction — tf-idf, test set.
tfidf_test_array.shape, rtfidf_test_array.shape

# Saving NumPy Arrays

In [None]:
# Create the output folders. `exist_ok` lets this cell be re-run without
# raising FileExistsError once the folders are already there.
os.makedirs('train', exist_ok = True)
os.makedirs('test', exist_ok = True)

def saveFile(folder, file, data):
    """Save `data` to ``<folder>/<file>.npy`` with ``np.save``.

    Parameters
    ----------
    folder : str
        Existing directory to write into.
    file : str
        File name without the ``.npy`` extension.
    data : array-like
        Object handed to ``np.save``.
    """
    filename = os.path.join(folder, file + ".npy")
    np.save(filename, data)

In [None]:
# Write every training feature matrix to train/<name>.npy, in this order.
train_arrays = {
    'unigram': unigram_train_array,
    'bigram': bigram_train_array,
    'tfidf': tfidf_train_array,
    'word2vec': word2vec_train_array,
    'runigram': runigram_train_array,
    'rbigram': rbigram_train_array,
    'rtfidf': rtfidf_train_array,
}
for feature_name, feature_array in train_arrays.items():
    saveFile('train', feature_name, feature_array)

In [None]:
# Write every test feature matrix to test/<name>.npy, in this order.
test_arrays = {
    'unigram': unigram_test_array,
    'bigram': bigram_test_array,
    'tfidf': tfidf_test_array,
    'word2vec': word2vec_test_array,
    'runigram': runigram_test_array,
    'rbigram': rbigram_test_array,
    'rtfidf': rtfidf_test_array,
}
for feature_name, feature_array in test_arrays.items():
    saveFile('test', feature_name, feature_array)

# Save Labels as Numpy Array

In [None]:
# Store each split's labels next to its feature files as <folder>/labels.npy.
for folder, split_labels in (('train', y_train), ('test', y_test)):
    saveFile(folder, 'labels', split_labels)