In [1]:
import numpy as np
import pandas as pd
import time
import os

In [2]:
def loadNLPVectors(filename, data_dir = 'nlp_data'):
    """Load a saved NumPy array from ``<data_dir>/<filename>.npy``.

    Parameters
    ----------
    filename : str
        Base name of the array file, without directory and without the
        ``.npy`` suffix (the suffix is appended here).
    data_dir : str, default 'nlp_data'
        Directory that holds the ``.npy`` files. The default keeps the
        original behaviour of reading from ``nlp_data/`` relative to the
        current working directory.

    Returns
    -------
    Whatever ``np.load`` returns for that file.
    """
    # os.path.join instead of manual '/' concatenation keeps the path valid
    # regardless of platform separator or a trailing slash in data_dir.
    path = os.path.join(data_dir, filename + '.npy')
    return np.load(path)

In [3]:
def loadLabels():
    """Return the array stored under the base name "labels".

    Thin convenience wrapper: delegates to ``loadNLPVectors`` so the labels
    are read from the same location and with the same suffix handling as
    the feature arrays.
    """
    label_file = "labels"
    return loadNLPVectors(label_file)

In [4]:
from sklearn.model_selection import train_test_split
def genData(nlp, y = None, test_size = 0.2, random_state = 42):
    """Split a feature matrix and its labels into stratified train/test sets.

    Parameters
    ----------
    nlp : array-like
        Feature matrix, one row per sample.
    y : array-like, optional
        Labels aligned with the rows of ``nlp``. When omitted, the
        notebook-level ``labels`` array is used, which is what the original
        single-argument form always did.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split`` so every feature set is split with
        the same shuffle.

    Returns
    -------
    train : list
        ``[X_train, y_train]``
    test : list
        ``[X_test, y_test]``
    """
    if y is None:
        # Backward-compatible default: fall back to the global label vector.
        # It must already be defined when this function is *called*.
        y = labels

    X_train, X_test, y_train, y_test = train_test_split(nlp, y,
                                                        test_size = test_size,
                                                        random_state = random_state,
                                                        shuffle = True,
                                                        stratify = y)

    train = [X_train, y_train]
    test = [X_test, y_test]

    return train, test

# Load NLP Data

In [5]:
# Base names of the saved feature arrays. Each is passed to loadNLPVectors(),
# which prepends the data directory and appends ".npy".
unigram_array = "feature_array_unigram"
bigram_array = "feature_array_bigram"
tfidf_array = "feature_array_tfidf"
wordvec_array = "feature_array_word2vec"
# "reduced_*" variants of the unigram / bigram / tf-idf arrays.
# NOTE(review): presumably dimensionality-reduced versions; the reduction step
# is not part of this notebook -- confirm against the notebook that wrote them.
unigram_reduced = "reduced_unigram"
bigram_reduced = "reduced_bigram"
tfidf_reduced = "reduced_tfidf"

In [6]:
# Load every feature array named in the previous cell.
unigram = loadNLPVectors(unigram_array)
bigram = loadNLPVectors(bigram_array)
tfidf = loadNLPVectors(tfidf_array)
word2vec = loadNLPVectors(wordvec_array)
reduced_unigram = loadNLPVectors(unigram_reduced)
reduced_bigram = loadNLPVectors(bigram_reduced)
reduced_tfidf = loadNLPVectors(tfidf_reduced)
# genData() reads this module-level name, so this cell must run before any
# genData() call.
labels = loadLabels()

# Generate Training and Testing Datasets

In [7]:
# Stratified 80/20 train/test split of the unigram features.
train_uni, test_uni = genData(unigram)

In [8]:
# Stratified 80/20 train/test split of the bigram features.
train_big, test_big = genData(bigram)

In [9]:
# Stratified 80/20 train/test split of the tf-idf features.
train_tfidf, test_tfidf = genData(tfidf)

In [10]:
# Stratified 80/20 train/test split of the word2vec features.
train_vec, test_vec = genData(word2vec)

In [11]:
# Stratified 80/20 train/test split of the reduced unigram features.
train_runi, test_runi = genData(reduced_unigram)

In [12]:
# Stratified 80/20 train/test split of the reduced bigram features.
train_rbig, test_rbig = genData(reduced_bigram)

In [13]:
# Stratified 80/20 train/test split of the reduced tf-idf features.
train_rtfidf, test_rtfidf = genData(reduced_tfidf)

# K-Means Clustering

In [14]:
from sklearn.cluster import KMeans

In [15]:
def kCluster(train, verbose = 0, random_state = 42):
    """Fit a two-cluster KMeans model on the training features and print the fit time.

    Parameters
    ----------
    train : sequence
        ``[X_train, y_train]`` pair as produced by ``genData``. Only
        ``train[0]`` is used for fitting: KMeans is unsupervised and its
        ``fit`` ignores the ``y`` argument.
    verbose : int, default 0
        Forwarded to ``KMeans``.
    random_state : int or None, default 42
        Seed for the centroid initialisation, so re-running the notebook
        gives the same clustering (and the same cluster numbering) for the
        same data. Matches the seed used by ``genData``. Pass ``None`` for
        the previous unseeded behaviour.

    Returns
    -------
    The fitted ``KMeans`` estimator.
    """
    cluster = KMeans(n_clusters = 2, verbose = verbose, random_state = random_state)

    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring elapsed intervals; time.time() can jump with wall-clock changes.
    start = time.perf_counter()
    cluster.fit(train[0])
    elapsed = time.perf_counter() - start

    print('Training Time: ' + str(elapsed))
    return cluster

# Evaluate Clusters

In [16]:
def divideTestData(test):
    """Split test features into the rows labelled 0 and the rows labelled 1.

    Parameters
    ----------
    test : sequence
        ``[X_test, y_test]`` pair as produced by ``genData``.

    Returns
    -------
    neutral : np.ndarray
        Rows of ``X_test`` whose label equals 0.
    hate : np.ndarray
        Rows of ``X_test`` whose label equals 1.

    Rows with any other label are dropped, as before. Row order within each
    group is preserved.
    """
    X = np.asarray(test[0])
    # ravel() so a column-vector label array of shape (n, 1) still yields a
    # 1-D row mask.
    y = np.asarray(test[1]).ravel()

    # Boolean-mask selection replaces the per-row Python loop. Unlike building
    # a list and converting it, an absent class gives an array of shape
    # (0, n_features) instead of (0,), so the feature dimension is kept.
    neutral = X[y == 0]
    hate = X[y == 1]

    return neutral, hate

In [17]:
def getClusterAvg(cluster_pred):
    """Return the fraction of predictions assigned to cluster 0 and to cluster 1.

    Parameters
    ----------
    cluster_pred : sequence of int
        Cluster indices, e.g. the output of ``KMeans.predict``.

    Returns
    -------
    cluster0 : float
        Share of entries equal to 0.
    cluster1 : float
        Share of entries equal to 1.

    Entries that are neither 0 nor 1 count towards the total but towards
    neither share. An empty input returns ``(0.0, 0.0)`` instead of raising
    ``ZeroDivisionError``.
    """
    total = len(cluster_pred)
    if total == 0:
        # Nothing was predicted, so neither cluster received anything.
        return 0.0, 0.0

    preds = np.asarray(cluster_pred)
    # int(...) keeps the results plain Python floats, as in the loop version.
    cluster0 = int(np.count_nonzero(preds == 0)) / total
    cluster1 = int(np.count_nonzero(preds == 1)) / total

    return cluster0, cluster1

In [18]:
def evaluateCluster(cluster, test):
    """Print how the label-0 and label-1 test rows distribute over clusters 0 and 1.

    Parameters
    ----------
    cluster : fitted estimator exposing ``predict``
        For example the object returned by ``kCluster``.
    test : sequence
        ``[X_test, y_test]`` pair as produced by ``genData``.

    Both possible pairings of label and cluster index are printed
    (neutral->0 with hate->1, then neutral->1 with hate->0), since the two
    cluster indices are not tied to a particular label. Returns ``None``.
    """
    neutral_rows, hate_rows = divideTestData(test)

    neutral_assignments = cluster.predict(neutral_rows)
    hate_assignments = cluster.predict(hate_rows)

    neutral_in_0, neutral_in_1 = getClusterAvg(neutral_assignments)
    hate_in_0, hate_in_1 = getClusterAvg(hate_assignments)

    first_pairing = (
        ("Neutral Cluster = 0: ", neutral_in_0),
        ("Hate Cluster = 1:    ", hate_in_1),
    )
    second_pairing = (
        ("Neutral Cluster = 1: ", neutral_in_1),
        ("Hate Cluster = 0:    ", hate_in_0),
    )

    for caption, share in first_pairing:
        print(caption + str(share))
    print(" ")
    for caption, share in second_pairing:
        print(caption + str(share))
    

# Reduced Unigram Clustering

In [19]:
# Fit the 2-cluster KMeans model on the reduced unigram training split.
k_runi = kCluster(train_runi)

Training Time: 0.6534719467163086


In [20]:
# Print the cluster shares of the neutral / hate rows in the reduced unigram test split.
evaluateCluster(k_runi, test_runi)

Neutral Cluster = 0: 0.2983508245877061
Hate Cluster = 1:    0.856353591160221
 
Neutral Cluster = 1: 0.7016491754122939
Hate Cluster = 0:    0.143646408839779


# Reduced Bigram Clustering

In [21]:
# Fit the 2-cluster KMeans model on the reduced bigram training split.
k_rbig = kCluster(train_rbig)

Training Time: 1.6098639965057373


In [22]:
# Print the cluster shares of the neutral / hate rows in the reduced bigram test split.
evaluateCluster(k_rbig, test_rbig)

Neutral Cluster = 0: 0.7016491754122939
Hate Cluster = 1:    0.143646408839779
 
Neutral Cluster = 1: 0.2983508245877061
Hate Cluster = 0:    0.856353591160221


# Reduced TFIDF Clustering

In [23]:
# Fit the 2-cluster KMeans model on the reduced tf-idf training split.
k_rtfidf = kCluster(train_rtfidf)

Training Time: 1.149717092514038


In [24]:
# Print the cluster shares of the neutral / hate rows in the reduced tf-idf test split.
evaluateCluster(k_rtfidf, test_rtfidf)

Neutral Cluster = 0: 0.0022488755622188904
Hate Cluster = 1:    0.6621941594317285
 
Neutral Cluster = 1: 0.9977511244377811
Hate Cluster = 0:    0.3378058405682715


# Word2Vec Clustering

In [25]:
# Fit the 2-cluster KMeans model on the word2vec training split.
k_vec = kCluster(train_vec)

Training Time: 0.8785572052001953


In [26]:
# Print the cluster shares of the neutral / hate rows in the word2vec test split.
evaluateCluster(k_vec, test_vec)

Neutral Cluster = 0: 0.4782608695652174
Hate Cluster = 1:    0.5169692186266772
 
Neutral Cluster = 1: 0.5217391304347826
Hate Cluster = 0:    0.48303078137332284


# Unigram Clustering

In [27]:
# Fit the 2-cluster KMeans model on the full unigram training split.
# Slower than the reduced variant: the recorded run took ~53 s.
k_uni = kCluster(train_uni)

Training Time: 53.41338229179382


In [28]:
# Print the cluster shares of the neutral / hate rows in the unigram test split.
evaluateCluster(k_uni, test_uni)

Neutral Cluster = 0: 0.7016491754122939
Hate Cluster = 1:    0.143646408839779
 
Neutral Cluster = 1: 0.2983508245877061
Hate Cluster = 0:    0.856353591160221


# Bigram Clustering

In [29]:
# Fit the 2-cluster KMeans model on the full bigram training split.
# WARNING: slow cell -- the recorded run took ~2734 s (about 46 minutes).
k_big = kCluster(train_big)

Training Time: 2734.345181941986


In [30]:
# Print the cluster shares of the neutral / hate rows in the bigram test split.
evaluateCluster(k_big, test_big)

Neutral Cluster = 0: 0.7016491754122939
Hate Cluster = 1:    0.143646408839779
 
Neutral Cluster = 1: 0.2983508245877061
Hate Cluster = 0:    0.856353591160221


# TFIDF Clustering

In [31]:
# Fit the 2-cluster KMeans model on the full tf-idf training split.
# Slower than the reduced variant: the recorded run took ~75 s.
k_tfidf = kCluster(train_tfidf)

Training Time: 75.32064080238342


In [32]:
# Print the cluster shares of the neutral / hate rows in the tf-idf test split.
evaluateCluster(k_tfidf, test_tfidf)

Neutral Cluster = 0: 0.9985007496251874
Hate Cluster = 1:    0.19889502762430938
 
Neutral Cluster = 1: 0.0014992503748125937
Hate Cluster = 0:    0.8011049723756906
