In [1]:
import numpy as np
import pandas as pd
import time
import os

In [2]:
def loadNLPVectors(folder, filename):
    """Load a saved NumPy array from ``<folder>/<filename>.npy``.

    Parameters
    ----------
    folder : str
        Directory that contains the ``.npy`` file.
    filename : str
        File name without the ``.npy`` extension.

    Returns
    -------
    The object returned by ``np.load`` for that file.
    """
    # Build the path with os.path.join rather than a hard-coded '/' separator.
    path = os.path.join(folder, filename + '.npy')
    return np.load(path)

In [3]:
def loadData(name):
    """Load the train and test splits for one feature representation.

    For each of the 'train' and 'test' folders (in that order) the feature
    file called ``name`` is loaded first, then the 'labels' file.

    Returns
    -------
    (train, test) where each element is a two-item list ``[X, y]``.
    """
    splits = []
    for folder in ('train', 'test'):
        features = loadNLPVectors(folder, name)
        labels = loadNLPVectors(folder, 'labels')
        splits.append([features, labels])

    train, test = splits
    return train, test

# Load NLP Training/Testing Data

In [4]:
# Load the 'unigram' features with their labels; each side is an [X, y] list.
train_uni, test_uni = loadData('unigram')

In [5]:
# Load the 'bigram' train/test [X, y] pairs.
train_big, test_big = loadData('bigram')

In [6]:
# Load the 'tfidf' train/test [X, y] pairs.
train_tfidf, test_tfidf = loadData('tfidf')

In [7]:
# Load the 'word2vec' train/test [X, y] pairs.
train_vec, test_vec = loadData('word2vec')

In [8]:
# Load the 'runigram' train/test [X, y] pairs.
# NOTE(review): the 'r' prefix presumably marks the reduced feature set named
# in the "Reduced Unigram Clustering" heading -- confirm.
train_runi, test_runi = loadData('runigram')

In [9]:
# Load the 'rbigram' train/test [X, y] pairs.
train_rbig, test_rbig = loadData('rbigram')

In [10]:
# Load the 'rtfidf' train/test [X, y] pairs.
train_rtfidf, test_rtfidf = loadData('rtfidf')

# K-Means Clustering

In [11]:
from sklearn.cluster import KMeans

In [12]:
def kCluster(train, verbose = 0, random_state = None):
    """Fit a two-cluster K-Means model on the training features and print the fit time.

    Parameters
    ----------
    train : sequence
        ``[X, y]`` pair. Only ``train[0]`` is used: K-Means is unsupervised
        and scikit-learn's ``KMeans.fit`` ignores ``y``.
    verbose : int, default 0
        Forwarded to ``KMeans``.
    random_state : int or None, default None
        Forwarded to ``KMeans``. ``None`` keeps the previous unseeded
        behaviour; pass an int to make the centroids -- and therefore which
        cluster ends up labelled 0 or 1 -- repeatable between runs.

    Returns
    -------
    The fitted ``KMeans`` estimator.
    """
    cluster = KMeans(n_clusters = 2, verbose = verbose, random_state = random_state)
    # perf_counter is a monotonic clock intended for measuring elapsed intervals.
    start = time.perf_counter()
    cluster.fit(train[0])
    elapsed = time.perf_counter() - start
    print('Training Time: ' + str(elapsed))
    return cluster

# Evaluate Clusters

In [13]:
from sklearn.metrics import accuracy_score

In [14]:
def divideTestData(test):
    """Split the test features by label into label-0 rows and label-1 rows.

    Parameters
    ----------
    test : sequence
        ``[X, y]`` pair; ``X`` holds one sample per row and ``y`` holds one
        label per sample.

    Returns
    -------
    (neutral, hate) : tuple of np.ndarray
        Rows of ``X`` whose label equals 0 and rows whose label equals 1.
        Rows with any other label are dropped. A class with no rows is
        returned as an empty array that keeps ``X``'s trailing dimensions.

    Raises
    ------
    IndexError
        If ``X`` and ``y`` do not describe the same number of samples.
    """
    X = np.asarray(test[0])
    # Flatten so that (n,) and (n, 1) label arrays both yield a 1-D row mask.
    y = np.asarray(test[1]).ravel()

    # Boolean-mask indexing replaces the per-row Python loop.
    neutral = X[y == 0]
    hate = X[y == 1]

    return neutral, hate

In [15]:
def getClusterAvg(cluster_pred):
    """Return the fraction of predictions equal to 0 and the fraction equal to 1.

    Parameters
    ----------
    cluster_pred : array-like
        Cluster ids, one per sample.

    Returns
    -------
    (cluster0, cluster1) : tuple of float
        Share of entries equal to 0 and share equal to 1. Entries with any
        other value count toward the total but toward neither share. An
        empty input returns ``(0.0, 0.0)`` instead of dividing by zero.
    """
    pred = np.asarray(cluster_pred)
    total = pred.size
    if total == 0:
        # Nothing to average over; avoid ZeroDivisionError.
        return 0.0, 0.0

    cluster0 = np.count_nonzero(pred == 0) / total
    cluster1 = np.count_nonzero(pred == 1) / total

    return cluster0, cluster1

In [16]:
def evaluateCluster(cluster, test):
    """Print how the label-0 and label-1 test rows are spread over clusters 0 and 1.

    The test set is split by label with ``divideTestData``, each part is run
    through ``cluster.predict``, and the per-cluster fractions from
    ``getClusterAvg`` are printed. Nothing is returned.
    """
    neutral_rows, hate_rows = divideTestData(test)

    neutral_labels = cluster.predict(neutral_rows)
    hate_labels = cluster.predict(hate_rows)

    neutral_in_0, neutral_in_1 = getClusterAvg(neutral_labels)
    hate_in_0, hate_in_1 = getClusterAvg(hate_labels)

    report = [
        "Neutral Cluster = 0: " + str(neutral_in_0),
        "Hate Cluster = 1:    " + str(hate_in_1),
        " ",
        "Neutral Cluster = 1: " + str(neutral_in_1),
        "Hate Cluster = 0:    " + str(hate_in_0),
    ]
    for line in report:
        print(line)
    

# Reduced Unigram Clustering

In [17]:
# Fit K-Means on the 'runigram' training pair; the fit time is printed.
k_runi = kCluster(train_runi)

Training Time: 0.5228478908538818


In [18]:
# Print, for label-0 and label-1 test rows, the share assigned to clusters 0 and 1.
evaluateCluster(k_runi, test_runi)

Neutral Cluster = 0: 0.29910044977511246
Hate Cluster = 1:    0.8571428571428571
 
Neutral Cluster = 1: 0.7008995502248876
Hate Cluster = 0:    0.14285714285714285


# Reduced Bigram Clustering

In [19]:
# Fit K-Means on the 'rbigram' training pair; the fit time is printed.
k_rbig = kCluster(train_rbig)

Training Time: 1.2798261642456055


In [20]:
# Print, for label-0 and label-1 test rows, the share assigned to clusters 0 and 1.
# NOTE(review): K-Means cluster ids are arbitrary, so fractions that mirror
# another run's output may describe the same partition with ids swapped.
evaluateCluster(k_rbig, test_rbig)

Neutral Cluster = 0: 0.7008995502248876
Hate Cluster = 1:    0.14285714285714285
 
Neutral Cluster = 1: 0.29910044977511246
Hate Cluster = 0:    0.8571428571428571


# Reduced TFIDF Clustering

In [21]:
# Fit K-Means on the 'rtfidf' training pair; the fit time is printed.
k_rtfidf = kCluster(train_rtfidf)

Training Time: 0.7897310256958008


In [22]:
# Print, for label-0 and label-1 test rows, the share assigned to clusters 0 and 1.
evaluateCluster(k_rtfidf, test_rtfidf)

Neutral Cluster = 0: 0.9985007496251874
Hate Cluster = 1:    0.27387529597474347
 
Neutral Cluster = 1: 0.0014992503748125937
Hate Cluster = 0:    0.7261247040252565


# Word2Vec Clustering

In [23]:
# Fit K-Means on the 'word2vec' training pair; the fit time is printed.
k_vec = kCluster(train_vec)

Training Time: 0.2651479244232178


In [24]:
# Print, for label-0 and label-1 test rows, the share assigned to clusters 0 and 1.
evaluateCluster(k_vec, test_vec)

Neutral Cluster = 0: 0.37181409295352325
Hate Cluster = 1:    0.5240726124704025
 
Neutral Cluster = 1: 0.6281859070464768
Hate Cluster = 0:    0.47592738752959746


# Unigram Clustering

In [25]:
# Fit K-Means on the 'unigram' training pair; the fit time is printed.
k_uni = kCluster(train_uni)

Training Time: 33.48102378845215


In [26]:
# Print, for label-0 and label-1 test rows, the share assigned to clusters 0 and 1.
evaluateCluster(k_uni, test_uni)

Neutral Cluster = 0: 0.2983508245877061
Hate Cluster = 1:    0.856353591160221
 
Neutral Cluster = 1: 0.7016491754122939
Hate Cluster = 0:    0.143646408839779


# Bigram Clustering

In [27]:
# Fit K-Means on the 'bigram' training pair; the fit time is printed.
# The recorded run of this cell reported roughly 1075 s of training time.
k_big = kCluster(train_big)

Training Time: 1074.6983239650726


In [28]:
# Print, for label-0 and label-1 test rows, the share assigned to clusters 0 and 1.
evaluateCluster(k_big, test_big)

Neutral Cluster = 0: 0.7016491754122939
Hate Cluster = 1:    0.143646408839779
 
Neutral Cluster = 1: 0.2983508245877061
Hate Cluster = 0:    0.856353591160221


# TFIDF Clustering

In [29]:
# Fit K-Means on the 'tfidf' training pair; the fit time is printed.
k_tfidf = kCluster(train_tfidf)

Training Time: 43.230924129486084


In [30]:
# Print, for label-0 and label-1 test rows, the share assigned to clusters 0 and 1.
evaluateCluster(k_tfidf, test_tfidf)

Neutral Cluster = 0: 0.9977511244377811
Hate Cluster = 1:    0.21310181531176006
 
Neutral Cluster = 1: 0.0022488755622188904
Hate Cluster = 0:    0.7868981846882399
