In [None]:
import numpy as np
import pandas as pd
import time
import os

In [None]:
def loadNLPVectors(filename, data_dir = 'nlp_data'):
    """Load a saved NumPy array from ``<data_dir>/<filename>.npy``.

    Parameters
    ----------
    filename : str
        Base name of the array file, without the ``.npy`` extension.
    data_dir : str, optional
        Directory that holds the array files. Defaults to ``'nlp_data'``
        (relative to the current working directory), which is the location
        this function always used before the parameter existed.

    Returns
    -------
    The object returned by ``np.load`` for that file.
    """
    # os.path.join builds the path with the platform's separator and copes
    # with data_dir being given with or without a trailing slash.
    path = os.path.join(data_dir, filename + '.npy')
    return np.load(path)

In [None]:
def loadLabels():
    """Return the array saved under the base name ``labels``.

    Thin convenience wrapper: the actual file lookup and loading is
    delegated to ``loadNLPVectors``.
    """
    label_array = loadNLPVectors("labels")
    return label_array

In [None]:
from sklearn.model_selection import train_test_split
def genData(nlp, targets = None, test_size = 0.2, random_state = 42):
    """Split a feature matrix and its labels into stratified train/test sets.

    Parameters
    ----------
    nlp : array-like
        Feature matrix to split.
    targets : array-like, optional
        Labels aligned with ``nlp``. When omitted, the notebook-level
        ``labels`` array is used, so existing ``genData(nlp)`` calls behave
        exactly as before. Passing the labels explicitly avoids depending on
        hidden notebook state.
    test_size : float, optional
        Fraction of samples placed in the test set (default ``0.2``).
    random_state : int, optional
        Seed for the shuffle, so the split is reproducible (default ``42``).

    Returns
    -------
    train : list
        ``[X_train, y_train]``
    test : list
        ``[X_test, y_test]``
    """
    if targets is None:
        # Backward-compatible fallback: ``labels`` must already be defined
        # at notebook level when the function is called.
        targets = labels

    # Stratify on the labels so both splits keep the class proportions.
    X_train, X_test, y_train, y_test = train_test_split(nlp, targets,
                                                        test_size = test_size,
                                                        random_state = random_state,
                                                        shuffle = True,
                                                        stratify = targets)

    train = [X_train, y_train]
    test = [X_test, y_test]

    return train, test

# Load NLP Data

In [None]:
# Base names of the saved feature arrays. loadNLPVectors adds the directory
# prefix and the '.npy' extension, so only the bare file stem is given here.
unigram_array = "feature_array_unigram"
bigram_array = "feature_array_bigram"
tfidf_array = "feature_array_tfidf"
wordvec_array = "feature_array_word2vec"
unigram_reduced = "reduced_unigram"
bigram_reduced = "reduced_bigram"
tfidf_reduced = "reduced_tfidf"

In [None]:
# Load every feature representation from disk, plus the label array that
# genData uses for the stratified splits.
unigram = loadNLPVectors(unigram_array)
bigram = loadNLPVectors(bigram_array)
tfidf = loadNLPVectors(tfidf_array)
word2vec = loadNLPVectors(wordvec_array)
reduced_unigram = loadNLPVectors(unigram_reduced)
reduced_bigram = loadNLPVectors(bigram_reduced)
reduced_tfidf = loadNLPVectors(tfidf_reduced)
labels = loadLabels()

# Generate Training and Testing Datasets

In [None]:
# Train/test split of the unigram features; each result is an [X, y] pair.
train_uni, test_uni = genData(unigram)

In [None]:
# Train/test split of the bigram features; each result is an [X, y] pair.
train_big, test_big = genData(bigram)

In [None]:
# Train/test split of the TF-IDF features; each result is an [X, y] pair.
train_tfidf, test_tfidf = genData(tfidf)

In [None]:
# Train/test split of the word2vec features; each result is an [X, y] pair.
train_vec, test_vec = genData(word2vec)

In [None]:
# Train/test split of the reduced unigram features; each result is an [X, y] pair.
train_runi, test_runi = genData(reduced_unigram)

In [None]:
# Train/test split of the reduced bigram features; each result is an [X, y] pair.
train_rbig, test_rbig = genData(reduced_bigram)

In [None]:
# Train/test split of the reduced TF-IDF features; each result is an [X, y] pair.
train_rtfidf, test_rtfidf = genData(reduced_tfidf)

# K-Means Clustering

In [None]:
from sklearn.cluster import KMeans

In [None]:
def kCluster(train, verbose = 0, n_clusters = 2, random_state = 42):
    """Fit a K-Means model on the training features and report the fit time.

    Parameters
    ----------
    train : sequence
        ``[X_train, y_train]`` pair as produced by ``genData``. Only the
        features ``train[0]`` are used; clustering is unsupervised.
    verbose : int, optional
        Verbosity level forwarded to ``KMeans``.
    n_clusters : int, optional
        Number of clusters to form (default ``2``, the value that was
        previously hard-coded).
    random_state : int or None, optional
        Seed for centroid initialisation, so that re-running the notebook
        gives the same clusters. Pass ``None`` for the former unseeded
        behaviour.

    Returns
    -------
    KMeans
        The fitted estimator.
    """
    cluster = KMeans(n_clusters = n_clusters,
                     verbose = verbose,
                     random_state = random_state)

    # perf_counter is a monotonic clock meant for measuring intervals.
    start = time.perf_counter()
    # The labels are deliberately not passed: KMeans.fit ignores its y argument.
    cluster.fit(train[0])
    elapsed = time.perf_counter() - start

    print('Training Time: ' + str(elapsed))
    return cluster

# Evaluate Clusters

In [None]:
def divideTestData(test):
    """Split a test set into the samples labelled 0 and the samples labelled 1.

    Parameters
    ----------
    test : sequence
        ``[X, y]`` pair where ``y[i]`` is the label of row ``X[i]``.

    Returns
    -------
    neutral : numpy.ndarray
        Rows of ``X`` whose label equals 0.
    hate : numpy.ndarray
        Rows of ``X`` whose label equals 1.

    Rows with any other label are dropped. A class with no samples yields an
    array with zero rows that still keeps the feature dimension(s) of ``X``.
    """
    X = np.asarray(test[0])
    # Flatten so a column vector of labels, shape (n, 1), also works as a row mask.
    y = np.asarray(test[1]).ravel()

    # Boolean-mask indexing selects the matching rows in one vectorised step
    # and returns new arrays, leaving the input untouched.
    neutral = X[y == 0]
    hate = X[y == 1]

    return neutral, hate

In [None]:
def getClusterAvg(cluster_pred):
    """Return the fraction of predictions assigned to cluster 0 and to cluster 1.

    Parameters
    ----------
    cluster_pred : array-like
        Cluster index predicted for each sample.

    Returns
    -------
    cluster0 : float
        Share of predictions equal to 0.
    cluster1 : float
        Share of predictions equal to 1.

    Both shares are ``0.0`` when ``cluster_pred`` is empty, instead of
    raising ``ZeroDivisionError``.
    """
    preds = np.asarray(cluster_pred)
    total = preds.size

    if total == 0:
        # No predictions: there is nothing to average over.
        return 0.0, 0.0

    cluster0 = np.count_nonzero(preds == 0) / total
    cluster1 = np.count_nonzero(preds == 1) / total

    return cluster0, cluster1

In [None]:
def evaluateCluster(cluster, test):
    """Print how the test samples of each class are spread over clusters 0 and 1.

    The test set is divided by label with ``divideTestData``, each part is
    passed through ``cluster.predict``, and the per-cluster shares from
    ``getClusterAvg`` are printed. Two pairings are shown, separated by a
    blank-looking line: neutral in cluster 0 with hate in cluster 1, then
    neutral in cluster 1 with hate in cluster 0. Nothing is returned.
    """
    neutral_X, hate_X = divideTestData(test)

    neutral_assignments = cluster.predict(neutral_X)
    hate_assignments = cluster.predict(hate_X)

    neutral_share = getClusterAvg(neutral_assignments)
    hate_share = getClusterAvg(hate_assignments)

    # Index 0 / 1 of each share tuple is the fraction in cluster 0 / 1.
    report_lines = (
        "Neutral Cluster = 0: " + str(neutral_share[0]),
        "Hate Cluster = 1:    " + str(hate_share[1]),
        " ",
        "Neutral Cluster = 1: " + str(neutral_share[1]),
        "Hate Cluster = 0:    " + str(hate_share[0]),
    )
    for line in report_lines:
        print(line)
    

# Reduced Unigram Clustering

In [None]:
# Fit K-Means on the training split of the reduced unigram features.
k_runi = kCluster(train_runi)

In [None]:
# Print the per-class cluster shares on the reduced unigram test split.
evaluateCluster(k_runi, test_runi)

# Reduced Bigram Clustering

In [None]:
# Fit K-Means on the training split of the reduced bigram features.
k_rbig = kCluster(train_rbig)

In [None]:
# Print the per-class cluster shares on the reduced bigram test split.
evaluateCluster(k_rbig, test_rbig)

# Reduced TFIDF Clustering

In [None]:
# Fit K-Means on the training split of the reduced TF-IDF features.
k_rtfidf = kCluster(train_rtfidf)

In [None]:
# Print the per-class cluster shares on the reduced TF-IDF test split.
evaluateCluster(k_rtfidf, test_rtfidf)

# Word2Vec Clustering

In [None]:
# Fit K-Means on the training split of the word2vec features.
k_vec = kCluster(train_vec)

In [None]:
# Print the per-class cluster shares on the word2vec test split.
evaluateCluster(k_vec, test_vec)

# Unigram Clustering

In [None]:
# Fit K-Means on the training split of the full unigram features.
k_uni = kCluster(train_uni)

In [None]:
# Print the per-class cluster shares on the full unigram test split.
evaluateCluster(k_uni, test_uni)

# Bigram Clustering

In [None]:
# Fit K-Means on the training split of the full bigram features.
k_big = kCluster(train_big)

In [None]:
# Print the per-class cluster shares on the full bigram test split.
evaluateCluster(k_big, test_big)

# TFIDF Clustering

In [None]:
# Fit K-Means on the training split of the full TF-IDF features.
k_tfidf = kCluster(train_tfidf)

In [None]:
# Print the per-class cluster shares on the full TF-IDF test split.
evaluateCluster(k_tfidf, test_tfidf)