In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
def loadCSV(filename):
    file = filename
    if '.csv' not in filename:
        file += '.csv'
    data = pd.read_csv(file, encoding = 'ISO-8859-1')
    return data

In [3]:
csvFile = "stemmed_hate_speech"
data = loadCSV(csvFile)

In [4]:
data.head()

Unnamed: 0,tweet_text,confidence,tweet_class
0,warn penni board make faggot,0.6013,1
1,fuck dyke,0.7227,2
2,[@] [@] [@] [@] [@] least look like jefre star...,0.5229,2
3,[@] [@] [@] fag jacki jealou neeeee,0.5184,2
4,[@] heard bitch way back th texa wtf talk bitc...,0.5185,1


In [5]:
data.tail()

Unnamed: 0,tweet_text,confidence,tweet_class
13000,sorri offend white supremacist aryan nation ne...,0.3418,0
13001,[@] caucasian euro aryan whatev realli doesnt ...,0.6804,0
13002,[@] sir patient name aryan khan villag meeranp...,1.0,0
13003,[@] happi birthday bro happi year ahead,1.0,0
13004,[@] aryan kapoor cute name tho want kamp first...,1.0,0


# Encode Text as Unigram

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
tweets = []
for row in range(0, len(data["tweet_text"])):
    tweet = data["tweet_text"][row]
    tweets.append(tweet)

In [8]:
len(tweets)

13005

In [9]:
unigram = CountVectorizer()

In [10]:
feature_matrix_unigram = unigram.fit_transform(tweets)

In [11]:
unigram_vocabulary = []
for ngram in unigram.get_feature_names():
    unigram_vocabulary.append(ngram)

In [12]:
len(unigram_vocabulary)

13382

In [13]:
feature_matrix_unigram

<13005x13382 sparse matrix of type '<class 'numpy.int64'>'
	with 96827 stored elements in Compressed Sparse Row format>

# Encode Text as Bigram

In [14]:
bigram = CountVectorizer(ngram_range = (1,2))

In [15]:
feature_matrix_bigram = bigram.fit_transform(tweets)

In [16]:
bigram_vocabulary = []
for ngram in bigram.get_feature_names():
    bigram_vocabulary.append(ngram)

In [17]:
len(bigram_vocabulary)

79813

In [18]:
feature_matrix_bigram

<13005x79813 sparse matrix of type '<class 'numpy.int64'>'
	with 183898 stored elements in Compressed Sparse Row format>

# Encode Text using TF-IDF

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
tfidf = TfidfVectorizer()

In [21]:
feature_matrix_tfidf = tfidf.fit_transform(tweets)

In [22]:
len(tfidf.vocabulary_)

13382

In [23]:
feature_matrix_tfidf

<13005x13382 sparse matrix of type '<class 'numpy.float64'>'
	with 96827 stored elements in Compressed Sparse Row format>

# Encode Text using Word Embeddings

In [24]:
from gensim.models import Word2Vec
import warnings
warnings.filterwarnings("ignore")

In [25]:
feature_text = data[["tweet_text"]]
processed_feature_text = feature_text.apply(lambda x: ','.join(x.astype(str)), axis=1)
clean_feature_text = pd.DataFrame({"text": processed_feature_text})
feature_frame = [row.split(',') for row in clean_feature_text['text']]

In [26]:
model = Word2Vec(feature_frame, min_count = 1, size = 50, workers = 3, window = 3, sg = 1)

In [27]:
feature_array = []
for i in range(0, len(feature_frame)):
    feature_array.append(model[feature_frame[i]][0])

In [28]:
feature_array_word2vec = np.stack(feature_array, axis=0)

# Comparison of NLP Feature Structures

### Unigram

In [29]:
feature_matrix_unigram

<13005x13382 sparse matrix of type '<class 'numpy.int64'>'
	with 96827 stored elements in Compressed Sparse Row format>

In [30]:
feature_array_unigram = feature_matrix_unigram.toarray()

In [31]:
feature_array_unigram[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [32]:
len(feature_array_unigram)

13005

In [33]:
len(feature_array_unigram[0])

13382

### Bigram

In [34]:
feature_matrix_bigram

<13005x79813 sparse matrix of type '<class 'numpy.int64'>'
	with 183898 stored elements in Compressed Sparse Row format>

In [35]:
feature_array_bigram = feature_matrix_bigram.toarray()

In [36]:
feature_array_bigram[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [37]:
len(feature_array_bigram)

13005

In [38]:
len(feature_array_bigram[0])

79813

### Tf-Idf

In [39]:
feature_matrix_tfidf

<13005x13382 sparse matrix of type '<class 'numpy.float64'>'
	with 96827 stored elements in Compressed Sparse Row format>

In [40]:
feature_array_tfidf = feature_matrix_tfidf.toarray()

In [41]:
feature_array_tfidf[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [42]:
len(feature_array_tfidf)

13005

In [43]:
len(feature_array_tfidf[0])

13382

### Word Embedding: Word2Vec

In [44]:
feature_array_word2vec[0]

array([ 0.00698747, -0.00989458,  0.00629319, -0.00305483, -0.00882803,
       -0.00981694, -0.00609471,  0.00091188, -0.00429309,  0.00350596,
       -0.00705958, -0.00544115, -0.00185358,  0.00993089, -0.00651131,
       -0.00834236, -0.00393341, -0.00191398, -0.0014376 ,  0.00206255,
       -0.00547027, -0.00668457,  0.00234277,  0.00264166, -0.00700368,
       -0.00044164,  0.00471694, -0.00593301, -0.00093206, -0.00882479,
        0.00597474, -0.00671967,  0.00854737,  0.00911713, -0.00578616,
       -0.00451044,  0.00608086,  0.00760688, -0.0044637 ,  0.00612483,
        0.00750832,  0.00528153, -0.00361242, -0.00058244, -0.00948951,
        0.00197906,  0.00538405,  0.00370634, -0.00084229, -0.00413783],
      dtype=float32)

In [45]:
len(feature_array_word2vec)

13005

In [46]:
len(feature_array_word2vec[0])

50

# Dimensionality Reduction for Sparse Matricies

Feature Reduction for the unigram, bigram, and tf-idf sparse matricies

In [47]:
from sklearn.decomposition import TruncatedSVD

In [48]:
def reduceDim(sparse, n):
    tsvd = TruncatedSVD(n_components = n)
    return tsvd.fit(sparse).transform(sparse)

In [49]:
reduced_unigram = reduceDim(feature_matrix_unigram, 200)

In [50]:
reduced_bigram = reduceDim(feature_matrix_bigram, 500)

In [51]:
reduced_tfidf = reduceDim(feature_matrix_tfidf, 200)

#### Comparison of Original Sparse Matricies to Reduced Matricies

In [52]:
feature_matrix_unigram

<13005x13382 sparse matrix of type '<class 'numpy.int64'>'
	with 96827 stored elements in Compressed Sparse Row format>

In [53]:
len(reduced_unigram), len(reduced_unigram[0])

(13005, 200)

In [54]:
feature_matrix_bigram

<13005x79813 sparse matrix of type '<class 'numpy.int64'>'
	with 183898 stored elements in Compressed Sparse Row format>

In [55]:
len(reduced_bigram), len(reduced_bigram[0])

(13005, 500)

In [56]:
feature_matrix_tfidf

<13005x13382 sparse matrix of type '<class 'numpy.float64'>'
	with 96827 stored elements in Compressed Sparse Row format>

In [57]:
len(reduced_tfidf), len(reduced_tfidf[0])

(13005, 200)

# Saving NumPy Arrays

Original Arrays:

    feature_array_unigram
    
    feature_array_bigram
    
    feature_array_tfidf
    
    feature_array_word2vec

Reduced Arrays:

    reduced_unigram
    
    reduced_bigram
    
    reduced_tfidf

In [58]:
original_arrays = [feature_array_unigram, feature_array_bigram, feature_array_tfidf, feature_array_word2vec]
reduced_arrays = [reduced_unigram, reduced_bigram, reduced_tfidf]

original_file_names = ["feature_array_unigram", "feature_array_bigram", "feature_array_tfidf", "feature_array_word2vec"]
reduced_file_names = ["reduced_unigram", "reduced_bigram", "reduced_tfidf"]

In [59]:
os.mkdir('nlp_data')
def saveFile(file, array):
    filename = 'nlp_data/' + file + ".npy"
    np.save(filename, array)

In [60]:
for i in range(0, 4):
    saveFile(original_file_names[i], original_arrays[i])
for i in range(0, 3):
    saveFile(reduced_file_names[i], reduced_arrays[i])