In [1]:
import numpy as np
import pandas as pd
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [2]:
def loadCSV(filename):
    """Read a CSV file into a DataFrame, adding the ``.csv`` extension when missing.

    Parameters
    ----------
    filename : str
        Path of the file to read, with or without the ``.csv`` extension.

    Returns
    -------
    pandas.DataFrame
        The file contents, decoded as ISO-8859-1.
    """
    file = filename
    # Only the file name itself decides whether the extension is present: a
    # parent directory whose name happens to contain ".csv" must not suppress it.
    if '.csv' not in os.path.basename(filename):
        file += '.csv'
    return pd.read_csv(file, encoding='ISO-8859-1')

In [3]:
# Base name of the input dataset; loadCSV appends ".csv" because the name has no extension.
csvFile = "processed_hate_speech"
data = loadCSV(csvFile)

In [4]:
data.head()

Unnamed: 0,tweet_text,confidence,tweet_class
0,warning penny boards will make you a faggot,0.6013,1
1,fuck dykes,0.7227,2
2,[@] [@] [@] [@] [@] at least i dont look like ...,0.5229,2
3,[@] [@] [@] is a fag jackie jealous neeeee,0.5184,2
4,[@] you heard me bitch but any way im back th ...,0.5185,1


In [5]:
data.tail()

Unnamed: 0,tweet_text,confidence,tweet_class
14504,im sorry did i offend your white supremacist a...,0.3418,0
14505,[@] caucasian euro aryan whatever really doesn...,0.6804,0
14506,[@] sir a patient named aryan khan village mee...,1.0,0
14507,[@] happy birthday bro have an happy year ahead,1.0,0
14508,[@] aryan kapoor is such a cute name tho d we...,1.0,0


# Removing Stop Words

In [6]:
# English stop words from NLTK, held in a set for fast membership tests.
# NOTE(review): stopwords.words() presumably needs the NLTK "stopwords" corpus to be
# installed locally (e.g. nltk.download('stopwords')) — confirm on a fresh environment.
stop_words = set(stopwords.words('english'))

In [7]:
# Informal abbreviations that are stripped from the tweets together with the stop words.
slang_words = [
    'u', 'ur', 'urs', 'urself', 'urselves',
    'r', 'y',
    'cus', 'cuz', 'bc',
    'w', 'thru', 'n',
]

In [8]:
# Contractions spelled without their apostrophe; stripped from the tweets like stop words.
contraction_words = [
    'id', 'ill', 'ive', 'im',
    'theyre', 'theyve', 'weve',
    'itll', 'thats', 'theres', 'lets',
    'cant', 'dont', 'didnt', 'arent', 'isnt', 'wont',
    'whos',
]

In [9]:
# Every word to drop, merged into a single set so each lookup is one hash probe
# instead of a set test followed by two list scans.
removal_words = stop_words.union(slang_words, contraction_words)

# Filter each tweet independently and assign the column once. Rewriting the column
# with Series.replace inside a per-row loop rescans every row on every iteration
# (quadratic in the number of tweets) and only works while the index labels are
# exactly 0..n-1.
data["tweet_text"] = data["tweet_text"].map(
    lambda tweet: " ".join(word for word in tweet.split() if word not in removal_words)
)

In [10]:
data.head()

Unnamed: 0,tweet_text,confidence,tweet_class
0,warning penny boards make faggot,0.6013,1
1,fuck dykes,0.7227,2
2,[@] [@] [@] [@] [@] least look like jefree sta...,0.5229,2
3,[@] [@] [@] fag jackie jealous neeeee,0.5184,2
4,[@] heard bitch way back th texas wtf talking ...,0.5185,1


In [11]:
data.tail()

Unnamed: 0,tweet_text,confidence,tweet_class
14504,sorry offend white supremacist aryan nation ne...,0.3418,0
14505,[@] caucasian euro aryan whatever really doesn...,0.6804,0
14506,[@] sir patient named aryan khan village meera...,1.0,0
14507,[@] happy birthday bro happy year ahead,1.0,0
14508,[@] aryan kapoor cute name tho want kamps firs...,1.0,0


# Stemming Words

In [12]:
# Single Porter stemmer instance, reused for every word in the stemming step.
ps = PorterStemmer()

In [13]:
# Stem every word of each tweet, one row at a time, and assign the column once.
# A column-wide Series.replace per row would rewrite *every* row holding the same
# text: a tweet that was already stemmed could then be stemmed a second time when a
# later row's unstemmed text happens to equal it (re-stemming a stem is not
# guaranteed to be a no-op). It would also rescan the whole column for each row.
data["tweet_text"] = data["tweet_text"].map(
    lambda tweet: " ".join(ps.stem(word) for word in tweet.split())
)

# Filter out Empty and Duplicate Tweets

In [14]:
len(data)

14509

In [15]:
# Drop rows whose text is now the empty string (presumably tweets made up entirely of
# removed words). Boolean filtering keeps the original index labels, so the index has
# gaps from here on.
data = data[data["tweet_text"] != '']
len(data)

14508

In [16]:
# Remove rows that repeat both the same text and the same class label, keeping the
# first occurrence (pandas' default); identical text under a different label is kept.
data = data.drop_duplicates(subset=['tweet_text', 'tweet_class'])
len(data)

13074

In [17]:
data.head()

Unnamed: 0,tweet_text,confidence,tweet_class
0,warn penni board make faggot,0.6013,1
1,fuck dyke,0.7227,2
2,[@] [@] [@] [@] [@] least look like jefre star...,0.5229,2
3,[@] [@] [@] fag jacki jealou neeeee,0.5184,2
4,[@] heard bitch way back th texa wtf talk bitc...,0.5185,1


In [18]:
data.tail()

Unnamed: 0,tweet_text,confidence,tweet_class
14504,sorri offend white supremacist aryan nation ne...,0.3418,0
14505,[@] caucasian euro aryan whatev realli doesnt ...,0.6804,0
14506,[@] sir patient name aryan khan villag meeranp...,1.0,0
14507,[@] happi birthday bro happi year ahead,1.0,0
14508,[@] aryan kapoor cute name tho want kamp first...,1.0,0


# Calculating Frequency of Unique Words by Class

In [19]:
def iterateFreq(dictionary, word):
    """Add one to the count stored for ``word`` in ``dictionary`` (modified in place).

    A word that is missing, or whose stored count is falsy (such as 0), ends up at 1.
    """
    current = dictionary.get(word)
    dictionary[word] = current + 1 if current else 1

In [20]:
def wordFrequencies(dataframe):
    """Count word occurrences over all tweets and separately per tweet class.

    Columns are read by position: the first column is the tweet text and the
    third column is the class label.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        One tweet per row; tweet text in column 0, class label in column 2.

    Returns
    -------
    tuple of dict
        ``(all_tweets, neutral, offensive, hate)``, each mapping a word to its
        count. Labels 0, 1 and 2 feed ``neutral``, ``offensive`` and ``hate``
        respectively; a row with any other label only counts toward
        ``all_tweets``.
    """
    all_tweets = {}
    neutral = {}
    offensive = {}
    hate = {}
    counts_by_class = {0: neutral, 1: offensive, 2: hate}

    # itertuples yields plain tuples, so row[0] / row[2] are genuinely positional.
    # Integer-indexing the label-indexed Series produced by iterrows depends on a
    # positional fallback that pandas has deprecated.
    for row in dataframe.itertuples(index=False):
        tweet = row[0]
        class_counts = counts_by_class.get(row[2])

        for word in tweet.split():
            if class_counts is not None:
                class_counts[word] = class_counts.get(word, 0) + 1
            all_tweets[word] = all_tweets.get(word, 0) + 1

    return all_tweets, neutral, offensive, hate

In [21]:
def dispWordFreq(dictionary):
    """Print one ``word: count`` line per entry, highest count first.

    Entries with equal counts are printed in descending order of the word itself.
    """
    ranked = sorted(
        dictionary.items(),
        key=lambda entry: (entry[1], entry[0]),
        reverse=True,
    )
    for word, count in ranked:
        print("%s: %d" % (word, count))

In [22]:
# Word counts over the cleaned tweets: overall first, then per class label 0, 1 and 2.
all_wf, neutral_wf, offensive_wf, hate_wf = wordFrequencies(data)

In [23]:
len(all_wf)

14019

In [24]:
len(neutral_wf)

10603

In [25]:
len(offensive_wf)

5902

In [26]:
len(hate_wf)

3549

# Save Altered CSV

In [27]:
# The output is written to the kernel's current working directory.
path = os.getcwd()

In [28]:
# The leading "/" acts as the separator: the name is joined to `path` by plain
# string concatenation.
new_csv = "/stemmed_hate_speech.csv"
filepath = path + new_csv

In [29]:
# Write the cleaned tweets with a header row and without the index column
# (index takes a bool; False states the intent explicitly).
# NOTE(review): the input was decoded as ISO-8859-1, while to_csv writes UTF-8 by
# default — confirm that downstream readers expect UTF-8.
data.to_csv(filepath, index=False, header=True)