In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
def loadCSV(filename):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    filename : str or os.PathLike
        Path to the file. ``.csv`` is appended when the final path component
        has no ``csv`` extension, so ``'raw_data'`` and ``'raw_data.csv'``
        load the same file.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents, decoded as ISO-8859-1.
    """
    file = os.fspath(filename)
    # Inspect only the final path component, case-insensitively, so that a
    # directory such as 'exports.csv_files/' no longer suppresses the suffix.
    # Compound extensions such as 'name.csv.gz' are still left untouched.
    extensions = os.path.basename(file).lower().split('.')[1:]
    if 'csv' not in extensions:
        file += '.csv'
    return pd.read_csv(file, encoding='ISO-8859-1')

In [3]:
# Base name of the raw input file; loadCSV appends the '.csv' extension.
csvFile = "raw_data_hate_speech"

In [4]:
# Read the raw CSV named by csvFile and display it (pandas truncates the
# rendered table to the first and last rows).
data = loadCSV(filename=csvFile)
data

Unnamed: 0,does_this_tweet_contain_hate_speech,does_this_tweet_contain_hate_speech:confidence,tweet_text
0,The tweet uses offensive language but not hate...,0.6013,Warning: penny boards will make you a faggot
1,The tweet contains hate speech,0.7227,Fuck dykes
2,The tweet contains hate speech,0.5229,@sizzurp__ @ILIKECATS74 @yoPapi_chulo @brandon...
3,The tweet contains hate speech,0.5184,"""@jayswaggkillah: ""@JacklynAnnn: @jayswaggkill..."
4,The tweet uses offensive language but not hate...,0.5185,@Zhugstubble You heard me bitch but any way I'...
...,...,...,...
14504,The tweet is not offensive,0.3418,I'm sorry. Did I offend your white supremacist...
14505,The tweet is not offensive,0.6804,@tradethecycles Caucasian euro aryan whatever....
14506,The tweet is not offensive,1.0000,@yadavakhilesh sir a patient named aryan khan ...
14507,The tweet is not offensive,1.0000,@Iamshivachari Happy birthday bro _Ì«ÌÐ_ \nHa...


# Check for Null Values

Determine if there are any missing or null values in each column.

In [5]:
# Rows whose label is missing; an empty result means the column has no nulls.
data.loc[data["does_this_tweet_contain_hate_speech"].isna()]

Unnamed: 0,does_this_tweet_contain_hate_speech,does_this_tweet_contain_hate_speech:confidence,tweet_text


In [6]:
# Rows whose confidence score is missing; an empty result means no nulls.
data.loc[data["does_this_tweet_contain_hate_speech:confidence"].isna()]

Unnamed: 0,does_this_tweet_contain_hate_speech,does_this_tweet_contain_hate_speech:confidence,tweet_text


In [7]:
# Rows whose tweet text is missing; an empty result means no nulls.
data.loc[data["tweet_text"].isna()]

Unnamed: 0,does_this_tweet_contain_hate_speech,does_this_tweet_contain_hate_speech:confidence,tweet_text


Since each of the three filtered tables above is empty, the provided hate speech CSV has no missing or null values in any column.

# Raw CSV Changes

#### Change Column Names

does_this_tweet_contain_hate_speech              ===> tweet_class

does_this_tweet_contain_hate_speech:confidence   ===> confidence

tweet_text                                       ===> tweet_text (unchanged)

In [8]:
# Shorten the two verbose label columns; 'tweet_text' keeps its name.
data = data.rename(
    {
        'does_this_tweet_contain_hate_speech': 'tweet_class',
        'does_this_tweet_contain_hate_speech:confidence': 'confidence',
    },
    axis='columns',
)
data.head()

Unnamed: 0,tweet_class,confidence,tweet_text
0,The tweet uses offensive language but not hate...,0.6013,Warning: penny boards will make you a faggot
1,The tweet contains hate speech,0.7227,Fuck dykes
2,The tweet contains hate speech,0.5229,@sizzurp__ @ILIKECATS74 @yoPapi_chulo @brandon...
3,The tweet contains hate speech,0.5184,"""@jayswaggkillah: ""@JacklynAnnn: @jayswaggkill..."
4,The tweet uses offensive language but not hate...,0.5185,@Zhugstubble You heard me bitch but any way I'...


#### Change 'tweet_class' to Numerical Values

0 = The tweet is not offensive

1 = The tweet uses offensive language but not hate speech

2 = The tweet contains hate speech

In [9]:
# Label strings as they appear in the 'tweet_class' column, ordered so that
# the string at position i is encoded as the integer i.
originalText = [
    "The tweet is not offensive",
    "The tweet uses offensive language but not hate speech",
    "The tweet contains hate speech",
]

# One integer code per label string, in the same order: [0, 1, 2].
numericalClass = list(range(len(originalText)))

In [10]:
# Translate each label string into its integer code. Values that are already
# codes pass through unchanged, so re-running this cell is harmless.
label_to_code = dict(zip(originalText, numericalClass))
encoded_class = data['tweet_class'].map(lambda label: label_to_code.get(label, label))

# Fail loudly on any label missing from originalText instead of leaving raw
# strings mixed into the column, which would silently skew the class counts.
unknown_labels = encoded_class[~encoded_class.isin(numericalClass)].unique()
if len(unknown_labels) > 0:
    raise ValueError('Unmapped tweet_class values: {}'.format(list(unknown_labels)))

# Cast explicitly rather than relying on replace()'s implicit downcast of an
# object column to integers, which newer pandas versions deprecate.
data['tweet_class'] = encoded_class.astype('int64')

In [11]:
# Preview the first five rows to confirm 'tweet_class' now holds integer codes.
data.head(n=5)

Unnamed: 0,tweet_class,confidence,tweet_text
0,1,0.6013,Warning: penny boards will make you a faggot
1,2,0.7227,Fuck dykes
2,2,0.5229,@sizzurp__ @ILIKECATS74 @yoPapi_chulo @brandon...
3,2,0.5184,"""@jayswaggkillah: ""@JacklynAnnn: @jayswaggkill..."
4,1,0.5185,@Zhugstubble You heard me bitch but any way I'...


# Determine Data Size

In [12]:
# Total number of rows, followed by one count per class code
# (0 = not offensive, 1 = offensive language, 2 = hate speech).
all_tweets = len(data.index)
neutral_tweets = int(data['tweet_class'].eq(0).sum())
offensive_tweets = int(data['tweet_class'].eq(1).sum())
hate_tweets = int(data['tweet_class'].eq(2).sum())

In [13]:
# Display the counts as (total, not offensive, offensive, hate speech).
(all_tweets, neutral_tweets, offensive_tweets, hate_tweets)

(14509, 7274, 4836, 2399)

# Save Altered CSV

In [14]:
# Current working directory of the kernel; the altered CSV is saved here.
path = os.getcwd()

In [15]:
# The leading slash in the file name acts as the separator when it is
# appended to the directory held in `path`.
new_csv_filename = "/altered_hate_speech.csv"
filepath = "".join([path, new_csv_filename])

In [16]:
# Reorder the columns so the tweet text comes first and the class label last.
data = data.loc[:, ['tweet_text', 'confidence', 'tweet_class']]
data.head()

Unnamed: 0,tweet_text,confidence,tweet_class
0,Warning: penny boards will make you a faggot,0.6013,1
1,Fuck dykes,0.7227,2
2,@sizzurp__ @ILIKECATS74 @yoPapi_chulo @brandon...,0.5229,2
3,"""@jayswaggkillah: ""@JacklynAnnn: @jayswaggkill...",0.5184,2
4,@Zhugstubble You heard me bitch but any way I'...,0.5185,1


In [17]:
# Write the cleaned frame with a header row and without the row index.
# `index` is documented as a bool, so pass False rather than relying on None
# being treated as falsy.
data.to_csv(filepath, index=False, header=True)