In [1]:
import os
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Import Datasets

In [2]:
def loadCSV(filename):
    file = filename
    if '.csv' not in filename:
        file += '.csv'
    data = pd.read_csv(file, encoding = 'ISO-8859-1')
    return data

In [3]:
data_1 = loadCSV('tweetset_1')
data_2 = loadCSV('tweetset_2')

In [4]:
data_1.head()

Unnamed: 0,does_this_tweet_contain_hate_speech,does_this_tweet_contain_hate_speech:confidence,tweet_text
0,The tweet uses offensive language but not hate...,0.6013,Warning: penny boards will make you a faggot
1,The tweet contains hate speech,0.7227,Fuck dykes
2,The tweet contains hate speech,0.5229,@sizzurp__ @ILIKECATS74 @yoPapi_chulo @brandon...
3,The tweet contains hate speech,0.5184,"""@jayswaggkillah: ""@JacklynAnnn: @jayswaggkill..."
4,The tweet uses offensive language but not hate...,0.5185,@Zhugstubble You heard me bitch but any way I'...


In [5]:
data_2.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


# Size of Datasets

In [6]:
originalClass = ['The tweet is not offensive',
                 'The tweet uses offensive language but not hate speech',
                 'The tweet contains hate speech']

newClass = [0, 1, 2]

data_1['does_this_tweet_contain_hate_speech'] = data_1['does_this_tweet_contain_hate_speech'].replace(originalClass, newClass)

In [7]:
print('Size of Dataset 1: %s' % len(data_1['does_this_tweet_contain_hate_speech']))
print('Size of Dataset 2: %s' % len(data_2['class']))

Size of Dataset 1: 14509
Size of Dataset 2: 24783


In [8]:
print("Neutral Tweets in Dataset 1: %s" % len(data_1[data_1['does_this_tweet_contain_hate_speech'] == 0])) 
print("Neutral Tweets in Dataset 2: %s" % len(data_2[data_2['class'] == 2]))

Neutral Tweets in Dataset 1: 7274
Neutral Tweets in Dataset 2: 4163


In [9]:
print("Offensive Tweets in Dataset 1: %s" % len(data_1[data_1['does_this_tweet_contain_hate_speech'] == 1]))
print("Offensive Tweets in Dataset 2: %s" % len(data_2[data_2['class'] == 1]))

Offensive Tweets in Dataset 1: 4836
Offensive Tweets in Dataset 2: 19190


In [10]:
print("Hate Tweets in Dataset 1: %s" % len(data_1[data_1['does_this_tweet_contain_hate_speech'] == 2]))
print("Hate Tweets in Dataset 2: %s" % len(data_2[data_2['class'] == 0]))

Hate Tweets in Dataset 1: 2399
Hate Tweets in Dataset 2: 1430


In [11]:
print("Hate/Offensive Tweets in Dataset 1: %s" % len(data_1[data_1['does_this_tweet_contain_hate_speech'] != 0]))
print("Hate/Offensive Tweets in Dataset 2: %s" % len(data_2[data_2['class'] != 2]))

Hate/Offensive Tweets in Dataset 1: 7235
Hate/Offensive Tweets in Dataset 2: 20620


# Clean up 1st Twitter Dataset

In [12]:
data_1 = data_1.rename(columns = {'does_this_tweet_contain_hate_speech':'class',
                                  'does_this_tweet_contain_hate_speech:confidence':'confidence',
                                  'tweet_text':'tweet'})

In [13]:
originalClass = [0, 1, 2]
newClass = [0, 1, 1]

In [14]:
data_1['class'] = data_1['class'].replace(originalClass, newClass)

In [15]:
data_1 = data_1[['tweet', 'class']]

In [16]:
data_1.head()

Unnamed: 0,tweet,class
0,Warning: penny boards will make you a faggot,1
1,Fuck dykes,1
2,@sizzurp__ @ILIKECATS74 @yoPapi_chulo @brandon...,1
3,"""@jayswaggkillah: ""@JacklynAnnn: @jayswaggkill...",1
4,@Zhugstubble You heard me bitch but any way I'...,1


# Clean up 2nd Twitter Dataset

In [17]:
originalClass = [0, 1, 2]
newClass = [1, 1, 0]

In [18]:
data_2['class'] = data_2['class'].replace(originalClass, newClass)

In [19]:
data_2 = data_2[['tweet', 'class']]

In [20]:
data_2.head()

Unnamed: 0,tweet,class
0,!!! RT @mayasolovely: As a woman you shouldn't...,0
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1


# Define Text Processing Functions

In [21]:
validCharacters = 'abcdefghijklmnopqrstuvwxyz@'
stop_words = set(stopwords.words('english'))
slang = ['rt', 'u','ur','urs','urself','urselves','r','y','cus','cuz','bc','w','thru','n',]
contractions = ['id','ill','ive','im','theyre','theyve','weve','itll','thats','theres',
                'lets','cant','dont','didnt','arent','isnt','wont','whos']
ps = PorterStemmer()

In [22]:
def makeLowercase(data):
    data['tweet'] = data['tweet'].apply(lambda x: x.lower())

In [23]:
def makeValid(word):
    return ''.join(filter(lambda x: x in validCharacters, word))

In [24]:
def is_stopword(word):
    return word in stop_words or word in slang or word in contractions

In [25]:
def stemTweet(word):
    return ps.stem(word)

In [26]:
def standardizeTweets(data):    
    for row in range(0, len(data['tweet'])):
        
        original_tweet = data['tweet'][row]
        words = original_tweet.split()
        filtered = []
        
        for i in range(0, len(words)):
            if '@' in words[i]:
                filtered.append('[@]')
            elif 'http' in words[i]:
                filtered.append('[LINK]')
            else:
                validWord = makeValid(words[i])
                if not is_stopword(validWord):
                    filtered.append(stemTweet(validWord))
        
        filtered_tweet = ' '.join(filtered)
        
        data['tweet'] = data['tweet'].replace(original_tweet, filtered_tweet)

In [27]:
def processDatasets(data):
    makeLowercase(data)
    standardizeTweets(data)

# Process Twitter Datasets

In [28]:
processDatasets(data_1)

In [29]:
processDatasets(data_2)

# Remove Empty/Duplicate Tweets in 1st Twitter Dataset

In [30]:
original = len(data_1)

In [31]:
data_1 = data_1[data_1['tweet'] != '']
empty_removed = len(data_1)

In [32]:
data_1 = data_1.drop_duplicates(subset=['tweet', 'class'])
duplicates_removed = len(data_1)

In [33]:
print("Size of 1st Dataset: %s" % original)
print("Size of 1st Dataset with Empty Tweets Removed: %s" % empty_removed)
print("Size of 1st Dataset with Duplicate Tweets Removed: %s" % duplicates_removed)

Size of 1st Dataset: 14509
Size of 1st Dataset with Empty Tweets Removed: 14508
Size of 1st Dataset with Duplicate Tweets Removed: 12830


# Remove Empty/Duplicate Tweets in 2nd Twitter Dataset

In [34]:
original = len(data_2)

In [35]:
data_2 = data_2[data_2['tweet'] != '']
empty_removed = len(data_2)

In [36]:
data_2 = data_2.drop_duplicates(subset=['tweet', 'class'])
duplicates_removed = len(data_2)

In [37]:
print("Size of 2nd Dataset: %s" % original)
print("Size of 2nd Dataset with Empty Tweets Removed: %s" % empty_removed)
print("Size of 2nd Dataset with Duplicate Tweets Removed: %s" % duplicates_removed)

Size of 2nd Dataset: 24783
Size of 2nd Dataset with Empty Tweets Removed: 24783
Size of 2nd Dataset with Duplicate Tweets Removed: 24185


# 1st Twitter Dataset - Processed

In [38]:
data_1.head()

Unnamed: 0,tweet,class
0,warn penni board make faggot,1
1,fuck dyke,1
2,[@] [@] [@] [@] [@] least look like jefre star...,1
3,[@] [@] [@] fag jacki jealou neeeee,1
4,[@] heard bitch way back th texa wtf talk bitc...,1


In [39]:
data_1.tail()

Unnamed: 0,tweet,class
14504,sorri offend white supremacist aryan nation ne...,0
14505,[@] caucasian euro aryan whatev realli doesnt ...,0
14506,[@] sir patient name aryan khan villag meeranp...,0
14507,[@] happi birthday bro happi year ahead,0
14508,[@] aryan kapoor cute name tho want kamp firs...,0


# 2nd Twitter Dataset - Processed

In [40]:
data_2.head()

Unnamed: 0,tweet,class
0,[@] woman shouldnt complain clean hous amp ma...,0
1,[@] boy dat coldtyga dwn bad cuffin dat hoe s...,1
2,[@] dawg [@] ever fuck bitch start cri confus...,1
3,[@] [@] look like tranni,1
4,[@] shit hear might true might faker bitch to...,1


In [41]:
data_2.tail()

Unnamed: 0,tweet,class
24778,you muthafin lie [@] [@] [@] right tl trash m...,1
24779,youv gone broke wrong heart babi drove redneck...,0
24780,young buck wanna eat dat nigguh like aint fuck...,1
24781,youu got wild bitch tellin lie,1
24782,ruffl ntac eileen dahlia beauti color combin...,0


# Size of Processed Datasets

In [42]:
print('Size of Dataset 1: %s' % len(data_1['class']))
print('Size of Dataset 2: %s' % len(data_2['class']))

Size of Dataset 1: 12830
Size of Dataset 2: 24185


In [43]:
print("Neutral Tweets in Dataset 1: %s" % len(data_1[data_1['class'] == 0])) 
print("Neutral Tweets in Dataset 1: %s" % len(data_2[data_2['class'] == 0]))

Neutral Tweets in Dataset 1: 6691
Neutral Tweets in Dataset 1: 4092


In [44]:
print("Hate/Offensive Tweets in Dataset 1: %s" % len(data_1[data_1['class'] == 1]))
print("Hate/Offensive Tweets in Dataset 1: %s" % len(data_2[data_2['class'] == 1]))

Hate/Offensive Tweets in Dataset 1: 6139
Hate/Offensive Tweets in Dataset 1: 20093


# Save Processed Datasets

In [45]:
path = os.getcwd()

In [46]:
d1 = path + '/processed_1.csv'
d2 = path + '/processed_2.csv'

In [47]:
data_1.to_csv(d1, index = None, header = True)
data_2.to_csv(d2, index = None, header = True)