In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw tweets file into a dataframe.
# header=None: the file has no header row, so columns get integer labels (0, 1, 2, ...).
raw_tweet_df = pd.read_csv(
    'HOT_dataset_modified.csv',
    header=None,
    index_col=None,
    engine='python',
)
raw_tweet_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,0.0,@saud5683 @Mutayyab420 @shivang598 @Ranask35 @...,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,
2,2.0,"Banti hai empowered woman, feminism pe gyan pe...",,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,
4,2.0,RT @kim_jong_korea: @updatingwait @Acutereply ...,,,,,,,,,,,,,,,,,


In [3]:
# (rows, columns) of the raw frame exactly as read from the CSV
raw_tweet_df.shape

(6379, 19)

In [4]:
# Keep only the first two columns (labels 0 and 1) of the raw data;
# the remaining columns are discarded.
tweet_df = raw_tweet_df.reindex(columns=[0, 1])

In [5]:
# Give the two columns meaningful names: 0 -> score (label), 1 -> text (tweet).
# index=str additionally converts the row labels to strings.
column_names = {0: 'score', 1: 'text'}
tweet_df = tweet_df.rename(index=str, columns=column_names)
tweet_df.head()

Unnamed: 0,score,text
0,0.0,@saud5683 @Mutayyab420 @shivang598 @Ranask35 @...
1,,
2,2.0,"Banti hai empowered woman, feminism pe gyan pe..."
3,,
4,2.0,RT @kim_jong_korea: @updatingwait @Acutereply ...


In [6]:
# Drop every row that contains a missing value, then renumber the rows 0..n-1
# (drop=True discards the old index instead of keeping it as a column).
tweet_df = tweet_df.dropna().reset_index(drop=True)
tweet_df.shape

(3189, 2)

In [7]:
# Remove the usernames from tweets
import re

# '@' followed by one or more word characters, i.e. a Twitter handle
remove_usr_pattern = r'@[\w]+'
tweet_df = tweet_df.replace(remove_usr_pattern, "", regex=True)
tweet_df.head()

Unnamed: 0,score,text
0,0.0,Haa jaise tum bhi abhi p\xe2\x80\xa6 h...
1,2.0,"Banti hai empowered woman, feminism pe gyan pe..."
2,2.0,RT : Ab usko chhod mjse bat kr tera baap aa ...
3,2.0,"Punjab in madarchodon ko Khila raha hai, no..."
4,2.0,RT : Agar koi bole ki ja ke chill maar to mada...


In [8]:
# Remove any links present in the tweets.
# The pattern matches http:// or https:// followed by letters, digits, '.' and '/'.
url_links_pattern = r'https?://[A-Za-z0-9./]+'
tweet_df = tweet_df.replace(url_links_pattern, "", regex=True)
tweet_df

Unnamed: 0,score,text
0,0.0,Haa jaise tum bhi abhi p\xe2\x80\xa6
1,2.0,"Banti hai empowered woman, feminism pe gyan pe..."
2,2.0,RT : Ab usko chhod mjse bat kr tera baap aa ...
3,2.0,"Punjab in madarchodon ko Khila raha hai, no..."
4,2.0,RT : Agar koi bole ki ja ke chill maar to mada...
...,...,...
3184,1.0,Ye dekha tera islamic kashmiri.jihaadi su...
3185,1.0,all love jihaadi should ne killed aise hi
3186,1.0,"Ye dekh madarjaat, rand ki aullad tera is..."
3187,1.0,Waise jihaadi kutte ki double maut aane ...


In [9]:
# Replace the literal two-character sequence backslash + 'n' in the data with a space.
literal_newline_pattern = r'\\n'
tweet_df = tweet_df.replace(literal_newline_pattern, ' ', regex=True)

In [10]:
# Smileys in the tweets are represented as escaped byte sequences such as \xe2\x80\xa6.
# Replace every backslash followed by word characters with a space to remove them.
escaped_sequence_pattern = r'\\[\w]+'
tweet_df = tweet_df.replace(escaped_sequence_pattern, ' ', regex=True)

In [11]:
# Remove numbers and punctuation: every character that is not an ASCII letter
# becomes a space, so only alphabetic characters remain in the tweet text.
non_letter_pattern = r'[^a-zA-Z]'
tweet_df = tweet_df.replace(non_letter_pattern, ' ', regex=True)


In [12]:
# Collapse every run of whitespace into a single space.
whitespace_run_pattern = r'\s+'
tweet_df = tweet_df.replace(whitespace_run_pattern, ' ', regex=True)

In [13]:
# Delete the rows whose text is exactly a single space ' ':
# the cleaning steps above removed everything else from those tweets.
remove_rows_index = tweet_df.index[tweet_df['text'] == ' '].tolist()
tweet_df.drop(index=remove_rows_index, inplace=True)

# Renumber the remaining rows 0..n-1
tweet_df.reset_index(drop=True, inplace=True)

tweet_df.shape
        

(3085, 2)

In [14]:
# Build the list of Hinglish stopwords: one entry per line of the file,
# with surrounding whitespace (including the newline) stripped.
file_path = 'stopwords_hinglish.txt'
with open(file_path, 'r') as f:
    stopwords_hinglish = [raw_line.strip() for raw_line in f]

# I also appended some words in the stopword list which I felt did not add any value to the text analysis

In [15]:
# Tokenize the tweets and remove stop words
from nltk.tokenize import WordPunctTokenizer

tok = WordPunctTokenizer()
tweets = tweet_df.text.copy()
score = tweet_df.score.copy()

# Membership tests against a list scan the whole list for every token;
# a set gives the same answer with a constant-time lookup.
stopword_set = set(stopwords_hinglish)

# Lower-case each tweet, tokenize it, drop the stopwords and keep the
# remaining tokens joined by single spaces in a new list (same order as `tweets`).
clean_tweets = []
for t in tweets:
    kept_tokens = [token for token in tok.tokenize(t.lower()) if token not in stopword_set]
    clean_tweets.append(" ".join(kept_tokens).strip())

In [16]:
# Create a new data frame holding the cleaned tweets next to their labels.
# `score` is attached by index alignment with the freshly built frame.
clean_tweets_df = pd.DataFrame(clean_tweets, columns=['text']).assign(score=score)
clean_tweets_df


Unnamed: 0,text,score
0,haa,0.0
1,banti empowered woman feminism gyan pelti din ...,2.0
2,usko chhod mjse bat baap gya ldki beech madarchod,2.0
3,punjab madarchodon khila nokrian day imran,2.0
4,chill maar madarchod gand maar lene,2.0
...,...,...
3080,islamic kashmiri jihaadi suar jisk,1.0
3081,love jihaadi killed,1.0
3082,madarjaat rand aullad islamic jihaadi,1.0
3083,jihaadi kutte double maut aane,1.0


In [17]:
# Convert the score (label) of each tweet from float to 64-bit integer
clean_tweets_df['score'] = clean_tweets_df['score'].astype(np.int64)


In [18]:
# Class distribution of the dataset: number of tweets per score value
tweets_by_score = clean_tweets_df.groupby("score", as_index=False)
classCountDf = tweets_by_score["text"].count()
classCountDf

Unnamed: 0,score,text
0,0,1019
1,1,303
2,2,1763


In [19]:
# Count the rows whose text is an empty string
# (tweets that consisted only of stopwords end up empty).
is_empty_text = clean_tweets_df['text'] == ""
cnt = int(is_empty_text.sum())
print(cnt)
        

82


In [20]:
# Drop the rows with empty text from the dataframe and renumber the rest 0..n-1
empty_rows_index = clean_tweets_df.index[clean_tweets_df['text'] == ""].tolist()
clean_tweets_df.drop(index=empty_rows_index, inplace=True)
clean_tweets_df.reset_index(drop=True, inplace=True)

In [21]:
# (rows, columns) after the empty-text rows have been dropped
clean_tweets_df.shape

(3003, 2)

In [22]:
# Class distribution of tweets (0 - Benign, 1 - Hate inducing, 2 - Abusive)
tweets_by_score = clean_tweets_df.groupby("score", as_index=False)
classCountDf = tweets_by_score["text"].count()
classCountDf
        

Unnamed: 0,score,text
0,0,940
1,1,302
2,2,1761


In [23]:
# writing this file to save in the local folder
# NOTE: this snapshot is written before the repeated-character normalization
# and the translation steps further down, so it holds the un-normalized text.
# index = False keeps the row index out of the file.
clean_tweets_df.to_csv('tweets_dataset.csv', index = False)

In [24]:
# Normalizing the spellings for some tokens:
# words like aaaaarrrrrrrggggghhhh which have multiple repetitions
# have every run of the same character reduced to exactly 2 characters.
#
# The result is assigned back to the column instead of calling
# `clean_tweets_df.text.replace(..., inplace=True)`: an in-place method on a
# column selected from the frame is a chained assignment, which warns on
# pandas 2.x and leaves the frame unchanged once Copy-on-Write is enabled.
clean_tweets_df['text'] = clean_tweets_df['text'].replace(r'(.)\1+', r'\1\1', regex=True)
clean_tweets_df

Unnamed: 0,text,score
0,haa,0
1,banti empowered woman feminism gyan pelti din ...,2
2,usko chhod mjse bat baap gya ldki beech madarchod,2
3,punjab madarchodon khila nokrian day imran,2
4,chill maar madarchod gand maar lene,2
...,...,...
2998,islamic kashmiri jihaadi suar jisk,1
2999,love jihaadi killed,1
3000,madarjaat rand aullad islamic jihaadi,1
3001,jihaadi kutte double maut aane,1


In [25]:
# To check the counts of unique words in the tweets
import nltk
from collections import Counter
from nltk.tokenize import RegexpTokenizer

# Raw string: in a normal string literal "\w" is an invalid escape sequence
# (DeprecationWarning, SyntaxWarning from Python 3.12). The regex is unchanged:
# a word optionally containing one inner apostrophe, or a plain word.
tokenizer = RegexpTokenizer(r"\w+'?\w+|\w+")

# Collect every token of every tweet; iterating the column directly avoids
# the label-based chained lookup clean_tweets_df['text'][i].
words = []
for sentence in clean_tweets_df['text']:
    words.extend(tokenizer.tokenize(sentence))

print(' total words = {}'.format(len(words)))
words_unique = list(set(words))
print(len(words_unique))


# Frequency of each token over the whole corpus
counts = Counter(words)
print(counts)




 total words = 18840
6950


In [26]:
# Separate the Hinglish words from the English words.
# A word whose lemma is not found in the NLTK English `words` corpus is put in
# the Hinglish list, to be used for transliteration.
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('words')

lemmatizer = WordNetLemmatizer()
all_eng_words = set(nltk.corpus.words.words())

english_words = []
hinglish_words = []
for candidate in words_unique:
    is_english = lemmatizer.lemmatize(candidate) in all_eng_words
    # route the original (un-lemmatized) word into the matching list
    (english_words if is_english else hinglish_words).append(candidate)



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [27]:
# number of unique tokens classified as English (lemma found in the NLTK words corpus)
print(len(english_words))

# number of unique tokens treated as Hinglish (everything else)
print(len(hinglish_words))

2554
4396


In [28]:
# Load the list of abusive Hinglish words together with their English translation.
# header=None: the file has no header row, so columns get integer labels.
profanity_df = pd.read_csv(
    'Hinglish_Profanity_List.csv',
    header=None,
    index_col=None,
    engine='python',
)
profanity_df

Unnamed: 0,0,1,2
0,badir,idiot,1
1,badirchand,idiot,1
2,bakland,idiot,1
3,bhadva,pimp,2
4,bhootnika,son of a witch,3
...,...,...,...
229,jihadi,terrorist,4
230,atankvadi,terrorist,4
231,atankwadi,terrorist,4
232,aatanki,terorist,4


In [29]:
# Keep only the Hinglish word and its English translation:
# drop column 2, name the remaining columns, and convert the row labels to strings.
profanity_df = (
    profanity_df
    .drop(columns=[2])
    .rename(index=str, columns={0: 'hinglish', 1: 'english'})
)
profanity_df.head()

Unnamed: 0,hinglish,english
0,badir,idiot
1,badirchand,idiot
2,bakland,idiot
3,bhadva,pimp
4,bhootnika,son of a witch


In [30]:
# Build a lookup from each abusive Hinglish word to its English translation.
# Rows are paired in frame order; a repeated Hinglish word keeps its last translation.
profanity_dict = dict(zip(profanity_df['hinglish'], profanity_df['english']))

# Snapshot of the known profanity words (taken before any spelling variants are added)
hinglish_profanity_list = list(profanity_dict)


In [31]:
# Importing the json file which has more Hinglish words and their English translation
import json


file_path = "dictionary_hinglish_transliteration.json"
# Decode explicitly as UTF-8 (the standard JSON encoding) rather than relying on
# the platform default, which differs between operating systems.
# The dictionary is deliberately NOT pre-initialised to {}: if the load fails,
# later cells should stop on a missing name instead of silently translating
# with an empty dictionary.
with open(file_path, encoding="utf-8") as f:
    hinglish_translit_dict = json.load(f)

hinglish_translit_list = list(hinglish_translit_dict.keys())

In [32]:
# For every Hinglish word seen in the tweets, look up its closest match in the profanity list
# and, if one is found, register the word with that match's translation.
# Fuzzy matching (similarity cutoff 0.9, best match only) instead of exact matching
# covers some of the spelling variations of abuse words.
import difflib

for candidate in hinglish_words:
    nearest = difflib.get_close_matches(candidate, hinglish_profanity_list, n=1, cutoff=0.9)
    if nearest:
        profanity_dict[candidate] = profanity_dict[nearest[0]]

print(profanity_dict)

{'badir': 'idiot', 'badirchand': 'idiot', 'bakland': 'idiot', 'bhadva': 'pimp', 'bhootnika': 'son of a witch', 'chinaal': 'prostitute', 'chup': 'shut up', 'chutia': 'fucker ', 'ghasti': 'hooker', 'chutiya': 'fucker', 'haraami': 'bastard', 'haraam': 'bastard', 'hijra': 'transsexual ', 'hinjda': 'transsexual ', 'jaanvar': 'animal', 'kutta': 'dog', 'kutiya': 'bitch', 'khota': 'donkey', 'auladheen': 'sonless', 'jaat': 'breed', 'najayaz': 'illegitimate', 'gandpaidaish': 'badborn', 'saala': 'sucker', 'kutti': 'bitch', 'soover': 'swine', 'tatti': 'shit', 'potty': 'shit', 'behnchodon': 'sister fuckers', 'behnchod': 'sister fucker', 'behenchod': 'sister fucker', 'behenchodd': 'sister fucker', 'bahenchod': 'sister fucker', 'bahanchod': 'sister fucker', 'bahencho': 'sister fucker', 'bancho': 'sister fucker', 'sali': 'bitch', 'bahenke': 'sister ', 'laude': 'dick', 'takke': 'balls', 'betichod': 'daughter fucker', 'bhaichod': 'brother fucker', 'bhains': 'buffalo', 'jhalla': 'faggot', 'jhant': 'pubic

In [33]:
# Go through each word of every tweet and replace it with its translation:
#   1. if the word is in the profanity dictionary, use that translation;
#   2. otherwise, if it is in the Hinglish transliteration dictionary, use that one;
#   3. otherwise keep the word unchanged.
# Loop-local names are used on purpose so that the corpus-wide token list `words`
# built in the word-count cell is not overwritten by the last tweet's tokens.
all_modified_sentences = []
for sentence in clean_tweets_df['text']:
    translated_tokens = []
    for token in tokenizer.tokenize(sentence):
        if token in profanity_dict:
            translated_tokens.append(profanity_dict[token])
        elif token in hinglish_translit_dict:
            translated_tokens.append(hinglish_translit_dict[token])
        else:
            translated_tokens.append(token)
    all_modified_sentences.append(" ".join(translated_tokens))

# One translated sentence per row, in the same order as the frame
clean_tweets_df['new_text'] = all_modified_sentences
clean_tweets_df

Unnamed: 0,text,score,new_text
0,haa,0,ha
1,banti empowered woman feminism gyan pelti din ...,2,banti empowered adult female feminism gyan pel...
2,usko chhod mjse bat baap gya ldki beech madarchod,2,usko chhod mjse words father gya ldki midway m...
3,punjab madarchodon khila nokrian day imran,2,punjab mother fucker khila nokrian day imran
4,chill maar madarchod gand maar lene,2,chill kill motherfucker ass kill lene
...,...,...,...
2998,islamic kashmiri jihaadi suar jisk,1,islamic kashmiri muslim terrorist pig jisk
2999,love jihaadi killed,1,love muslim terrorist killed
3000,madarjaat rand aullad islamic jihaadi,1,mother fucker whore aullad islamic muslim ter...
3001,jihaadi kutte double maut aane,1,muslim terrorist dog double end aane


In [35]:
# Swap the column names so that the translated text is the "text" column used for analysis,
# while the untranslated text is kept as "old_text".
renamed_columns = {"text": "old_text", "new_text": "text"}
clean_tweets_df = clean_tweets_df.rename(columns=renamed_columns)
clean_tweets_df

Unnamed: 0,old_text,score,text
0,haa,0,ha
1,banti empowered woman feminism gyan pelti din ...,2,banti empowered adult female feminism gyan pel...
2,usko chhod mjse bat baap gya ldki beech madarchod,2,usko chhod mjse words father gya ldki midway m...
3,punjab madarchodon khila nokrian day imran,2,punjab mother fucker khila nokrian day imran
4,chill maar madarchod gand maar lene,2,chill kill motherfucker ass kill lene
...,...,...,...
2998,islamic kashmiri jihaadi suar jisk,1,islamic kashmiri muslim terrorist pig jisk
2999,love jihaadi killed,1,love muslim terrorist killed
3000,madarjaat rand aullad islamic jihaadi,1,mother fucker whore aullad islamic muslim ter...
3001,jihaadi kutte double maut aane,1,muslim terrorist dog double end aane


In [36]:
# saving the results into a csv file (currently disabled: uncomment the line below to write the file)
# clean_tweets_df.to_csv('HOT_preprocessed_data.csv',index = None)