In [None]:
import pandas as pd
import numpy as np
from nltk.tokenize import WordPunctTokenizer

In [None]:
# Load the raw dataset. header=None means no row of the file is used for
# column names, so the columns are labelled 0, 1, ...
raw_tweet_df = pd.read_csv(
    'HOT_dataset_modified.csv',
    header=None,
    index_col=None,
    engine='python',
)
raw_tweet_df.head()

In [None]:
# (rows, columns) of the data exactly as loaded, before any cleaning.
raw_tweet_df.shape

In [None]:
# Keep only the columns labelled 0 and 1 of the raw frame.
tweet_df = raw_tweet_df.reindex(columns=[0, 1])

In [None]:
# Give the two columns descriptive names; index=str also converts every row
# label to its string form.
column_names = {0: 'score', 1: 'text'}
tweet_df = tweet_df.rename(index=str, columns=column_names)
tweet_df.head()

In [None]:
# Discard every row that has a missing value in any column.
tweet_df = tweet_df.dropna()
# Only the last expression of a cell is rendered, so the shape has to be
# printed explicitly to be visible.
print(tweet_df.shape)
tweet_df

In [None]:
# Remove the usernames from tweets: an '@' followed by one or more word
# characters is replaced with the empty string in every string cell.
import re

remove_usr_pattern = r'@[\w]+'
tweet_df = tweet_df.replace(to_replace=remove_usr_pattern, value="", regex=True)
tweet_df.head()

In [None]:
# Remove any links present in the tweets.
# The pattern matches 'http://' or 'https://' followed by letters, digits,
# '.' and '/'; the match stops at the first character outside that set.

url_links_pattern = r'https?://[A-Za-z0-9./]+'
tweet_df = tweet_df.replace(to_replace=url_links_pattern, value="", regex=True)
tweet_df

In [None]:
# Replace the two-character sequence backslash + 'n' (a newline that is
# stored in the data as literal text) with a space.

tweet_df = tweet_df.replace(to_replace=r'\\n', value=' ', regex=True)

In [None]:
# Smileys in the tweets are represented in a format like \x6\xf.....
# Replace each such representation with a space. The pattern matches a
# backslash followed by one or more word characters, so any letters or digits
# written directly after the escape are removed together with it.

tweet_df = tweet_df.replace(to_replace=r'\\[\w]+', value=' ', regex=True)

In [None]:
# Remove numbers and punctuation: every character that is not an ASCII letter
# becomes a space, so only alphabetic characters remain in the tweet text.
# The substitution is applied to the 'text' column only. Run against the whole
# frame it would also blank the digits of any string-typed value in 'score'.

tweet_df['text'] = tweet_df['text'].replace(to_replace=r'[^a-zA-Z]', value=' ', regex=True)
tweet_df

In [None]:
# Collapse every run of whitespace characters into a single space.

tweet_df = tweet_df.replace(to_replace=r'\s+', value=' ', regex=True)

In [None]:
# Delete the rows whose text is blank (empty or whitespace only): the data
# cleaning steps removed everything from those tweets.
# Rows are picked with a boolean mask and dropped by index *label*. The index
# holds labels, not 0..n-1 positions (rows were dropped earlier and the labels
# were converted to strings), so using collected labels as positions in
# tweet_df.index[...] raises IndexError or selects the wrong rows.
blank_text_mask = (tweet_df['text'].astype(str).str.strip() == '').to_numpy()
remove_rows_index = tweet_df.index[blank_text_mask].tolist()

tweet_df.drop(index=remove_rows_index, inplace=True)

tweet_df
        

In [None]:
# Replace the index with a fresh 0..n-1 range; the old labels are discarded.
tweet_df = tweet_df.reset_index(drop=True)
tweet_df

In [None]:
# Build the list of Hinglish stopwords: one entry per line of the file, with
# surrounding whitespace (including the trailing newline) stripped.
file_path = 'data/stopwords_hinglish.txt'
with open(file_path, 'r') as stopword_file:
    stopwords_hinglish = [entry.strip() for entry in stopword_file]

# I also appended some words in the stopword list which I felt did not add any value to the text analysis

In [None]:
# Tokenize the tweets and remove stop words.
# Each tweet is lower-cased, split into tokens, filtered against the stop-word
# list and re-joined with single spaces. A tweet made up only of stop words
# becomes the empty string.

tok = WordPunctTokenizer()
tweets = tweet_df.text.copy()
score = tweet_df.score.copy()

# Membership tests against a set are O(1); the original list is left untouched.
stopword_lookup = set(stopwords_hinglish)

word_count = {}  # not populated in this cell

clean_tweets = [
    " ".join(
        token
        for token in tok.tokenize(t.lower())
        if token not in stopword_lookup
    ).strip()
    for t in tweets
]

In [None]:
# Create a new data frame with the clean tweets and their scores.
# clean_tweets[i] was produced from the i-th tweet, so the scores are attached
# by position (as a plain array). Assigning the Series itself would align on
# index labels and silently produce NaN for any label that does not match.

clean_tweets_df = pd.DataFrame({'text': clean_tweets, 'score': score.to_numpy()})
clean_tweets_df


In [None]:
# Convert the score (label) of each tweet from float to a 64-bit integer.
clean_tweets_df['score'] = clean_tweets_df['score'].astype(np.int64)


In [None]:
# Class distribution of the tweets (0 - Benign, 1 - Hate inducing, 2 - Abusive):
# number of non-null texts per score value.
tweets_per_score = clean_tweets_df.groupby("score")["text"].count()
classCountDf = tweets_per_score.reset_index()
classCountDf

In [None]:
# Count the rows whose text is the empty string.
empty_text_flags = clean_tweets_df['text'] == ""
cnt = int(empty_text_flags.sum())
print(cnt)
        

In [None]:
# Drop the rows whose text is the empty string.
# The index labels of those rows are collected and passed straight to drop().
# Treating labels as positions (clean_tweets_df.index[labels]) is only correct
# while the index is exactly 0..n-1.
empty_text_mask = (clean_tweets_df['text'] == "").to_numpy()
empty_rows_index = clean_tweets_df.index[empty_text_mask].tolist()

clean_tweets_df.drop(index=empty_rows_index, inplace=True)

In [None]:
# Renumber the rows 0..n-1; the previous index labels are discarded.
clean_tweets_df = clean_tweets_df.reset_index(drop=True)

In [None]:
# (rows, columns) of the cleaned data frame at this point in the notebook.
clean_tweets_df.shape

In [None]:
# Check the class distribution of the data: number of non-null texts per
# score value.
tweets_per_score = clean_tweets_df.groupby("score")["text"].count()
classCountDf = tweets_per_score.reset_index()
classCountDf
        

In [None]:
# Write the cleaned tweets to CSV; index=False leaves the row index out of the file.
clean_tweets_df.to_csv('tweets_dataset.csv', index = False)

In [None]:
# Display the cleaned data frame.
clean_tweets_df