## Import modules

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
import warnings
%matplotlib inline

# NOTE(review): this silences every warning for the rest of the session,
# including pandas chained-assignment warnings raised by later cells
warnings.filterwarnings('ignore')

## Loading the dataset

In [23]:
# load the dataset from the CSV file sitting next to the notebook
DATASET_PATH = 'sinhala_offensive_language.csv'
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0.1,Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,...,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest
0,0,1483838381482131459,1483836162645110784,2022-01-19 21:56:53 Sri Lanka Standard Time,2022-01-19,21:56:53,530,1149343869907030017,sawhowy,Toby ප !!,...,,,,,,"[{'screen_name': 'luvisplace', 'name': '222', ...",,,,
1,1,1483837109202944001,1483837109202944000,2022-01-19 21:51:49 Sri Lanka Standard Time,2022-01-19,21:51:49,530,1267256084848349185,frutitaverde,ᴀɴ'ප,...,,,,,,[],,,,
2,2,1483834311841173506,1483834311841173504,2022-01-19 21:40:42 Sri Lanka Standard Time,2022-01-19,21:40:42,530,1267256084848349185,frutitaverde,ᴀɴ'ප,...,,,,,,[],,,,
3,3,1483834011134803969,1483833223482388480,2022-01-19 21:39:31 Sri Lanka Standard Time,2022-01-19,21:39:31,530,1267256084848349185,frutitaverde,ᴀɴ'ප,...,,,,,,"[{'screen_name': 'SMINKHOZ', 'name': 'gaby yol...",,,,
4,4,1483833442575872004,1483833223482388480,2022-01-19 21:37:15 Sri Lanka Standard Time,2022-01-19,21:37:15,530,1267256084848349185,frutitaverde,ᴀɴ'ප,...,,,,,,"[{'screen_name': 'SMINKHOZ', 'name': 'gaby yol...",,,,


In [24]:
# datatype info: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 37 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       20 non-null     int64  
 1   id               20 non-null     int64  
 2   conversation_id  20 non-null     int64  
 3   created_at       20 non-null     object 
 4   date             20 non-null     object 
 5   time             20 non-null     object 
 6   timezone         20 non-null     int64  
 7   user_id          20 non-null     int64  
 8   username         20 non-null     object 
 9   name             20 non-null     object 
 10  place            0 non-null      float64
 11  tweet            20 non-null     object 
 12  language         20 non-null     object 
 13  mentions         20 non-null     object 
 14  urls             20 non-null     object 
 15  photos           20 non-null     object 
 16  replies_count    20 non-null     int64  
 17  retweets_count   2

In [25]:
# keep only the `name` column for the cleaning steps that follow.
# .copy() makes this an independent frame, so adding the `clean_word` column
# later does not write into a selection of the loaded data
# (pandas SettingWithCopyWarning).
# NOTE(review): the dataset also has a `tweet` column - confirm `name` is the
# intended text field.
df = df[['name']].copy()
df.head()

Unnamed: 0,name
0,Toby ප !!
1,ᴀɴ'ප
2,ᴀɴ'ප
3,ᴀɴ'ප
4,ᴀɴ'ප


## Preprocessing the dataset

In [26]:
# removes pattern in the input text
def remove_pattern(input_txt, pattern):
    """Return `input_txt` with every match of the regex `pattern` deleted.

    Args:
        input_txt: the text to clean.
        pattern: regular expression (string or compiled pattern) describing
            the spans to remove.

    Returns:
        A new string in which all non-overlapping matches of `pattern` have
        been replaced by the empty string. Text without a match is returned
        unchanged.
    """
    # Substitute with the pattern itself instead of feeding each matched
    # string back into re.sub as a new regex: matched text can contain
    # metacharacters ("(", "+", "?", ...) that would be mis-parsed, and a
    # short match such as "@ab" would also strip the prefix of "@abc".
    return re.sub(pattern, "", input_txt)

In [27]:
# preview the frame before any cleaning is applied
df.head()

Unnamed: 0,name
0,Toby ප !!
1,ᴀɴ'ප
2,ᴀɴ'ප
3,ᴀɴ'ප
4,ᴀɴ'ප


In [28]:
# remove twitter handles (@user)
# raw string: "\w" inside a plain string literal is an invalid escape sequence
df['clean_word'] = np.vectorize(remove_pattern)(df['name'], r"@[\w]*")

In [20]:
# check the newly added clean_word column
df.head()

Unnamed: 0,name,clean_word
0,Toby ප !!,Toby ප !!
1,ᴀɴ'ප,ᴀɴ'ප
2,ᴀɴ'ප,ᴀɴ'ප
3,ᴀɴ'ප,ᴀɴ'ප
4,ᴀɴ'ප,ᴀɴ'ප


In [29]:
# remove special characters, numbers and punctuations
# NOTE(review): left disabled - the class [^a-zA-Z#] keeps only ASCII letters
# and '#', so it would also blank out every non-Latin (e.g. Sinhala) character;
# presumably that is why it is commented out
# df['clean_word'] = df['clean_word'].str.replace("[^a-zA-Z#]", " ")
# df.head()

In [30]:
# remove short words
def _keep_long_tokens(text):
    """Re-join the whitespace-separated tokens of `text` longer than 3 characters."""
    survivors = (token for token in text.split() if len(token) > 3)
    return " ".join(survivors)


df['clean_word'] = df['clean_word'].apply(_keep_long_tokens)
df.head()

Unnamed: 0,name,clean_word
0,Toby ප !!,Toby
1,ᴀɴ'ප,ᴀɴ'ප
2,ᴀɴ'ප,ᴀɴ'ප
3,ᴀɴ'ප,ᴀɴ'ප
4,ᴀɴ'ප,ᴀɴ'ප


In [31]:
# split every cleaned string on whitespace; each word becomes one token
tokenized_tweet = df['clean_word'].apply(str.split)
tokenized_tweet.head()

0    [Toby]
1    [ᴀɴ'ප]
2    [ᴀɴ'ප]
3    [ᴀɴ'ප]
4    [ᴀɴ'ප]
Name: clean_word, dtype: object

In [None]:
# stem the words
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()


def _stem_tokens(tokens):
    """Return the Porter stem of every token in `tokens`, preserving order."""
    return [stemmer.stem(token) for token in tokens]


tokenized_tweet = tokenized_tweet.apply(_stem_tokens)
tokenized_tweet.head()

In [33]:
# combine words into single sentence
# apply() joins each token list whatever the index looks like; a positional
# loop using tokenized_tweet[i] is a label lookup and only works when the
# index is exactly 0..n-1.
# NOTE: run this cell once per tokenization - re-running it on already-joined
# strings would insert a space between every character.
tokenized_tweet = tokenized_tweet.apply(" ".join)

df['clean_word'] = tokenized_tweet
df.head()

Unnamed: 0,name,clean_word
0,Toby ප !!,Toby
1,ᴀɴ'ප,ᴀɴ'ප
2,ᴀɴ'ප,ᴀɴ'ප
3,ᴀɴ'ප,ᴀɴ'ප
4,ᴀɴ'ප,ᴀɴ'ප


## Exploratory Data Analysis

In [None]:
!pip install wordcloud

In [None]:
# visualize the frequent words
from wordcloud import WordCloud

# one long string holding every cleaned entry, separated by single spaces
all_words = " ".join(df['clean_word'])

wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=42,
    max_font_size=100,
).generate(all_words)

# plot the graph
fig, ax = plt.subplots(figsize=(15, 8))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
plt.show()