In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# Load the merged, pre-processed comment export (one row per comment).
merged_csv_path = '../hsle/data/exportcomments-outputs/20200323_20200325/processed/merged.csv'
df = pd.read_csv(merged_csv_path)
print(df.shape)
df.head(2)

(58725, 15)


Unnamed: 0,cId,nId,Name (click to view profile),Profile ID,Date,Likes,Comment,(view source),postId,nChars,MsgUni,atleast1MM,MsgUniSeg,LexFound,PostURL
0,1,0,Nunu,ID: 100004238449395,2020-03-24 13:01:38,1,အမေစုသက်ရှည်ကျန်းမာပါစေ🙏,view comment,25032020_0072,24,အမေစုသက်ရှည်ကျန်းမာပါစေ🙏,True,အမေ စု သက် ရှည် ကျန်းမာ ပါစေ 🙏,,https://www.facebook.com/7daynews/posts/315261...
1,3,0,Nyi Nyi,ID: 100046335742147,2020-03-24 13:02:29,2,Educating people is essential.. Teach people t...,view comment,25032020_0072,289,Educating people is essential.. Teach people t...,False,Educatingpeopleisessential..Teachpeopletodofre...,,https://www.facebook.com/7daynews/posts/315261...


# Update LexFound

In [3]:
# Keep only comments where at least one lexicon term was matched, and only the
# two columns the rest of the analysis needs.
cols = ['Date', 'LexFound']

# `.copy()` makes `df` an independent frame: later cells assign new columns to
# it, which would otherwise be writing into a slice of the loaded table
# (SettingWithCopyWarning / ambiguous view-vs-copy semantics).
df = df.loc[df.LexFound.notna(), cols].copy()
print(df.shape)
df.head(2)

(1797, 2)


Unnamed: 0,Date,LexFound
4,2020-03-24 13:14:18,စစ်ခွေး
8,2020-03-24 13:50:46,စစ်ခွေး~ဖင်ယား


In [4]:
# Parse the timestamp strings in one vectorised call instead of invoking
# `pd.to_datetime` once per row.
df['Date'] = pd.to_datetime(df['Date'])
df.head(2)

Unnamed: 0,Date,LexFound
4,2020-03-24 13:14:18,စစ်ခွေး
8,2020-03-24 13:50:46,စစ်ခွေး~ဖင်ယား


## Split the `LexFound` column for individual lexicons

In [5]:
# Timestamps, and for each comment the list of matched lexicon terms
# (multiple matches are joined with '~' inside `LexFound`).
datetime_sr = df['Date']
lex_sr = df['LexFound'].map(lambda found: found.split('~'))
lex_sr

4                [စစ်ခွေး]
8        [စစ်ခွေး, ဖင်ယား]
11               [စစ်ခွေး]
31               [စစ်ခွေး]
35               [စစ်ခွေး]
               ...        
58276             [သူပုန်]
58278             [သူပုန်]
58280          [အကြမ်းဖက်]
58286             [သူပုန်]
58289             [သူပုန်]
Name: LexFound, Length: 1797, dtype: object

In [6]:
# Flatten to one row per (comment timestamp, matched lexicon term): a comment
# that matched several terms contributes one row per term.
dtflat, lexflat = [], []
hours = []
for dt, lex in zip(datetime_sr, lex_sr):
    for l in lex:
        dtflat.append(dt)
        hours.append(dt.hour)
        lexflat.append(l)

lex_time = pd.DataFrame({
    'Hate Speech Phrase': lexflat,
    'DateTime': dtflat,
    'Hour': hours
})
# Calendar day of each timestamp (time-of-day set to midnight).
# `pd.datetime` was deprecated in pandas 1.0 and removed in 2.0, so use the
# vectorised `.dt.normalize()`; `pd.to_datetime` keeps this working even when
# the frame is empty and the column would otherwise be object-typed.
lex_time['Date'] = pd.to_datetime(lex_time['DateTime']).dt.normalize()

In [7]:
# Preview the flattened table: one row per matched term, with its timestamp, hour and day.
lex_time.head(2)

Unnamed: 0,Hate Speech Phrase,DateTime,Hour,Date
0,စစ်ခွေး,2020-03-24 13:14:18,13,2020-03-24
1,စစ်ခွေး,2020-03-24 13:50:46,13,2020-03-24


In [8]:
# Persist the per-match table; the positional index carries no information, so it is not written.
lex_time.to_csv('res/lex-time.csv',index=False)

In [9]:
# Matches per phrase per day: rows are phrases, columns are days, cells are
# the number of matches (phrase/day pairs with no match become 0).
daily_counts = lex_time.pivot_table(
    values='DateTime',
    index=['Hate Speech Phrase'],
    columns=['Date'],
    aggfunc=len,
)
pv = daily_counts.fillna(0).astype(int)

# Row total across all days, then rank phrases from most to least frequent.
pv['Total'] = pv.sum(axis=1)
pv = pv.sort_values('Total', ascending=False)

display(pv.head())
pv.to_csv('res/lex-daily.csv')

Date,2020-03-23 00:00:00,2020-03-24 00:00:00,2020-03-25 00:00:00,Total
Hate Speech Phrase,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
အကြမ်းဖက်,140,464,76,680
သူပုန်,86,174,50,310
စစ်ခွေး,75,110,32,217
သူခိုး,36,109,11,156
ဗမာတွေ,41,97,7,145


In [10]:
# Long-format counterpart of the daily pivot: number of matches for every
# (phrase, day) pair that actually occurs, written straight to disk.
phrase_day_groups = lex_time.groupby(['Hate Speech Phrase', 'Date'])
phrase_day_counts = phrase_day_groups['Date'].count()
phrase_day_counts.to_csv('res/lex-group.csv')

## Topics

In [16]:
# import sys
# sys.path.append('/home/bupi/Documents/pdy/packages/MyanmarNLPTools')
from MMCleaner import MMCleaner
cln = MMCleaner()

# Lexicon table; the code below reads its `label`,
# `label_alternative_spelling` and `type` columns.
dx = pd.read_csv('../hsle/data/lexicon_d.csv')

# --- Primary spellings -> topic type -------------------------------------
# `.copy()` so the column assignments below write to an independent frame
# instead of a slice of `dx` (avoids SettingWithCopyWarning).
tmp = dx.dropna(subset=['label','type']).copy()

tmp['label'] = [cln.web_clean(l) for l in tmp['label']]
tmp['type'] = [cln.web_clean(l) for l in tmp['type']]

type_dict = dict(zip(tmp['label'], tmp['type']))

# --- Alternative spellings -> topic type ---------------------------------
tmp = dx.dropna(subset=['label_alternative_spelling']).copy()
tmp['label_alternative_spelling'] = [
    cln.web_clean(l) for l in tmp['label_alternative_spelling']]
# Clean `type` the same way as for the primary spellings so both spellings of
# a term map to the identical topic string. Rows are only filtered on the
# alternative spelling here, so a missing `type` is passed through untouched.
tmp['type'] = [
    cln.web_clean(t) if isinstance(t, str) else t for t in tmp['type']]
type_dict.update(zip(tmp['label_alternative_spelling'], tmp['type']))

# Sentinel so that a stringified missing value ('nan') can still be looked up.
type_dict.update({'nan':'nan'})

In [17]:
# For every comment, translate each matched lexicon term into its topic via
# `type_dict`; the resulting list keeps the order of the '~'-separated terms.
topics_per_comment = []
for found in df.LexFound:
    terms = str(found).split('~')
    topics_per_comment.append([type_dict[term] for term in terms])
df['topics_found'] = topics_per_comment

In [18]:
# Check the new `topics_found` column: one topic per '~'-separated term in `LexFound`.
df.head()

Unnamed: 0,Date,LexFound,topics_found
4,2020-03-24 13:14:18,စစ်ခွေး,[Ethnic Conflict]
8,2020-03-24 13:50:46,စစ်ခွေး~ဖင်ယား,"[Ethnic Conflict, Activism]"
11,2020-03-24 13:55:00,စစ်ခွေး,[Ethnic Conflict]
31,2020-03-24 13:15:59,စစ်ခွေး,[Ethnic Conflict]
35,2020-03-24 13:42:05,စစ်ခွေး,[Ethnic Conflict]


In [19]:
# Flatten to one row per (comment, matched term), this time carrying the
# topic of each term as well.
dtflat, lexflat = [], []
topicflat = []
hours = []
for dt, lex, topic in zip(df.Date, df.LexFound, df.topics_found):
    # `topics_found` was derived by splitting `LexFound` on '~', so terms and
    # topics pair up one-to-one.
    for l, t in zip(lex.split('~'), topic):
        dtflat.append(dt)
        hours.append(dt.hour)
        lexflat.append(l)
        topicflat.append(t)

topic_time = pd.DataFrame({
    'Hate Speech Phrase': lexflat,
    'Topic': topicflat,
    'DateTime': dtflat,
    'Hour': hours
})
# Calendar day of each timestamp (time-of-day set to midnight).
# `pd.datetime` was deprecated in pandas 1.0 and removed in 2.0, so use the
# vectorised `.dt.normalize()`; `pd.to_datetime` keeps this working even when
# the frame is empty and the column would otherwise be object-typed.
topic_time['Date'] = pd.to_datetime(topic_time['DateTime']).dt.normalize()

In [20]:
# Persist the per-match table with topics (positional index omitted).
topic_time.to_csv('res/topic-time.csv', index=False)
# Sanity check: both flattened tables should have the same number of rows;
# `topic_time` has one extra column (`Topic`).
print(topic_time.shape, lex_time.shape)
topic_time.head(2)

(1983, 5) (1983, 4)


Unnamed: 0,Hate Speech Phrase,Topic,DateTime,Hour,Date
0,စစ်ခွေး,Ethnic Conflict,2020-03-24 13:14:18,13,2020-03-24
1,စစ်ခွေး,Ethnic Conflict,2020-03-24 13:50:46,13,2020-03-24


## Percentage of Hate Speech

In [21]:
# Reload every comment (matched or not) with just the timestamp and the
# matched-lexicon column, to compute the share of comments with a match.
alldf = pd.read_csv(
    '../hsle/data/exportcomments-outputs/20200323_20200325/processed/merged.csv', usecols=['Date','LexFound'])
# Rename by name, not by position: `read_csv` returns the selected columns in
# file order regardless of the order given in `usecols`, so a positional
# `alldf.columns = [...]` silently swaps the two columns if the file layout
# ever changes.
alldf = alldf.rename(columns={'Date': 'datetime', 'LexFound': 'lex_found'})[['datetime', 'lex_found']]
alldf['datetime'] = pd.to_datetime(alldf['datetime'])
print(alldf.shape)
alldf.head(2)

(58725, 2)


Unnamed: 0,datetime,lex_found
0,2020-03-24 13:01:38,
1,2020-03-24 13:02:29,


In [22]:
# Calendar day of each comment (time-of-day set to midnight).
# `pd.datetime` was deprecated in pandas 1.0 and removed in 2.0; the
# vectorised `.dt.normalize()` produces the same midnight timestamps.
alldf['date'] = pd.to_datetime(alldf['datetime']).dt.normalize()
alldf.head(2)

Unnamed: 0,datetime,lex_found,date
0,2020-03-24 13:01:38,,2020-03-24
1,2020-03-24 13:02:29,,2020-03-24


In [23]:
# Partition the comments by whether any lexicon term was matched.
has_lex = alldf.lex_found.notna()
no_lex = alldf.loc[~has_lex, :]
w_lex = alldf.loc[has_lex, :]

In [24]:
# Number of comments per day without (`nlex`) and with (`wlex`) a lexicon match.
nlex = no_lex.groupby('date')['date'].count()
wlex = w_lex.groupby('date')['date'].count()

# Align both series on the union of days, filling absent days with 0.
# Selecting `nlex[wlex.index]` instead would drop days that have no match at
# all and raise KeyError for a day on which every comment matched; it also
# makes any subsequent index-difference check trivially true.
all_dates = nlex.index.union(wlex.index)
nlex = nlex.reindex(all_dates, fill_value=0)
wlex = wlex.reindex(all_dates, fill_value=0)
assert nlex.index.equals(wlex.index)

In [25]:
# Per-day summary: comment counts without / with a lexicon match, plus the
# matched share. Note that 'HS%' holds a fraction (matched / total), it is
# not multiplied by 100.
total_per_day = nlex + wlex
hs_share = wlex / total_per_day
ratiodf = pd.DataFrame({'No HS': nlex, 'With HS': wlex, 'HS%': hs_share})

In [26]:
# Persist the per-day summary; the index (the day) is written as the first column.
ratiodf.to_csv('res/ratio.csv')

## Page-Topics

In [28]:
# Reminder of the matched-comment frame this section starts from (`Date`, `LexFound`, `topics_found`).
df.head(2)

Unnamed: 0,Date,LexFound,topics_found
4,2020-03-24 13:14:18,စစ်ခွေး,[Ethnic Conflict]
8,2020-03-24 13:50:46,စစ်ခွေး~ဖင်ယား,"[Ethnic Conflict, Activism]"
