# Import Packages

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime,timedelta
import pytz 
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import ast
import string
from wordcloud import WordCloud
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from googletrans import Translator

# Import Data

In [None]:
df = pd.read_csv('data/data_extraction/pildun3.csv')

# Parse `created_at` in one vectorised call. utc=True honours the offset that
# %z reads from the string and normalises to UTC (the earlier
# .replace(tzinfo=pytz.UTC) discarded that offset), then convert to Jakarta time.
jakarta = pytz.timezone('Asia/Jakarta')
created_local = pd.to_datetime(
    df['created_at'], format='%a %b %d %H:%M:%S %z %Y', utc=True
).dt.tz_convert(jakarta)
# Keep only the local calendar date and wall-clock time as separate columns.
df['date_created'] = created_local.dt.date
df['time_created'] = created_local.dt.time

In [None]:
df.head(10)

In [None]:
len(df)

Remove duplicate rows, if any

In [None]:
df = df.drop_duplicates()
df = df.reset_index(drop=True)

In [None]:
len(df)

# Extra Cleaning

In [None]:
len(df[df['clean_text'].isnull()==True])

In [None]:
pd.set_option('display.max_colwidth', None)
df[df['clean_text'].isnull()==True]['original_text']

Menghapus baris yang teksnya menjadi kosong setelah dibersihkan, karena teks tersebut tidak memuat sentimen yang berarti

In [None]:
df = df.dropna(subset=['clean_text'])
df = df.reset_index(drop=True)

In [None]:
df.isnull().sum()

In [None]:
df[df['lang']!='in']['lang'].value_counts()

Karena ada beberapa tweet yang tidak berbahasa Indonesia (mungkin orang Indonesia menulis tweet dalam bahasa lain), teks tersebut diterjemahkan ke bahasa Indonesia sebelum diproses lebih lanjut

In [None]:
def trans(x,src):
    """Translate `x` from language `src` into Indonesian ('id') via googletrans.

    This is a best-effort helper: if the translation call fails for any
    reason, the input is returned unchanged so the pipeline keeps running.

    Parameters
    ----------
    x : text to translate.
    src : source language code passed to `Translator.translate`.

    Returns
    -------
    The translated text, or `x` itself when nothing could be translated.
    """
    # Nothing to translate for missing or blank text.
    if not isinstance(x, str) or not x.strip():
        return x
    try:
        return Translator().translate(x, src=src, dest='id').text
    except Exception:
        # Deliberate fallback; unlike a bare `except:` this still lets
        # KeyboardInterrupt / SystemExit stop a long-running apply().
        return x

In [None]:
# Rows whose `lang` is 'in' are kept as they are (presumably Twitter's code for
# Indonesian -- TODO confirm); every other row is passed through trans() with
# its own `lang` value as the source language.
df['clean_text'] = df.apply(lambda x: trans(x['clean_text'],x['lang']) if(x['lang']!='in') else x['clean_text'],axis=1)

In [None]:
clean_text = df['clean_text'].copy()

In [None]:
pd.set_option('display.max_colwidth', 100)
clean_text.tail(15)

In [None]:
def repair_exaggeration(x):
    """Normalise exaggerated spellings in `x`, token by token.

    * Any spelling of the keyword made of 'ps' followed by one or more 'b'
      ('psb', 'psbb', 'psbbbbbb', ...) becomes 'psbb'.
    * In every other token, a character repeated four or more times in a row
      is collapsed to a single occurrence.

    Returns the tokens joined back together, each followed by one space
    (the trailing space is kept for compatibility with existing callers).
    """
    repaired = []
    for token in word_tokenize(x):
        if re.fullmatch(r'psb+', token):
            # Previously only the exact tokens 'psb'/'psbb' were recognised, so
            # 'psbbbbbb' fell through to the generic rule and ended up as 'psb'.
            token = 'psbb'
        else:
            token = re.sub(r'(\w)\1\1\1+', r'\1', token)
        repaired.append(token)
    return ''.join(token + ' ' for token in repaired)

def del_word(x,key_list):
    """Remove every token of `x` that appears in `key_list`.

    Parameters
    ----------
    x : text to filter; it is split with `word_tokenize`.
    key_list : iterable of words to drop (exact, case-sensitive match).

    Returns
    -------
    The remaining tokens, each followed by one space.
    """
    # A set gives constant-time membership tests; the unused length
    # computation of the old version is gone.
    blocked = set(key_list)
    kept = [word for word in word_tokenize(x) if word not in blocked]
    return ''.join(word + ' ' for word in kept)

# Stop-word / slang resources, read from disk on first use and then reused.
_CLEANING_RESOURCES = {}


def _load_cleaning_resources():
    """Return (stop_words, slang_words), reading the two source files only once."""
    if not _CLEANING_RESOURCES:
        with open("cleaning_source/combined_stop_words.txt", "r") as stop_file:
            _CLEANING_RESOURCES['stop_words'] = set(stop_file.read().split("\n"))
        with open("cleaning_source/update_combined_slang_words.txt", "r") as slang_file:
            # The slang file holds a Python dict literal: slang word -> replacement.
            _CLEANING_RESOURCES['slang_words'] = ast.literal_eval(slang_file.read())
    return _CLEANING_RESOURCES['stop_words'], _CLEANING_RESOURCES['slang_words']


def clean_tweets(tweet):
    """Lower-case a tweet, strip non-letters, expand slang and drop stop words.

    Parameters
    ----------
    tweet : raw (or partially cleaned) tweet text.

    Returns
    -------
    The surviving tokens joined by single spaces.
    """
    stop_words, slang_words = _load_cleaning_resources()

    tweet = tweet.lower()
    # after tweepy preprocessing the colon left remain after removing mentions
    # or RT sign in the beginning of the tweet
    tweet = re.sub(r':', '', tweet)
    tweet = re.sub(r'‚Ä¶', '', tweet)
    # replace consecutive non-ASCII characters with a space
    tweet = re.sub(r'[^\x00-\x7F]+',' ', tweet)

    # remove punctuation manually (everything that is not a letter becomes a space)
    tweet = re.sub('[^a-zA-Z]', ' ', tweet)

    # remove tags
    tweet=re.sub("&lt;/?.*?&gt;","&lt;&gt;",tweet)

    # remove digits and special chars (this also collapses runs of spaces)
    tweet=re.sub("(\\d|\\W)+"," ",tweet)

    # remove other symbol from tweet
    tweet = re.sub(r'â', '', tweet)
    tweet = re.sub(r'€', '', tweet)
    tweet = re.sub(r'¦', '', tweet)

    # Expand slang position by position. The old list.index()-based replacement
    # could overwrite an earlier token when a replacement equalled a later slang word.
    word_tokens = [slang_words.get(w, w) for w in word_tokenize(tweet)]

    # Keep tokens that are neither stop words nor punctuation.
    filtered_tweet = [
        w.lower()
        for w in word_tokens
        if w not in stop_words and w not in string.punctuation
    ]
    return ' '.join(filtered_tweet)

def count_words(x):
    """Return the number of tokens `word_tokenize` finds in `x`."""
    return len(word_tokenize(x))

Clean up words typed with exaggerated repeated letters, such as 'psbbbbbbbbb'

In [None]:
clean_text_exag = clean_text.apply(lambda x: repair_exaggeration(x))

In [None]:
clean_text_exag.tail(15)

Re-clean the text after removing the exaggerated spellings

In [None]:
re_clean = clean_text_exag.apply(lambda x: clean_tweets(x))

Kami menggunakan kata kunci untuk meminta data, sekarang kami perlu menghapusnya karena semuanya akan ditemukan di setiap kalimat dalam bingkai data ini

In [None]:
keyword = ['psbb','psb','corona','covid19','indonesia','pemerintah','wfh','covid']
clean_text_extra = re_clean.apply(lambda x: del_word(x,keyword))

In [None]:
clean_text_extra.tail(15)

In [None]:
df['clean_text'] = clean_text_extra

In [None]:
df['word_length'] = df['clean_text'].apply(lambda x:count_words(x))

In [None]:
df['word_length'].value_counts().sort_index()

In [None]:
df = df.drop(df[df['word_length']==0].index,axis=0)
df = df.reset_index(drop=True)

# Word Processing

## Create word dictionary

In [None]:
# Corpus-wide frequency of every token found in the cleaned tweets.
word_dict = {}
for sentence in df['clean_text']:
    for token in word_tokenize(sentence):
        word_dict[token] = word_dict.get(token, 0) + 1

In [None]:
len(word_dict)

In [None]:
len({k:v for (k,v) in word_dict.items() if v < 4})

## Import Lexicon data

Impor leksikon, lalu hapus kata-kata negasi dari leksikon tersebut. Leksikon ini merupakan gabungan dari beberapa sumber di bawah ini, termasuk kata-kata umpatan yang diberi skor paling negatif

sources : <br>
https://github.com/louisowen6/NLP_bahasa_resources <br>
https://github.com/abhimantramb/elang/blob/master/word2vec/utils/swear-words.txt <br>
https://github.com/fajri91/InSet <br>
https://github.com/agusmakmun/SentiStrengthID/blob/master/id_dict/sentimentword.txt 

In [None]:
# Negation words are handled separately during scoring, so they are removed
# from the lexicon itself.
negasi = ['bukan','tidak','ga','gk']
lexicon = pd.read_csv('lexicon/modified_full_lexicon.csv')
is_negation = lexicon['word'].isin(negasi)
lexicon = lexicon[~is_negation].reset_index(drop=True)

In [None]:
len(lexicon)

In [None]:
lexicon.head(10)

In [None]:
lexicon_word = lexicon['word'].to_list()
lexicon_num_words = lexicon['number_of_words']

In [None]:
len(lexicon_word)

Memeriksa apakah ada kata -kata dalam kamus yang tidak termasuk dalam leksikon

In [None]:
# Collect vocabulary words that are missing from the lexicon both as written
# and after stemming.
ns_words = []
factory = StemmerFactory()
stemmer = factory.create_stemmer()
# Set membership keeps this scan linear in the vocabulary size; testing
# against the plain list rescanned the whole lexicon for every word.
lexicon_vocab = set(lexicon_word)
for word in word_dict.keys():
    if word in lexicon_vocab:
        continue
    kata_dasar = stemmer.stem(word)
    if kata_dasar not in lexicon_vocab:
        ns_words.append(word)
len(ns_words)

Mari kita lihat kata-kata seperti apa itu, dimulai dari kata-kata yang sering muncul, karena kemungkinan besar kata-kata tersebut bukan salah ketik

In [None]:
len({k:v for (k,v) in word_dict.items() if ((k in ns_words)&(v>3)) })

In [None]:
ns_words_list = {k:v for (k,v) in word_dict.items() if ((k in ns_words)&(v>3))}

Ternyata kata-kata yang tidak termasuk dalam leksikon adalah kata-kata yang tidak memiliki sentimen signifikan

In [None]:
sort_orders = sorted(ns_words_list.items(), key=lambda x: x[1], reverse=True)
sort_orders=sort_orders[0:20]
for i in sort_orders:
    print(i[0], i[1])

In [None]:
word_to_plot = df['clean_text'].copy()

In [None]:
word_to_plot_1 = word_to_plot.apply(lambda x: del_word(x,negasi))

Membuat word cloud untuk melihat kata-kata apa saja yang sering muncul di tweet yang terkait dengan pandemi

In [None]:
# Build the cloud from the tweet text itself. str(Series) only yields the
# truncated console repr (row numbers, "...", "Name: ..., dtype: object"),
# which put those artefacts into the cloud and ignored most tweets.
cloud_text = ' '.join(word_to_plot_1)
wordcloud = WordCloud(width = 800, height = 800, background_color = 'black', max_words = 1000
                      , min_font_size = 20).generate(cloud_text)
#plot the word cloud
fig = plt.figure(figsize = (8,8), facecolor = None)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

## sentiment

In [None]:
lexicon['number_of_words'].value_counts()

In [None]:
'pekerti' in word_dict

In [None]:
'budi baik' in lexicon_word

Menghitung sentimen kata -kata dengan memasukkannya ke leksikon sambil juga membuat kantong kata -kata matriks

In [None]:
sencol = []          # bag-of-words vocabulary, in order of first appearance
sentiment_list = []  # one sentiment score per tweet
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Lexicon entry -> weight. setdefault keeps the first row of a duplicated
# entry, which is the row lexicon_word.index(...) used to select.
lexicon_weight = {}
for lex_word, lex_weight in zip(lexicon_word, lexicon['weight']):
    lexicon_weight.setdefault(lex_word, lex_weight)

# function to write the word's sentiment if it is found
def found_word(ind,words,word,sen,sencol,sentiment,add):
    """Record one lexicon match and update the running sentiment.

    ind       : position in `words` of the FIRST token of the match
    words     : tokens of the current tweet
    word      : matched lexicon entry (word, stem or multi-word phrase)
    sen       : count row of the current tweet (same order as `sencol`)
    sencol    : bag-of-words vocabulary; extended in place for new entries
    sentiment : running sentiment score of the current tweet
    add       : number of vocabulary entries added while scoring this tweet

    Returns the updated (sen, sencol, sentiment, add).
    """
    # if it is already included in the bag of words matrix, then just increase the value
    if word in sencol:
        sen[sencol.index(word)] += 1
    else:
        # if not, then add a new word
        sencol.append(word)
        sen.append(1)
        add += 1
    weight = lexicon_weight[word]
    # A negation word directly before the match flips its sentiment. `ind > 0`
    # stops words[ind-1] from wrapping around to the last token of the tweet.
    if ind > 0 and words[ind-1] in negasi:
        sentiment += -weight
    else:
        sentiment += weight

    return sen,sencol,sentiment,add

# check every word: if it appears in the lexicon, count it and add its sentiment
rows = []  # per-tweet count rows; early rows are shorter than the final vocabulary
for text in df['clean_text']:
    words = word_tokenize(text)
    sentiment = 0
    add = 0
    sen = [0] * len(sencol)

    # enumerate gives the true position of every token; words.index(word)
    # always returned the first occurrence of a repeated word.
    for ind, word in enumerate(words):
        # check whether the word itself is in the lexicon
        if word in lexicon_weight:
            sen,sencol,sentiment,add = found_word(ind,words,word,sen,sencol,sentiment,add)
            continue
        # if not, then check the root word
        kata_dasar = stemmer.stem(word)
        if kata_dasar in lexicon_weight:
            sen,sencol,sentiment,add = found_word(ind,words,kata_dasar,sen,sencol,sentiment,add)
        # if still not found, try phrases built with the preceding word(s);
        # the phrase start index is passed so the negation check looks at the
        # token before the phrase, not at the phrase's own first word
        elif ind >= 1:
            back_1 = words[ind-1]+' '+word
            if back_1 in lexicon_weight:
                sen,sencol,sentiment,add = found_word(ind-1,words,back_1,sen,sencol,sentiment,add)
            elif ind >= 2:
                back_2 = words[ind-2]+' '+back_1
                if back_2 in lexicon_weight:
                    sen,sencol,sentiment,add = found_word(ind-2,words,back_2,sen,sencol,sentiment,add)

    rows.append(sen)
    sentiment_list.append(sentiment)

# Build the bag-of-words matrix once (one row per tweet, one column per entry
# of sencol) instead of re-stacking a growing array for every tweet; rows seen
# before a word entered the vocabulary keep a zero in that column.
senrow = np.zeros((len(rows), len(sencol)), dtype=int)
for row_no, counts in enumerate(rows):
    senrow[row_no, :len(counts)] = counts

In [None]:
len(sentiment_list)

In [None]:
print(senrow.shape[0])

Membangun bingkai data yang berisi kantong kata dan sentimen yang telah dihitung sebelumnya

In [None]:
# Label the extra sentiment column only once: an unconditional append made
# sencol one entry longer on every re-run of this cell, so the DataFrame
# constructor then failed on the column count.
if len(sencol) == senrow.shape[1]:
    sencol.append('sentiment')
sentiment_array = np.array(sentiment_list).reshape(senrow.shape[0],1)
sentiment_data = np.hstack((senrow,sentiment_array))
df_sen = pd.DataFrame(sentiment_data,columns = sencol)

In [None]:
df_sen.head(10)

Mari kita lihat apakah sentimennya benar dengan melihat teks aslinya

In [None]:
cek_df = pd.DataFrame([])
cek_df['text'] = df['original_text'].copy()
cek_df['sentiment']  = df_sen['sentiment'].copy()

In [None]:
cek_df.head(10)

# EDA

In [None]:
sns.set(style="white", palette="muted", color_codes=True)
sns.kdeplot(df_sen['sentiment'],color='m',shade=True)
plt.title('Sentiment Distribution')
plt.xlabel('sentiment')

In [None]:
sns.set(style="whitegrid") 
sns.boxplot(x=df_sen['sentiment'])

Sepertinya sentimen terdistribusi secara merata antara positif dan negatif, tentu saja, mari kita lihat rata -rata

In [None]:
df_sen.describe()

Sepertinya hampir didistribusikan secara merata, tetapi yang positif memiliki kejadian yang sedikit lebih besar di sini

Sekarang mari kita lihat korelasi antara kata -kata yang termasuk dalam sentimen

In [None]:
def get_redundant_pairs(df):
    '''Return the (column, column) label pairs on and below the diagonal of a correlation matrix.'''
    names = df.columns
    return {
        (names[row], names[col])
        for row in range(df.shape[1])
        for col in range(row + 1)
    }

def get_top_abs_correlations(df, n=10):
    '''Return absolute pairwise correlations of `df`'s columns, largest first.

    Each unordered pair appears once and self-pairs are removed.
    NOTE: `n` is accepted but not applied -- the full sorted Series is
    returned and callers slice it themselves.
    '''
    abs_pairs = df.corr().abs().unstack()
    unique_pairs = abs_pairs.drop(labels=get_redundant_pairs(df))
    return unique_pairs.sort_values(ascending=False)

#print("Top Absolute Correlations")
#print(get_top_abs_correlations(df_sen, 10))
au = get_top_abs_correlations(df_sen, 15)

In [None]:
print('Perfect Correlation')
au[au==1]

Ternyata ada 156 kata yang selalu terjadi bersama dalam setiap teks, meskipun kita tidak benar -benar melihat berapa banyak kalimat yang termasuk di sana tetapi korelasinya cukup tinggi

In [None]:
# Ten strongest correlations that are not perfect (== 1).
top10 = au[au<float(1)][0:10]
label = top10.index
# Unique words taking part in those pairs, in order of first appearance.
label_list =[]
for pair in label:
    for word in pair:
        if word not in label_list:
            label_list.append(word)

df_sen_corr = df_sen[label_list]
# Round in one vectorised call. The previous corr[i][j] = ... loop relied on
# chained assignment, which pandas does not guarantee to write back into corr.
corr = df_sen_corr.corr().round(3)
            

Sekarang mari kita lihat kata-kata lain yang tidak selalu, tetapi sering, muncul bersama, karena korelasinya cukup tinggi meskipun tidak sama dengan 1

In [None]:
plt.figure(figsize=(15,15))

h = sns.heatmap(corr, annot=True,vmin=-1, vmax=1, center= 0)

plt.show()

In [None]:
top15 = au[au<float(1)][0:15]

Sepertinya kebanyakan dari mereka secara alami berkumpul bersama tetapi ada beberapa yang tidak benar -benar terlintas dalam pikiran

Sekarang mari kita lihat kata -kata yang paling terjadi di antara serangkaian kata yang termasuk dalam leksikon

In [None]:
top15_word = df_sen.drop(['sentiment'],axis=1).sum().sort_values(ascending=False)[0:15]

In [None]:
pal =sns.light_palette("navy", reverse=True,n_colors=15)
g = sns.barplot(y = top15_word.index , x = top15_word,palette=pal)
g.grid(False)
plt.xlabel('Occurences')
plt.ylabel('Words')
plt.title("Top 15 Most Often Occured Words",fontweight='bold') 
# Annotate every bar with its count. Iterating over the values works for any
# number of bars and avoids integer lookups on a Series indexed by words.
for row_no, count in enumerate(top15_word.values):
    g.text(count, row_no+0.22, count, color='black')
plt.show()

## Beyond Words

Sekarang kami ingin mengeksplorasi lebih dari kata itu sendiri, maka kami meneruskan sentimen ke dalam dataset asli dan kemudian mengeksplorasi beberapa data di sana

In [None]:
df['sentiment'] = df_sen['sentiment']

In [None]:
df.head(5)

In [None]:
df.isnull().sum()

Mari kita lihat faktor lain yang berkorelasi dengan sentimen itu sendiri

In [None]:
plt.figure()
plt.title('correlation between numerical data',fontweight='bold')
# Correlate numeric/boolean columns only: recent pandas releases raise when
# DataFrame.corr() meets text columns instead of silently skipping them.
df_corr = df.select_dtypes(include=['number', 'bool']).corr()
# Mask the upper triangle (diagonal included) so each pair is drawn once.
matrix = np.triu(df_corr)
cmap =  sns.cubehelix_palette(light=0.5, as_cmap=True)
h = sns.heatmap(df_corr, annot=True,vmin=-1, vmax=1, center= 0,mask=matrix,cmap = cmap)

plt.show()

Dari fitur numerik, tampaknya korelasinya sangat rendah, sekarang mari kita lihat yang lain

In [None]:
sns.set(style="white", color_codes=True)
plt.figure(figsize=(10,8))
plt.title('Sentiment in every language used',fontweight='bold')
l = sns.boxplot(x='lang',y='sentiment',data=df,palette= sns.color_palette("RdPu", 10))

Sepertinya pada rentang waktu ini tweet berbahasa 'ko' dan 'und' selalu bersentimen positif, sedangkan tweet berbahasa 'pt' dan 'es' sebaliknya

In [None]:
cek_df = df.dropna(subset=['possibly_sensitive'])
cek_df = cek_df.reset_index(drop=True)
plt.figure(figsize=(8,8))
g = sns.boxplot(x='possibly_sensitive',y='sentiment',data=cek_df)
plt.show()

Label konten sensitif tidak menunjukkan perbedaan yang berarti, karena sentimen terdistribusi hampir merata di kedua kelompok

In [None]:
# Mean of every numeric column per place, most positive sentiment first.
# numeric_only=True keeps text columns out of the aggregation, which recent
# pandas releases would otherwise reject with a TypeError.
df_place = (
    df.groupby(['place'])
      .mean(numeric_only=True)
      .sort_values(by='sentiment',ascending=False)
)
df_place = df_place.reset_index()

In [None]:
df_place_dict = df.groupby(['place']).count().sort_values(by='id',ascending=False)['id'].to_dict()

In [None]:
df_place['number_of_tweets'] =  df_place.apply(lambda x:df_place_dict[x['place']],axis=1)

In [None]:
top10_place_pos = df_place.sort_values(by='sentiment',ascending=False)[0:10].reset_index(drop=True)
top10_place_neg = df_place.sort_values(by='sentiment',ascending=True)[0:10].reset_index(drop=True)
top10_place     = df_place.sort_values(by='number_of_tweets',ascending=False)[0:10].reset_index(drop=True)

Sekarang mari kita lihat beberapa tempat di mana sentimen yang dibuat darinya cenderung sensitif dan juga untuk tempat -tempat yang membuat sebaliknya

In [None]:
fig, (ax1, ax2) = plt.subplots(2, 1,figsize=(10,10))
fig.suptitle('Most Positive and Most Negative Sentiment Place',fontweight='bold')
h = sns.barplot(y='place',x='sentiment',data=top10_place_pos,ax=ax1,palette=sns.color_palette("Blues_d",n_colors=10))
n = sns.barplot(y='place',x='sentiment',data=top10_place_neg,ax=ax2,palette=sns.color_palette('RdPu_d',n_colors=10))
ax1.set_title('Top 10 Positive')
ax2.set_title('Top 10 Negative')
plt.show()

Last but not least, let's take a look at the places where tweets most often come from

In [None]:
pal =sns.dark_palette("green", input="xkcd",n_colors=10)
g = sns.barplot(y = top10_place['place'] , x = top10_place['number_of_tweets'],palette=pal)
# Call grid(False) to switch the grid off; `g.grid=False` only replaced the
# Axes.grid method with the value False and changed nothing on the plot.
g.grid(False)
plt.xlabel('number of tweets')
plt.ylabel('place')
plt.title("Top 10 Number of Tweets place",fontweight='bold') 

# Write each place's mean sentiment next to its bar; zip() copes with fewer
# than ten places instead of failing on a missing row.
for row_no, (tweet_count, mean_sentiment) in enumerate(
        zip(top10_place['number_of_tweets'], top10_place['sentiment'])):
    g.text(tweet_count, row_no+0.22, round(mean_sentiment,3), color='black')

plt.show()