In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pytz
import re
import nltk
import ast
import string
import itertools
import seaborn as sns
from datetime import datetime,timedelta
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from googletrans import Translator

In [2]:
# Load the cleaned tweet export and split its timestamp into separate
# date and time columns.
df = pd.read_csv('../Cleaning/Clean_Dataset.csv')
jakarta = pytz.timezone('Asia/Jakarta')
time_date = "%m/%d/%Y %H:%M"

# Parse once into a temporary Series rather than adding a helper column to
# the frame and dropping it again afterwards.
parsed_datetimes = df['Datetime'].map(lambda raw: datetime.strptime(raw, time_date))
df['date_created'] = parsed_datetimes.map(lambda stamp: stamp.date())
df['time_created'] = parsed_datetimes.map(lambda stamp: stamp.time())

In [3]:
# Total number of rows loaded from the CSV.
len(df)

75940

### Additional Cleaning

In [4]:
# Drop exact duplicate rows, then renumber the index so it is contiguous again.
df = df.drop_duplicates().reset_index(drop=True)

In [5]:
# Show full cell contents, then list the raw text of every row whose
# cleaned text is missing.
pd.set_option('display.max_colwidth', None)
df.loc[df['Clean_Text'].isnull(), 'Text']

61984    @WabiSabiNFT @HELIX_Metaverse NANI?! @dummybeeean @God_of_Pekka @morisecosoluzi1
Name: Text, dtype: object

In [6]:
# Missing-value count per column.
df.isnull().sum()

Unnamed: 0          0
Datetime            0
Tweet Id            0
Text                0
Username            0
Location        28393
Clean_Text          1
language            0
date_created        0
time_created        0
dtype: int64

In [7]:
# Count rows per detected language, excluding rows tagged 'in'.
# NOTE(review): the counts below include 'id' and add up to the full row
# count, so no row is actually tagged 'in'; Indonesian appears to be coded
# 'id' in this column -- confirm against the language-detection step.
df[df['language']!='in']['language'].value_counts()

id    36523
no     8078
en     6062
et     2697
af     2617
sl     2587
tl     1933
nl     1464
it     1401
so     1369
sq     1245
sv     1181
da     1116
ca      936
sw      921
hr      908
fi      831
cy      830
tr      743
ro      627
lt      399
pl      344
sk      267
fr      162
hu      145
pt      130
es      104
cs       92
lv       78
vi       77
de       73
Name: language, dtype: int64

In [8]:
#translate non-bahasa text to indonesian
# Shared translator instance, created lazily on first use so that translating
# a whole column does not build a new client for every row.
_TRANSLATOR = None


def trans(x, src):
    """Translate `x` from language `src` into Indonesian, best effort.

    Parameters
    ----------
    x : str
        Text to translate. Non-string values (e.g. NaN from a missing cell)
        and blank strings are returned unchanged without calling the service.
    src : str
        Source language code passed to the translator.

    Returns
    -------
    The translated text, or `x` itself when it cannot be translated
    (non-text input, or the translation call raised an error).
    """
    global _TRANSLATOR
    # Nothing to translate: avoid a pointless (and failing) remote call.
    if not isinstance(x, str) or not x.strip():
        return x
    if _TRANSLATOR is None:
        _TRANSLATOR = Translator()
    try:
        return _TRANSLATOR.translate(x, src=src, dest='id').text
    except Exception:
        # Deliberate best effort: keep the original text on any translation
        # failure. `Exception` (not a bare except) lets Ctrl-C still interrupt
        # a long-running column translation.
        return x

In [None]:
# Translate only the rows whose detected language is not Indonesian.
# The language column codes Indonesian as 'id' (see the value counts above:
# 'id' is the largest group and no row is tagged 'in'), so comparing against
# 'in' would push every row -- Indonesian ones included -- through the
# translator.
df['Clean_Text'] = df.apply(
    lambda row: row['Clean_Text'] if row['language'] == 'id'
    else trans(row['Clean_Text'], row['language']),
    axis=1,
)

In [None]:
# Persist the frame, including the translated Clean_Text, to the working directory.
# NOTE(review): the index is written too (pandas default), which presumably is
# how the 'Unnamed: 0' column seen earlier was produced -- confirm.
df.to_csv('clean_language.csv')

In [None]:
# Independent copy of the cleaned-text column.
clean_text = df['Clean_Text'].copy()

In [None]:
# Cap displayed column width at 100 characters and preview the last 15 texts.
pd.set_option('display.max_colwidth', 100)
clean_text.tail(15)

### Cleaning Data

In [None]:
def del_word(x, key_list):
    """Return `x` with every token that appears in `key_list` removed.

    Parameters
    ----------
    x : str
        Text to filter. It is split with `word_tokenize`. Non-string values
        (e.g. NaN from a missing cell) yield an empty string.
    key_list : container of str
        Tokens to drop.

    Returns
    -------
    str
        The kept tokens in their original order, each followed by a single
        space (so a non-empty result ends with a trailing space); '' when no
        token is kept.
    """
    if not isinstance(x, str):
        return ''
    # Build the whole result before returning: returning from inside the loop
    # would stop after the first kept token.
    kept = [word for word in word_tokenize(x) if word not in key_list]
    return ''.join(word + ' ' for word in kept)
def clean_tweets(text):
    """Placeholder for tweet cleaning; not implemented, so it returns None."""
    return None

def count_words(x):
    """Return the number of tokens `word_tokenize` produces for `x`."""
    return len(word_tokenize(x))

### Word Processing

In [None]:
#Create Word Dictionary
# Maps each token to the number of times it occurs across all cleaned tweets.
word_dict = {}
# Clean_Text contains a missing value (see the null check earlier) and
# word_tokenize only accepts strings, so missing entries are skipped instead
# of crashing the loop. Iterating the values directly also avoids relying on
# the index being exactly 0..n-1.
for sentence in df['Clean_Text'].dropna():
    for token in word_tokenize(str(sentence)):
        word_dict[token] = word_dict.get(token, 0) + 1

In [None]:
# Number of distinct tokens in the word dictionary.
len(word_dict)

In [None]:
# Number of distinct tokens that occur fewer than 4 times.
sum(1 for occurrences in word_dict.values() if occurrences < 4)

# Import Lexicon Data
##### Sources:

- https://github.com/louisowen6/NLP_bahasa_resources
- https://github.com/abhimantramb/elang/blob/master/word2vec/utils/swear-words.txt
- https://github.com/fajri91/InSet
- https://github.com/agusmakmun/SentiStrengthID/blob/master/id_dict/sentimentword.txt

In [None]:
# Negation words are kept in their own list and removed from the lexicon.
negasi = ['bukan','tidak','ga','gk']
lexicon = pd.read_csv('Lexicon Dictionary/modified_full_lexicon.csv')
# Keep every lexicon row whose word is not one of the negation words,
# then renumber the index.
is_negation = lexicon['word'].isin(negasi)
lexicon = lexicon[~is_negation].reset_index(drop=True)

In [None]:
# Lexicon entries as a plain Python list.
lexicon_word = lexicon['word'].to_list()
# The lexicon's 'number_of_words' column, kept as a pandas Series.
lexicon_num_words = lexicon['number_of_words']

In [None]:
#Check whether there are words in the dictionary that are not included in the lexicon
ns_words = []
#create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()
# A set gives constant-time membership tests; scanning the lexicon list for
# every dictionary word would cost len(word_dict) * len(lexicon) comparisons.
lexicon_vocab = set(lexicon_word)
for word in word_dict.keys():
    if word in lexicon_vocab:
        continue
    # Try the stemmed root form before declaring the word unknown.
    kata_dasar = stemmer.stem(word)
    if kata_dasar not in lexicon_vocab:
        ns_words.append(word)

In [None]:
#Let's take a look at what kind of words they are, starting with words that occur many times, as these are most likely not typos
# Set lookup keeps this linear; testing membership in the ns_words list for
# every dictionary key would be quadratic.
ns_words_set = set(ns_words)
len({ k:v for (k,v) in word_dict.items() if k in ns_words_set and v > 3 })

In [None]:
# Non-lexicon words that occur more than 3 times, mapped to their counts.
# Set lookup keeps this linear; testing membership in the ns_words list for
# every dictionary key would be quadratic.
ns_words_lookup = set(ns_words)
ns_words_list = { k:v for (k,v) in word_dict.items() if k in ns_words_lookup and v > 3 }

In [None]:
#print the 20 most frequent words, highest count first
sort_orders = sorted(ns_words_list.items(), key=lambda item: item[1], reverse=True)[:20]
for token, occurrences in sort_orders:
    print(token, occurrences)

In [None]:
# Independent copy of the cleaned-text column, used as input for the word cloud.
word_to_plot = df['Clean_Text'].copy()

In [None]:
# Strip the negation words from every text.
word_to_plot_1 = word_to_plot.apply(del_word, args=(negasi,))

In [None]:
#create a word cloud to see which words appear often in tweets about the metaverse
# Join the actual texts into one corpus string. str(Series) would only give
# the truncated console repr (index labels, "...", dtype footer), so the cloud
# would be built from a handful of clipped rows. Missing entries are skipped.
corpus_text = ' '.join(word_to_plot_1.dropna().astype(str))
# The class is spelled WordCloud (as imported); "Wordcloud" is undefined.
wordcloud = WordCloud(width = 800, height = 300, background_color = 'black',
                      max_words = 1000, min_font_size = 20).generate(corpus_text)
#plot the word cloud
fig = plt.figure(figsize = (8,8), facecolor = None)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

### Sentiment