In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pytz
import re
import nltk
import ast
import string
import itertools
import seaborn as sns
from datetime import datetime,timedelta
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [4]:
# Load the cleaned tweet dataset.
df = pd.read_csv('../Cleaning/Clean_Dataset.csv', encoding='utf-8')
jakarta = pytz.timezone('Asia/Jakarta')  # not referenced again in this cell
time_date = "%m/%d/%Y %H:%M"

# Parse the 'Datetime' strings into a temporary column, split that into
# separate date and time columns, then discard the temporary column.
df['datetime_created'] = df['Datetime'].map(lambda raw: datetime.strptime(raw, time_date))
df['date_created'] = df['datetime_created'].map(lambda stamp: stamp.date())
df['time_created'] = df['datetime_created'].map(lambda stamp: stamp.time())
df = df.drop(columns=['datetime_created'])

In [3]:
# Number of rows loaded.
df.shape[0]

41222

### Additional Cleaning

In [6]:
# Remove duplicate rows and renumber the index from 0.
df = df.drop_duplicates(ignore_index=True)

In [7]:
# How many rows have no cleaned text?
int(df['Clean_Text'].isnull().sum())

0

In [8]:
# Show the full raw text of every row whose cleaned text is missing.
pd.set_option('display.max_colwidth', None)
df.loc[df['Clean_Text'].isnull(), 'Text']

Series([], Name: Text, dtype: object)

In [9]:
# Drop rows without cleaned text and renumber the index from 0.
df = df.dropna(subset=['Clean_Text']).reset_index(drop=True)

In [10]:
# Missing-value count per column.
df.isna().sum()

                             0
Datetime                     0
Tweet Id                     0
Text                         0
Username                     0
Location                 24946
Clean_Text                   0
Unnamed: 7               39908
old_indonlp_sentiment        0
Lexicon_Sentiment            0
Lexicon_Score                0
Stop_Words_Text              0
Stemmed_Text                 0
language                     0
Translated                   0
label_score                  0
score                        0
date_created                 0
time_created                 0
Score_Scaled                 0
indonlp_sentiment            0
dtype: int64

In [12]:
# Independent copy of the cleaned-text column.
clean_text = df.loc[:, 'Clean_Text'].copy()

In [13]:
# Preview the last 15 cleaned tweets, truncating long texts at 100 characters.
pd.set_option('display.max_colwidth', 100)
clean_text.iloc[-15:]

41207                                                                pejabat sini bodoh bicara era metaverse
41208                                                                        mungkin cair kalau di metaverse
41209                   walau terus merugi meta bakal lanjutkan proyek metaverse pada tahun berita teknologi
41210    itu ciri kamu yang tidak bisa adaptasi di setiap ruang lingkup pasti ada kebiasan entah itu baik...
41211    anggaran belanja untuk perangkat lunak tapi tidak bisa digunakan dengan maksimal ya buat apa pak...
41212     dapat suntikkan us juta dari sbi dan square enix perusahaan game gumi bakal garap bisnis metaverse
41213    saya tadinya ngira memang ada seperti orang dibalik ini semua yang ngatur mereka ke dunia lain t...
41214                                                     punya dia lagi kembangkan ekosistem metaverse juga
41215    wishnutama selaku founding chairman jagat nusantara optimis bahwa kehadiran metaverse tidak menu...
41216    ekonomi sa

### Cleaning Data

In [16]:
# Use the slang dictionary and the stop-word list to clean the dataset.
# Source of the slang and stop words: https://github.com/louisowen6/NLP_bahasa_resources
_CLEANING_RESOURCES = None  # (stop_words, slang_words), filled on first use


def _load_cleaning_resources():
    """Read the stop-word list and the slang dictionary once and cache them.

    Returns:
        tuple: ``(stop_words, slang_words)`` where ``stop_words`` is a set built
        from the lines of the stop-word file and ``slang_words`` is the mapping
        parsed from the slang file with ``ast.literal_eval``.
    """
    global _CLEANING_RESOURCES
    if _CLEANING_RESOURCES is None:
        # `with` guarantees the handles are closed even if parsing fails.
        with open('../Cleaning/cleaning_source/combined_stop_words.txt','r') as stop_file:
            stop_words = set(stop_file.read().split('\n'))
        with open('../Cleaning/cleaning_source/update_combined_slang_words.txt','r') as slang_file:
            slang_words = ast.literal_eval(slang_file.read())
        _CLEANING_RESOURCES = (stop_words, slang_words)
    return _CLEANING_RESOURCES


def clean_tweets(text):
    """Normalise one tweet: strip noise, expand slang, drop stop words.

    Steps, in order: lowercase; remove URLs, hashtags and colons; replace
    everything that is not an ASCII letter with a space; tokenize; replace
    tokens found in the slang dictionary; drop stop words.

    The stop-word and slang files are read on the first call only and reused
    afterwards, so edits to those files need a kernel restart to take effect.

    Args:
        text (str): raw tweet text.

    Returns:
        str: the remaining lowercase tokens joined by single spaces.
    """
    stop_words, slang_words = _load_cleaning_resources()

    text = text.lower()
    # Remove URLs
    text = re.sub(r'https?://[^\s]+','',text)
    # Remove hashtags
    text = re.sub(r'#\w+','',text)

    # These two are deleted outright (not replaced by a space), so the
    # characters on either side are joined.
    text = re.sub(r':', '', text)
    text = re.sub(r'‚Ä¶', '', text)  # presumably a mis-decoded ellipsis -- TODO confirm
    # Replace runs of non-ASCII characters with a space
    text = re.sub(r'[^\x00-\x7F]+',' ', text)

    # Keep letters only: every other character becomes a space
    text = re.sub('[^a-zA-Z]',' ', text)

    # Collapse the resulting runs of spaces into one.
    # From here on the text holds only ASCII letters and spaces, so the former
    # follow-up substitutions for HTML-escaped tags and stray symbols could
    # never match and have been removed.
    text = re.sub("(\\d|\\W)+"," ",text)

    # Expand slang token by token. Replacing by position (not list.index)
    # ensures each token is translated exactly once, even when a replacement
    # is itself a key of the slang dictionary.
    word_tokens = [slang_words.get(w, w) for w in word_tokenize(text)]

    # Drop punctuation-only tokens and stop words
    filtered_tweet = [
        w.lower()
        for w in word_tokens
        if w not in string.punctuation and w not in stop_words
    ]

    return ' '.join(filtered_tweet)

### Word Processing

# Import Lexicon Data
##### sources:

https://github.com/louisowen6/NLP_bahasa_resources
https://github.com/abhimantramb/elang/blob/master/word2vec/utils/swear-words.txt
https://github.com/fajri91/InSet
https://github.com/agusmakmun/SentiStrengthID/blob/master/id_dict/sentimentword.txt

In [23]:
# Negation words: a lexicon hit directly preceded by one of these has its
# weight sign flipped during scoring.
negasi = ['bukan','tidak','ga','gk','g', 'ngga', 'nggak', 'no']
lexicon = pd.read_csv('Lexicon Dictionary/modified_full_lexicon.csv')

# Remove the negation words from the lexicon so they are not scored as
# ordinary sentiment words. The filter is derived from `negasi` itself so the
# two cannot drift apart (the hand-written filter tested 'gk' twice and never
# tested 'g').
lexicon = lexicon[~lexicon['word'].isin(negasi)].reset_index(drop=True)

In [24]:
# Lexicon words as a plain Python list, in the same order as the lexicon rows.
lexicon_word = lexicon['word'].tolist()
# Word count of each lexicon entry.
lexicon_num_words = lexicon['number_of_words']

### Sentiment

#### Hitung nilai kata dengan mencocokkan dengan kamus lexicon sementara membuat matrix Bag of Words

In [32]:
# Shared state for the lexicon scoring pass.
sencol = []             # Bag-of-Words column labels, in order of first appearance
senrow = np.array([])   # Bag-of-Words matrix (starts empty)
nsen = 0                # row count of senrow
sentiment_list = []     # one sentiment score per tweet

# Sastrawi stemmer, used to reduce a word to its base form.
factory = StemmerFactory()
stemmer = factory.create_stemmer()
# Record a lexicon hit: update the Bag-of-Words counts and the sentiment score.
def found_word(ind,words,word,sen,sencol,sentiment,add):
    """Register one lexicon match for the current tweet.

    Args:
        ind: index into ``words``; the token at ``ind - 1`` is checked for
            being a negation word.
        words: tokens of the current tweet.
        word: the lexicon entry that matched (must be in ``lexicon_word``).
        sen: count vector of the current tweet, parallel to ``sencol``
            (mutated in place).
        sencol: Bag-of-Words column labels (mutated in place).
        sentiment: sentiment score accumulated so far for this tweet.
        add: number of new columns created so far for this tweet.

    Returns:
        tuple: ``(sen, sencol, sentiment, add)`` after the update.
    """
    if word in sencol:
        # Already a Bag-of-Words column: bump its count.
        sen[sencol.index(word)] += 1
    else:
        # First sighting: open a new column with count 1.
        sencol.append(word)
        sen.append(1)
        add += 1

    weight = lexicon['weight'][lexicon_word.index(word)]
    # A negation word directly before the match flips its sign. The `ind > 0`
    # guard matters: without it `words[ind-1]` wraps around to the LAST token
    # when the match is the first token of the tweet.
    negated = ind > 0 and words[ind-1] in negasi
    sentiment += -weight if negated else weight

    return sen,sencol,sentiment,add

In [33]:
# Check every token of every tweet against the lexicon, accumulating the
# tweet's sentiment score and its Bag-of-Words counts.
lexicon_lookup = set(lexicon_word)  # constant-time membership tests
bow_rows = []                       # per tweet: list of (column index, count) pairs

for i in range(len(df)):
    words = word_tokenize(df['Clean_Text'][i])
    sentiment = 0
    add = 0
    # Zero counts for every column known so far; found_word appends a slot
    # whenever it meets a word that is new to sencol.
    sen = [0] * len(sencol)

    # enumerate() yields the true position of each token. Looking it up with
    # words.index(word) returned the FIRST occurrence, so a repeated word was
    # scored with the wrong neighbour.
    for ind, word in enumerate(words):
        # Is the word itself in the lexicon?
        if word in lexicon_lookup:
            sen,sencol,sentiment,add = found_word(ind,words,word,sen,sencol,sentiment,add)
            continue

        # If not, try its base form.
        kata_dasar = stemmer.stem(word)
        if kata_dasar in lexicon_lookup:
            sen,sencol,sentiment,add = found_word(ind,words,kata_dasar,sen,sencol,sentiment,add)
            continue

        # Still nothing: try joining with the preceding one or two tokens.
        # The index handed to found_word is where the phrase STARTS, so the
        # negation check looks at the token before the phrase instead of at
        # the phrase's own first word.
        if ind >= 1:
            back_1 = words[ind-1]+' '+word
            if back_1 in lexicon_lookup:
                sen,sencol,sentiment,add = found_word(ind-1,words,back_1,sen,sencol,sentiment,add)
            elif ind >= 2:
                back_2 = words[ind-2]+' '+back_1
                if back_2 in lexicon_lookup:
                    sen,sencol,sentiment,add = found_word(ind-2,words,back_2,sen,sencol,sentiment,add)

    # Keep only the non-zero counts; the dense matrix is built once below.
    bow_rows.append([(col, count) for col, count in enumerate(sen) if count])
    sentiment_list.append(sentiment)

# Build the matrix in one allocation instead of growing it with hstack/vstack
# on every tweet. Columns discovered by later tweets are 0 for earlier ones.
senrow = np.zeros((len(bow_rows), len(sencol)), dtype=int)
for row_number, entries in enumerate(bow_rows):
    for col, count in entries:
        senrow[row_number, col] = count

In [34]:
# Append the per-tweet sentiment score as the last column of the Bag-of-Words
# matrix and wrap everything in a DataFrame.
# The label is added only once, so re-running this cell does not leave sencol
# one entry longer than the matrix has columns.
if sencol[-1:] != ['lexicon_sentiment']:
    sencol.append('lexicon_sentiment')
sentiment_array = np.array(sentiment_list).reshape(senrow.shape[0],1)
sentiment_data = np.hstack((senrow,sentiment_array))
df_sen = pd.DataFrame(sentiment_data,columns = sencol)

In [None]:
# First ten rows of the Bag-of-Words / sentiment table.
df_sen.iloc[:10]

In [40]:
#df_sen = df_sen.drop(df_sen.columns[0], axis=1)

In [46]:
# First five rows of the Bag-of-Words / sentiment table.
df_sen.iloc[:5]

Unnamed: 0,kembang,terbaru,termasuk,perusahaan,dua,dunia,katanya,mau,down,melulu,...,fakboi,lebur,jotos,penghalang,terdampar,badung,suri,sembur,alim,lexicon_sentiment
0,1,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9
1,0,0,0,0,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,-2
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,28
3,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-2


In [47]:
# Side-by-side check table: cleaned text next to its lexicon score.
cek_df = (
    df[['Clean_Text']]
    .rename(columns={'Clean_Text': 'text'})
    .assign(lexicon_sentiment=df_sen['lexicon_sentiment'])
)

In [48]:
# First ten rows of the check table.
cek_df.iloc[:10]

Unnamed: 0,text,lexicon_sentiment
0,cina dan as terus bersaing mengembangkan teknologi terbaru termasuk kali ini perusahaan di kedua...,9
1,katanya mau metaverse tapi down melulu bagaimana mas zuck,-2
2,mungkin facebook sudah berencana menjadi metaverse besar secara sudah menyiapkan diem sebagai st...,28
3,bersaing dengan perusahaan kelas dunia untuk mencapai hal ini sektor publik dan swasta harus ber...,8
4,sama sama semoga selalu terhindar dari usaha tipu menipu di metaverse iya,-2
5,industri metaverse ini kenaikan sahamnya gede banget terbang so busines wise terlihat menjanjikan,12
6,industri metaverse memang sangat menjanjikan,9
7,perusahaan teknologi besar china sudah mengerjakan penawaran metaverse mereka,2
8,dunia metaverse ini dikembangkan menggunakan teknologi blockchain dimana suatu infromasi data ak...,-2
9,contoh beberapa aset tanah yang saya beli di dunia metaverse pavia seharga ada sekitar rupiah,2


In [448]:
# Store the lexicon score computed above on the main frame
# (pandas aligns the assigned Series on the row index).
df['Lexicon_Score'] = cek_df['lexicon_sentiment']

In [449]:
# Range of the lexicon scores. Series.max()/min() are vectorised and skip
# NaN, whereas the builtins iterate element by element and give
# order-dependent results when NaN is present.
print(df['Lexicon_Score'].max())
print(df['Lexicon_Score'].min())

62
-71


In [None]:
# Write the dataset back to the file the notebook loads at the top.
# index=False: the row index is a plain 0..n-1 counter; writing it would add
# a spurious 'Unnamed: 0' column every time the file is reloaded.
df.to_csv('../Cleaning/Clean_Dataset.csv', encoding='utf-8', index=False)