In [53]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [54]:
import nltk
# Download the 'punkt' NLTK resource (a no-op when it is already up to date).
# Presumably required by nltk's word_tokenize, which this notebook calls later — confirm
# for the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [55]:
# Load the labelled text data (Label / Text columns), decoding the file as ISO-8859-1.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which suggests the CSV carries
# a saved index column — confirm whether index_col=0 is wanted here.
vaksin = pd.read_csv('pfizer.csv', encoding="ISO-8859-1")
vaksin.head()

Unnamed: 0,Label,Text
0,Negative,"Wiw efek pfizer kok kek gini, jd takut mo vaks..."
1,Negative,Waktu saya vaksin pfizer yg kedua.. efek vaksi...
2,Negative,vaksin booster kali ini bikin pingsan. vaksin ...
3,Negative,"Vaksin 1 Sinovac : Ngantukkk, Vaksin 2 Sinovac..."
4,Negative,"tadi pagi vaksin booster pfizer, efek pertama ..."


In [56]:
import string
import re
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

# CLEANING MESSAGE

In [57]:
def replace_hyphen(text):
    """Return ``text`` with every hyphen (``-``) turned into a single space."""
    hyphen_free = re.sub(pattern='-', repl=' ', string=text)
    return hyphen_free

# Replace hyphens in every row; note this overwrites the Text column in place.
vaksin['Text'] = vaksin['Text'].apply(replace_hyphen)

def remove_tweet_special(text):
    """Strip tweet-specific noise from ``text``.

    Steps, in order:
      1. Replace the literal two-character sequences backslash-t, backslash-n
         and backslash-u with a space, then delete any remaining backslash.
      2. Replace every non-ASCII character with ``?``.
      3. Remove mentions (``@name``), hashtags (``#tag``) and ``scheme://...``
         links, collapsing the surrounding whitespace to single spaces.
      4. Replace any leftover bare ``http://`` / ``https://`` prefix with a space.

    Args:
        text: The raw tweet text (``str``).

    Returns:
        The cleaned string.
    """
    # These target escaped sequences stored as literal text (a backslash plus a
    # letter), not real tab/newline characters.
    text = text.replace('\\t', " ").replace('\\n', " ").replace('\\u', " ").replace('\\', "")
    # The 'replace' error handler substitutes '?' for each non-ASCII character.
    text = text.encode('ascii', 'replace').decode('ascii')
    # Raw-string pattern (no invalid escape sequences). ``\w`` instead of
    # ``[A-Za-z0-9]`` so handles/hashtags containing underscores, such as
    # ``@user_name``, are removed whole instead of leaving ``_name`` behind.
    text = ' '.join(re.sub(r"([@#]\w+)|(\w+://\S+)", " ", text).split())
    # A scheme with nothing after it is not matched by the link pattern above.
    return text.replace("http://", " ").replace("https://", " ")
                
# Strip mentions, hashtags, links and non-ASCII characters from every row of Text (in place).
vaksin['Text'] = vaksin['Text'].apply(remove_tweet_special)

#remove number
def remove_number(text):
    """Delete every run of digits from ``text`` and return what is left."""
    digit_run = re.compile(r"\d+")
    return digit_run.sub("", text)

# Drop digits from every row of Text (in place).
vaksin['Text'] = vaksin['Text'].apply(remove_number)

#remove whitespace leading & trailing
def remove_whitespace_LT(text):
    """Return ``text`` with leading and trailing whitespace trimmed off."""
    trimmed = text.strip()
    return trimmed

# Trim leading/trailing whitespace from every row of Text (in place).
vaksin['Text'] = vaksin['Text'].apply(remove_whitespace_LT)

#remove multiple whitespace into single whitespace
def remove_whitespace_multiple(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Args:
        text: The string to normalise.

    Returns:
        ``text`` with each whitespace run (spaces, tabs, newlines) replaced by
        one space. Leading/trailing runs become a single space; they are not
        stripped.
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence,
    # which newer Python versions warn about at compile time.
    return re.sub(r'\s+', ' ', text)

# Collapse repeated whitespace in every row of Text (in place).
vaksin['Text'] = vaksin['Text'].apply(remove_whitespace_multiple)

# remove single char
def remove_singl_char(text):
    """Remove every ASCII letter that stands alone between word boundaries."""
    lone_letter = re.compile(r"\b[a-zA-Z]\b")
    return lone_letter.sub("", text)

# Remove stand-alone single letters from every row of Text (in place).
# NOTE(review): this runs after the whitespace-collapsing step, so each removed letter can
# leave two adjacent spaces behind in Text.
vaksin['Text'] = vaksin['Text'].apply(remove_singl_char)

#defining the function to remove punctuation
def remove_punctuation(text):
    """Return ``text`` with every character listed in ``string.punctuation`` removed."""
    drop_table = str.maketrans('', '', string.punctuation)
    return text.translate(drop_table)

# store the punctuation-free text in a new column
vaksin['clean_msg'] = vaksin['Text'].apply(remove_punctuation)
vaksin.head()

Unnamed: 0,Label,Text,clean_msg
0,Negative,"Wiw efek pfizer kok kek gini, jd takut mo vaks...",Wiw efek pfizer kok kek gini jd takut mo vaksi...
1,Negative,Waktu saya vaksin pfizer yg kedua.. efek vaksi...,Waktu saya vaksin pfizer yg kedua efek vaksin ...
2,Negative,vaksin booster kali ini bikin pingsan. vaksin ...,vaksin booster kali ini bikin pingsan vaksin p...
3,Negative,"Vaksin Sinovac : Ngantukkk, Vaksin Sinovac : e...",Vaksin Sinovac Ngantukkk Vaksin Sinovac engg...
4,Negative,"tadi pagi vaksin booster pfizer, efek pertama ...",tadi pagi vaksin booster pfizer efek pertama p...


# LOWER

In [58]:
# lower-case every cleaned message and keep it in its own column
vaksin['msg_lower'] = vaksin['clean_msg'].apply(lambda message: message.lower())
vaksin.head()

Unnamed: 0,Label,Text,clean_msg,msg_lower
0,Negative,"Wiw efek pfizer kok kek gini, jd takut mo vaks...",Wiw efek pfizer kok kek gini jd takut mo vaksi...,wiw efek pfizer kok kek gini jd takut mo vaksi...
1,Negative,Waktu saya vaksin pfizer yg kedua.. efek vaksi...,Waktu saya vaksin pfizer yg kedua efek vaksin ...,waktu saya vaksin pfizer yg kedua efek vaksin ...
2,Negative,vaksin booster kali ini bikin pingsan. vaksin ...,vaksin booster kali ini bikin pingsan vaksin p...,vaksin booster kali ini bikin pingsan vaksin p...
3,Negative,"Vaksin Sinovac : Ngantukkk, Vaksin Sinovac : e...",Vaksin Sinovac Ngantukkk Vaksin Sinovac engg...,vaksin sinovac ngantukkk vaksin sinovac engg...
4,Negative,"tadi pagi vaksin booster pfizer, efek pertama ...",tadi pagi vaksin booster pfizer efek pertama p...,tadi pagi vaksin booster pfizer efek pertama p...


# TOKEN

In [59]:
from nltk.tokenize import word_tokenize

# NLTK word tokenizer
def word_tokenize_wrapper(text):
    """Tokenize ``text`` with NLTK's ``word_tokenize`` and return its result."""
    tokens = word_tokenize(text)
    return tokens

# Tokenize every lower-cased message; each cell of the new column holds the tokenizer's output.
vaksin['token'] = vaksin['msg_lower'].apply(word_tokenize_wrapper)
vaksin.head()

Unnamed: 0,Label,Text,clean_msg,msg_lower,token
0,Negative,"Wiw efek pfizer kok kek gini, jd takut mo vaks...",Wiw efek pfizer kok kek gini jd takut mo vaksi...,wiw efek pfizer kok kek gini jd takut mo vaksi...,"[wiw, efek, pfizer, kok, kek, gini, jd, takut,..."
1,Negative,Waktu saya vaksin pfizer yg kedua.. efek vaksi...,Waktu saya vaksin pfizer yg kedua efek vaksin ...,waktu saya vaksin pfizer yg kedua efek vaksin ...,"[waktu, saya, vaksin, pfizer, yg, kedua, efek,..."
2,Negative,vaksin booster kali ini bikin pingsan. vaksin ...,vaksin booster kali ini bikin pingsan vaksin p...,vaksin booster kali ini bikin pingsan vaksin p...,"[vaksin, booster, kali, ini, bikin, pingsan, v..."
3,Negative,"Vaksin Sinovac : Ngantukkk, Vaksin Sinovac : e...",Vaksin Sinovac Ngantukkk Vaksin Sinovac engg...,vaksin sinovac ngantukkk vaksin sinovac engg...,"[vaksin, sinovac, ngantukkk, vaksin, sinovac, ..."
4,Negative,"tadi pagi vaksin booster pfizer, efek pertama ...",tadi pagi vaksin booster pfizer efek pertama p...,tadi pagi vaksin booster pfizer efek pertama p...,"[tadi, pagi, vaksin, booster, pfizer, efek, pe..."


# SPELL

In [60]:
# ------ Spelling ---------

# Load the spelling-normalisation table.
# assumes the first column holds the raw term and the second its replacement — TODO confirm
# against spelling.xlsx
spell_word = pd.read_excel("spelling.xlsx")

# Build the lookup from the first two columns, selected by position with .iloc
# (integer keys on a label-indexed row, e.g. row[0], are deprecated in pandas).
# setdefault keeps the first replacement seen when a term is listed more than once.
spell_word_dict = {}
for raw_term, replacement in zip(spell_word.iloc[:, 0], spell_word.iloc[:, 1]):
    spell_word_dict.setdefault(raw_term, replacement)

def spell_term(document):
    """Map each term in ``document`` through ``spell_word_dict``.

    Terms that have an entry are replaced by the mapped value; all other terms
    are kept unchanged. Returns a new list in the same order.
    """
    corrected = []
    for term in document:
        corrected.append(spell_word_dict.get(term, term))
    return corrected

# Apply the spelling lookup to every token list and keep the result in a new column.
vaksin['spell'] = vaksin['token'].apply(spell_term)
vaksin.head()

Unnamed: 0,Label,Text,clean_msg,msg_lower,token,spell
0,Negative,"Wiw efek pfizer kok kek gini, jd takut mo vaks...",Wiw efek pfizer kok kek gini jd takut mo vaksi...,wiw efek pfizer kok kek gini jd takut mo vaksi...,"[wiw, efek, pfizer, kok, kek, gini, jd, takut,...","[wow, efek, pfizer, kok, seperti, begini, jadi..."
1,Negative,Waktu saya vaksin pfizer yg kedua.. efek vaksi...,Waktu saya vaksin pfizer yg kedua efek vaksin ...,waktu saya vaksin pfizer yg kedua efek vaksin ...,"[waktu, saya, vaksin, pfizer, yg, kedua, efek,...","[waktu, saya, vaksin, pfizer, yang, kedua, efe..."
2,Negative,vaksin booster kali ini bikin pingsan. vaksin ...,vaksin booster kali ini bikin pingsan vaksin p...,vaksin booster kali ini bikin pingsan vaksin p...,"[vaksin, booster, kali, ini, bikin, pingsan, v...","[vaksin, booster, kali, ini, buat, pingsan, va..."
3,Negative,"Vaksin Sinovac : Ngantukkk, Vaksin Sinovac : e...",Vaksin Sinovac Ngantukkk Vaksin Sinovac engg...,vaksin sinovac ngantukkk vaksin sinovac engg...,"[vaksin, sinovac, ngantukkk, vaksin, sinovac, ...","[vaksin, sinovac, mengantuk, vaksin, sinovac, ..."
4,Negative,"tadi pagi vaksin booster pfizer, efek pertama ...",tadi pagi vaksin booster pfizer efek pertama p...,tadi pagi vaksin booster pfizer efek pertama p...,"[tadi, pagi, vaksin, booster, pfizer, efek, pe...","[tadi, pagi, vaksin, booster, pfizer, efek, pe..."


In [61]:
pip install Sastrawi

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# FILTER

In [62]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
# NOTE(review): the name `factory` is rebound to a StemmerFactory in the stemming cell below.
factory = StopWordRemoverFactory()

# Start from the stop-word list returned by the Sastrawi factory.
Sastrawi_StopWords_id = factory.get_stop_words()

# ---------------------------- manually add stopwords -----------------------------------
# additional words
Sastrawi_StopWords_id.extend(['ibu','ayah','adik','kakak','nya','yah','sih','oke','kak','deh','mah','an','ku','mu','iya','apa',
                            'gapapa','akupun','apapun','eh','kah','mengada','apanya','tante','mas','suami','si','mama','bapak',
                            'nder','budhe','kakek','nenek','mbah','wow'])

# ----------------------- add stopwords from txt file -----------------------------------
# read the stopword txt file with pandas into a single column named " "
txt_stopword = pd.read_csv("stopwords.txt", names= [" "], header = None)

# split the first row on single spaces and append those words to the stopword list
# NOTE(review): only row 0 is used — presumably the file keeps all words on one
# space-separated line; confirm against stopwords.txt.
Sastrawi_StopWords_id.extend(txt_stopword[" "][0].split(' '))

# ---------------------------------------------------------------------------------------

# Convert the list to a set: duplicates are dropped, and the membership test in
# stopwords_removal runs against a set instead of a list.
Sastrawi_StopWords_id = set(Sastrawi_StopWords_id)

def stopwords_removal(words):
    """Return the items of ``words`` that are not in ``Sastrawi_StopWords_id``, in order."""
    kept = []
    for word in words:
        if word in Sastrawi_StopWords_id:
            continue
        kept.append(word)
    return kept

# Remove stop words from every spell-corrected token list and keep the result in a new column.
vaksin['filter'] = vaksin['spell'].apply(stopwords_removal)

vaksin.head()

Unnamed: 0,Label,Text,clean_msg,msg_lower,token,spell,filter
0,Negative,"Wiw efek pfizer kok kek gini, jd takut mo vaks...",Wiw efek pfizer kok kek gini jd takut mo vaksi...,wiw efek pfizer kok kek gini jd takut mo vaksi...,"[wiw, efek, pfizer, kok, kek, gini, jd, takut,...","[wow, efek, pfizer, kok, seperti, begini, jadi...","[efek, pfizer, kok, begini, jadi, takut, mau, ..."
1,Negative,Waktu saya vaksin pfizer yg kedua.. efek vaksi...,Waktu saya vaksin pfizer yg kedua efek vaksin ...,waktu saya vaksin pfizer yg kedua efek vaksin ...,"[waktu, saya, vaksin, pfizer, yg, kedua, efek,...","[waktu, saya, vaksin, pfizer, yang, kedua, efe...","[waktu, vaksin, pfizer, kedua, efek, vaksin, l..."
2,Negative,vaksin booster kali ini bikin pingsan. vaksin ...,vaksin booster kali ini bikin pingsan vaksin p...,vaksin booster kali ini bikin pingsan vaksin p...,"[vaksin, booster, kali, ini, bikin, pingsan, v...","[vaksin, booster, kali, ini, buat, pingsan, va...","[vaksin, booster, kali, buat, pingsan, vaksin,..."
3,Negative,"Vaksin Sinovac : Ngantukkk, Vaksin Sinovac : e...",Vaksin Sinovac Ngantukkk Vaksin Sinovac engg...,vaksin sinovac ngantukkk vaksin sinovac engg...,"[vaksin, sinovac, ngantukkk, vaksin, sinovac, ...","[vaksin, sinovac, mengantuk, vaksin, sinovac, ...","[vaksin, sinovac, mengantuk, vaksin, sinovac, ..."
4,Negative,"tadi pagi vaksin booster pfizer, efek pertama ...",tadi pagi vaksin booster pfizer efek pertama p...,tadi pagi vaksin booster pfizer efek pertama p...,"[tadi, pagi, vaksin, booster, pfizer, efek, pe...","[tadi, pagi, vaksin, booster, pfizer, efek, pe...","[tadi, pagi, vaksin, booster, pfizer, efek, pe..."


# STEMM LEMM

In [63]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# create stemmer
# NOTE(review): this rebinds `factory`, which earlier held the StopWordRemoverFactory instance.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

#defining a function for stemming
def stemming(text):
  """Stem each item of ``text`` with the module-level ``stemmer`` and return the new list."""
  stems = []
  for word in text:
    stems.append(stemmer.stem(word))
  return stems

# stem every filtered token list and keep the result in its own column
vaksin['msg_stemmed'] = vaksin['filter'].apply(stemming)
vaksin.head()

Unnamed: 0,Label,Text,clean_msg,msg_lower,token,spell,filter,msg_stemmed
0,Negative,"Wiw efek pfizer kok kek gini, jd takut mo vaks...",Wiw efek pfizer kok kek gini jd takut mo vaksi...,wiw efek pfizer kok kek gini jd takut mo vaksi...,"[wiw, efek, pfizer, kok, kek, gini, jd, takut,...","[wow, efek, pfizer, kok, seperti, begini, jadi...","[efek, pfizer, kok, begini, jadi, takut, mau, ...","[efek, pfizer, kok, begini, jadi, takut, mau, ..."
1,Negative,Waktu saya vaksin pfizer yg kedua.. efek vaksi...,Waktu saya vaksin pfizer yg kedua efek vaksin ...,waktu saya vaksin pfizer yg kedua efek vaksin ...,"[waktu, saya, vaksin, pfizer, yg, kedua, efek,...","[waktu, saya, vaksin, pfizer, yang, kedua, efe...","[waktu, vaksin, pfizer, kedua, efek, vaksin, l...","[waktu, vaksin, pfizer, dua, efek, vaksin, lum..."
2,Negative,vaksin booster kali ini bikin pingsan. vaksin ...,vaksin booster kali ini bikin pingsan vaksin p...,vaksin booster kali ini bikin pingsan vaksin p...,"[vaksin, booster, kali, ini, bikin, pingsan, v...","[vaksin, booster, kali, ini, buat, pingsan, va...","[vaksin, booster, kali, buat, pingsan, vaksin,...","[vaksin, booster, kali, buat, pingsan, vaksin,..."
3,Negative,"Vaksin Sinovac : Ngantukkk, Vaksin Sinovac : e...",Vaksin Sinovac Ngantukkk Vaksin Sinovac engg...,vaksin sinovac ngantukkk vaksin sinovac engg...,"[vaksin, sinovac, ngantukkk, vaksin, sinovac, ...","[vaksin, sinovac, mengantuk, vaksin, sinovac, ...","[vaksin, sinovac, mengantuk, vaksin, sinovac, ...","[vaksin, sinovac, antuk, vaksin, sinovac, efek..."
4,Negative,"tadi pagi vaksin booster pfizer, efek pertama ...",tadi pagi vaksin booster pfizer efek pertama p...,tadi pagi vaksin booster pfizer efek pertama p...,"[tadi, pagi, vaksin, booster, pfizer, efek, pe...","[tadi, pagi, vaksin, booster, pfizer, efek, pe...","[tadi, pagi, vaksin, booster, pfizer, efek, pe...","[tadi, pagi, vaksin, booster, pfizer, efek, pe..."


# MENGGABUNGKAN (JOIN TOKENS)

In [64]:
# join a token list into one string
def parse_string(text):
  """Join the items of ``text`` (each converted with ``str``) using single spaces."""
  return ' '.join(map(str, text))

# turn every stemmed token list back into a single space-separated string
vaksin['msg_string'] = vaksin['msg_stemmed'].apply(parse_string)
vaksin.head()

Unnamed: 0,Label,Text,clean_msg,msg_lower,token,spell,filter,msg_stemmed,msg_string
0,Negative,"Wiw efek pfizer kok kek gini, jd takut mo vaks...",Wiw efek pfizer kok kek gini jd takut mo vaksi...,wiw efek pfizer kok kek gini jd takut mo vaksi...,"[wiw, efek, pfizer, kok, kek, gini, jd, takut,...","[wow, efek, pfizer, kok, seperti, begini, jadi...","[efek, pfizer, kok, begini, jadi, takut, mau, ...","[efek, pfizer, kok, begini, jadi, takut, mau, ...",efek pfizer kok begini jadi takut mau vaksin tiga
1,Negative,Waktu saya vaksin pfizer yg kedua.. efek vaksi...,Waktu saya vaksin pfizer yg kedua efek vaksin ...,waktu saya vaksin pfizer yg kedua efek vaksin ...,"[waktu, saya, vaksin, pfizer, yg, kedua, efek,...","[waktu, saya, vaksin, pfizer, yang, kedua, efe...","[waktu, vaksin, pfizer, kedua, efek, vaksin, l...","[waktu, vaksin, pfizer, dua, efek, vaksin, lum...",waktu vaksin pfizer dua efek vaksin lumayan pa...
2,Negative,vaksin booster kali ini bikin pingsan. vaksin ...,vaksin booster kali ini bikin pingsan vaksin p...,vaksin booster kali ini bikin pingsan vaksin p...,"[vaksin, booster, kali, ini, bikin, pingsan, v...","[vaksin, booster, kali, ini, buat, pingsan, va...","[vaksin, booster, kali, buat, pingsan, vaksin,...","[vaksin, booster, kali, buat, pingsan, vaksin,...",vaksin booster kali buat pingsan vaksin pakai ...
3,Negative,"Vaksin Sinovac : Ngantukkk, Vaksin Sinovac : e...",Vaksin Sinovac Ngantukkk Vaksin Sinovac engg...,vaksin sinovac ngantukkk vaksin sinovac engg...,"[vaksin, sinovac, ngantukkk, vaksin, sinovac, ...","[vaksin, sinovac, mengantuk, vaksin, sinovac, ...","[vaksin, sinovac, mengantuk, vaksin, sinovac, ...","[vaksin, sinovac, antuk, vaksin, sinovac, efek...",vaksin sinovac antuk vaksin sinovac efek vaksi...
4,Negative,"tadi pagi vaksin booster pfizer, efek pertama ...",tadi pagi vaksin booster pfizer efek pertama p...,tadi pagi vaksin booster pfizer efek pertama p...,"[tadi, pagi, vaksin, booster, pfizer, efek, pe...","[tadi, pagi, vaksin, booster, pfizer, efek, pe...","[tadi, pagi, vaksin, booster, pfizer, efek, pe...","[tadi, pagi, vaksin, booster, pfizer, efek, pe...",tadi pagi vaksin booster pfizer efek pertama p...


# NGRAM

In [65]:
# N-GRAM
def n_gram(text, n: int = 3):
    """Build the word n-grams of ``text``.

    Args:
        text: A string; it is split on whitespace into words.
        n: Number of words per n-gram (default 3).

    Returns:
        A list with one space-joined string per window of ``n`` consecutive
        words, in order of appearance. The list is empty when ``text`` has
        fewer than ``n`` words or when ``n`` is not positive.
    """
    words = text.split()
    # A non-positive window size has no meaningful n-grams; return nothing
    # rather than a list of empty strings.
    if n < 1:
        return []
    # The range is empty when there are fewer than n words, so only complete
    # windows are produced.
    return [' '.join(words[start:start + n]) for start in range(len(words) - n + 1)]

# build the word trigrams of every joined message
vaksin['msg_n_gram'] = vaksin['msg_string'].apply(lambda message: n_gram(message, n=3))
vaksin.head()

Unnamed: 0,Label,Text,clean_msg,msg_lower,token,spell,filter,msg_stemmed,msg_string,msg_n_gram
0,Negative,"Wiw efek pfizer kok kek gini, jd takut mo vaks...",Wiw efek pfizer kok kek gini jd takut mo vaksi...,wiw efek pfizer kok kek gini jd takut mo vaksi...,"[wiw, efek, pfizer, kok, kek, gini, jd, takut,...","[wow, efek, pfizer, kok, seperti, begini, jadi...","[efek, pfizer, kok, begini, jadi, takut, mau, ...","[efek, pfizer, kok, begini, jadi, takut, mau, ...",efek pfizer kok begini jadi takut mau vaksin tiga,"[efek pfizer kok, pfizer kok begini, kok begin..."
1,Negative,Waktu saya vaksin pfizer yg kedua.. efek vaksi...,Waktu saya vaksin pfizer yg kedua efek vaksin ...,waktu saya vaksin pfizer yg kedua efek vaksin ...,"[waktu, saya, vaksin, pfizer, yg, kedua, efek,...","[waktu, saya, vaksin, pfizer, yang, kedua, efe...","[waktu, vaksin, pfizer, kedua, efek, vaksin, l...","[waktu, vaksin, pfizer, dua, efek, vaksin, lum...",waktu vaksin pfizer dua efek vaksin lumayan pa...,"[waktu vaksin pfizer, vaksin pfizer dua, pfize..."
2,Negative,vaksin booster kali ini bikin pingsan. vaksin ...,vaksin booster kali ini bikin pingsan vaksin p...,vaksin booster kali ini bikin pingsan vaksin p...,"[vaksin, booster, kali, ini, bikin, pingsan, v...","[vaksin, booster, kali, ini, buat, pingsan, va...","[vaksin, booster, kali, buat, pingsan, vaksin,...","[vaksin, booster, kali, buat, pingsan, vaksin,...",vaksin booster kali buat pingsan vaksin pakai ...,"[vaksin booster kali, booster kali buat, kali ..."
3,Negative,"Vaksin Sinovac : Ngantukkk, Vaksin Sinovac : e...",Vaksin Sinovac Ngantukkk Vaksin Sinovac engg...,vaksin sinovac ngantukkk vaksin sinovac engg...,"[vaksin, sinovac, ngantukkk, vaksin, sinovac, ...","[vaksin, sinovac, mengantuk, vaksin, sinovac, ...","[vaksin, sinovac, mengantuk, vaksin, sinovac, ...","[vaksin, sinovac, antuk, vaksin, sinovac, efek...",vaksin sinovac antuk vaksin sinovac efek vaksi...,"[vaksin sinovac antuk, sinovac antuk vaksin, a..."
4,Negative,"tadi pagi vaksin booster pfizer, efek pertama ...",tadi pagi vaksin booster pfizer efek pertama p...,tadi pagi vaksin booster pfizer efek pertama p...,"[tadi, pagi, vaksin, booster, pfizer, efek, pe...","[tadi, pagi, vaksin, booster, pfizer, efek, pe...","[tadi, pagi, vaksin, booster, pfizer, efek, pe...","[tadi, pagi, vaksin, booster, pfizer, efek, pe...",tadi pagi vaksin booster pfizer efek pertama p...,"[tadi pagi vaksin, pagi vaksin booster, vaksin..."


In [66]:
# Write the frame, including every intermediate column, to CSV without the pandas index.
# NOTE(review): the list-valued columns (token, spell, filter, msg_stemmed, msg_n_gram) are
# presumably written as their string representation — confirm how downstream readers parse them.
vaksin.to_csv('nbgrampfizer.csv', index=False)