In [1]:
#Dependencies
import re
import string
import warnings
from itertools import chain

import pandas as pd
from pandas import DataFrame

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize.treebank import TreebankWordDetokenizer

warnings.simplefilter('ignore')

In [3]:
# Load the review dataset into `df`. Later cells read the `Ulasan` column (review text)
# and the Learnability, Efficiency, Memorability, Errors and Satisfaction columns.
df = pd.read_csv("Data skripsi/dataset_ulasan_full.csv")

In [4]:
# Slang / misspelling -> normalized form, applied during text normalization.
replacers = {
    'sy': 'saya',
    'gak': 'enggak',
    'ssah': 'susah',
    'bgt': 'banget',
    'skrng': 'sekarang',
    'yg': 'yang',
    'gw': 'gue',
    'baguuus': 'bagus',
    'mmbantu': 'membantu',
    'parahg': 'parah',
}


def word_normalization(doc):
    """Replace every whole-word occurrence of a `replacers` key in `doc`.

    Matching is restricted to whole words: a key is only replaced when it is
    not directly preceded or followed by a word character. Plain substring
    replacement corrupts longer words that merely contain a key (for example
    'enggak' contains 'gak' and 'syarat' contains 'sy').

    Whitespace and punctuation in `doc` are left untouched, and the text is
    scanned once, so a replacement is never re-processed by a later key.

    Parameters
    ----------
    doc : str
        Text to normalize. Matching is case-sensitive.

    Returns
    -------
    str
        `doc` with each matched key substituted by its `replacers` value.
    """
    if not replacers:
        return doc

    # Longest keys first so a longer key is preferred over a shorter one
    # that starts at the same position.
    ordered_keys = sorted(replacers, key=len, reverse=True)
    alternatives = '|'.join(re.escape(key) for key in ordered_keys)
    # Look-arounds instead of \b so keys that begin or end with a non-word
    # character would still be bounded correctly.
    pattern = r'(?<!\w)(?:' + alternatives + r')(?!\w)'
    return re.sub(pattern, lambda match: replacers[match.group(0)], doc)

In [5]:
def words_normalization2(sentence, dictionary):
    """Normalize a sentence word by word using a lookup table.

    The sentence is split on whitespace; each word that is a key of
    `dictionary` is swapped for its value, every other word is kept as is.
    The words are then joined back with single spaces, so runs of whitespace
    in the input collapse to one space.
    """
    normalized_words = []
    for word in sentence.split():
        normalized_words.append(dictionary.get(word, word))
    return " ".join(normalized_words)

In [6]:
# Shared resources for the cleaning functions below.

# NLTK's 'indonesian' stopword list, as a set for fast membership tests.
stop = set(stopwords.words('indonesian'))
# Characters removed from the text: ASCII punctuation and the digits 0-9.
exclude = set(string.punctuation) | set(string.digits)

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# Stemmer built by Sastrawi's StemmerFactory; `stemmer.stem(word)` is used by preprocess2.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def preprocess1(text):
    """First cleaning pass: turn one raw review string into a list of tokens.

    Steps, in order:
      1. lower-case the text;
      2. drop every character found in `exclude`;
      3. replace whole words found in `replacers` via `words_normalization2`;
      4. tokenize with NLTK's `word_tokenize`.

    Returns the list of tokens. `text` must be a string (it is lower-cased
    with `str.lower`).
    """
    lowered = text.lower()
    # NOTE(review): excluded characters are deleted, not replaced by a space,
    # so words separated only by punctuation ("bagus,cepat") are merged.
    kept_chars = [char for char in lowered if char not in exclude]
    cleaned = ''.join(kept_chars)
    normalized = words_normalization2(cleaned, replacers)
    return word_tokenize(normalized)

In [7]:
# Run preprocess1 on every value of `Ulasan`; the new column holds one token list per review.
df['preprocessed1_ulasan'] = df['Ulasan'].apply(preprocess1)

In [8]:
from nltk.tag import CRFTagger

# POS-tag every tokenized review with a CRF model loaded from a local model file.
ct = CRFTagger()
ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')

# One entry per review; each entry is indexed below as (token, tag) pairs.
pos_ulasan = ct.tag_sents(df['preprocessed1_ulasan'])

In [9]:
# POS tags whose tokens are kept for the next stage.
# (A noun-only variant would be {'NN', 'NNP'}.)
KEPT_POS_TAGS = {'VB', 'NN', 'JJ', 'RB', 'NEG', 'NNP'}

# noun_data[k] holds, in original order, the tokens of review k whose tag is kept.
noun_data = [
    [token for token, tag in tagged_review if tag in KEPT_POS_TAGS]
    for tagged_review in pos_ulasan
]

# Assign the whole column in one step. The previous element-wise
# `df['pos_ulasan'][i] = ...` was chained assignment: it writes through an
# intermediate Series, which pandas warns about and which does not update `df`
# when Copy-on-Write is enabled. Building the Series on df.index also removes
# the assumption that the index is exactly 0..n-1, and raises if the number of
# tagged reviews does not match the number of rows.
df['pos_ulasan'] = pd.Series(noun_data, index=df.index, dtype=object)

In [10]:
#def neg_handler(doc):
 #   for key in neg_word.keys():
  #      doc = doc.replace(key,neg_word[key])
   # return doc

def preprocess2(text):
    """Second cleaning pass: stopword removal and stemming of a token list.

    `text` is a list of tokens. It is detokenized into a sentence, split on
    whitespace, filtered against the `stop` set, stemmed word by word with
    `stemmer`, and finally re-tokenized with `word_tokenize`.

    Returns the resulting list of tokens.
    """
    sentence = TreebankWordDetokenizer().detokenize(text)

    kept_words = []
    for word in sentence.split():
        if word not in stop:
            kept_words.append(word)

    stems = [stemmer.stem(word) for word in kept_words]
    return word_tokenize(' '.join(stems))

In [11]:
# Run preprocess2 (stopword removal + stemming) on each token list in `pos_ulasan`.
# NOTE(review): `preprocessed2_ulasan` is not referenced again in the cells shown here.
df['preprocessed2_ulasan'] = df['pos_ulasan'].apply(preprocess2)

In [13]:
# Preview the first rows of the POS-filtered token lists.
df['pos_ulasan'].head()

0    [aplikasi, sangat, membantu, saat, pandemi, ap...
1                  [memesan, perjalanan, lebih, mudah]
2    [kai, smakin, bagus, pelayananya, krena, situa...
3              [kedepannya, lebih, menyenangkan, lagi]
4                                             [merasa]
Name: pos_ulasan, dtype: object

In [14]:
def detokenize(tokens):
    """Join a list of tokens back into one string with NLTK's Treebank detokenizer."""
    detokenizer = TreebankWordDetokenizer()
    return detokenizer.detokenize(tokens)


# Turn each POS-filtered token list back into a plain-text review.
df['preprocessed3_ulasan'] = df['pos_ulasan'].apply(detokenize)

In [15]:
# Drop reviews that became empty after POS filtering.
# The replaced Series is assigned back to the column: calling
# `.replace(..., inplace=True)` on `df[col]` mutates an intermediate object
# (chained assignment), which does not update `df` under pandas Copy-on-Write
# and would leave the empty strings in place for dropna to miss.
df['preprocessed3_ulasan'] = df['preprocessed3_ulasan'].replace("", float("NaN"))
df.dropna(subset=['preprocessed3_ulasan'], inplace=True)

In [16]:
# Overwrite the original review text with the detokenized, POS-filtered text.
df['Ulasan'] = df['preprocessed3_ulasan']

In [18]:
# Pick the five label columns plus the cleaned review text for export.
learnability = df['Learnability']
efficiency = df['Efficiency']
memorability = df['Memorability']
errors = df['Errors']
satisfaction = df['Satisfaction']
ulasan = df['Ulasan']

# Place the selected Series side by side, in this column order.
new_df = pd.concat(
    [learnability, efficiency, memorability, errors, satisfaction, ulasan],
    axis='columns',
)

In [None]:
# Write the label columns and cleaned review text to CSV, without the index column.
new_df.to_csv('Data skripsi/preprocessed_data_sentimen.csv', index=False)