In [1]:
import pandas as pd

In [2]:
# Load the CSV file, using its "ID" column as the DataFrame index.
data = pd.read_csv("data_tugas1.csv", index_col="ID")
# Bare expression: shows the loaded frame as the cell output.
data

Unnamed: 0_level_0,Teks,Label
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
R1,Penjualnya ramah dan pengiriman cepat kereeen,Positif
R2,"dijual harga murah tapi tidak murahan, wajib d...",Positif
R3,SEBELLL!!!! mending cari penjual lain aja ??????,Negatif
R4,Jualannya murahan tapi harga mahal ????,Negatif
R5,"Lama dikirimnya, padahal udah dari lama ordernya",Negatif


# Case Folding

In [3]:
# Case folding: lowercase every value of the 'Teks' column.
# The column is overwritten, so the original casing is no longer available afterwards.
data['Teks'] = data['Teks'].str.lower()

# Tokenizing

In [4]:
import string 
import re #regex library

# import word_tokenize & FreqDist from NLTK
from nltk.tokenize import word_tokenize 
from nltk.probability import FreqDist

# ------ Tokenizing ---------

def remove_sentence_special(text):
    """Strip escape residue, non-ASCII characters, mentions, hashtags and URLs.

    Steps, in order:
      1. Literal two-character sequences backslash-t, backslash-n and
         backslash-u become a space; any remaining backslash is deleted.
      2. Every non-ASCII character is replaced by '?' (the 'replace' error
         handler), so it can be dropped by a later punctuation-removal step.
      3. @mentions, #hashtags and ``scheme://...`` links become a space and
         runs of whitespace are collapsed to a single space (also trims ends).
      4. A dangling ``http://`` / ``https://`` prefix with nothing after it
         is replaced by a space.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # remove escaped tab, new line, unicode marker and stray backslash
    text = text.replace('\\t'," ").replace('\\n'," ").replace('\\u'," ").replace('\\',"")
    # replace non ASCII (emoticon, chinese word, etc.) with '?'
    text = text.encode('ascii', 'replace').decode('ascii')
    # remove mention, link, hashtag
    # Raw string: the pattern is unchanged, but "\w", "\/" and "\S" inside a
    # normal string literal are invalid escape sequences (SyntaxWarning on 3.12+).
    text = ' '.join(re.sub(r"([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", text).split())
    # remove incomplete URL
    return text.replace("http://", " ").replace("https://", " ")
                
# Run remove_sentence_special on every review and write the result back to 'Teks'.
data['Teks'] = data['Teks'].map(remove_sentence_special)

# strip digits
def remove_number(text):
    """Return ``text`` with every run of digits deleted (nothing is inserted in its place)."""
    digit_run = re.compile(r"\d+")
    return digit_run.sub("", text)

# Drop digits from every review in 'Teks'.
data['Teks'] = data['Teks'].map(remove_number)

# strip punctuation
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # Translation table mapping each punctuation code point to None (= delete).
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_table)

# Delete punctuation from every review in 'Teks'.
data['Teks'] = data['Teks'].map(remove_punctuation)

# trim whitespace at both ends (LT = leading & trailing)
def remove_whitespace_LT(text):
    """Return ``text`` without leading or trailing whitespace."""
    without_leading = text.lstrip()
    return without_leading.rstrip()

# Trim leading and trailing whitespace of every review in 'Teks'.
data['Teks'] = data['Teks'].map(remove_whitespace_LT)

#remove multiple whitespace into single whitespace
def remove_whitespace_multiple(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is collapsed to one space as well, not
    removed.

    Parameters
    ----------
    text : str
        Text to normalise.

    Returns
    -------
    str
        ``text`` with each whitespace run replaced by ' '.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape
    # sequence (SyntaxWarning on Python 3.12+); the pattern itself is unchanged.
    return re.sub(r'\s+',' ',text)

# Collapse repeated whitespace in every review in 'Teks'.
data['Teks'] = data['Teks'].map(remove_whitespace_multiple)

# remove single char
def remove_singl_char(text):
    """Remove stand-alone single ASCII letters and tidy the spacing they leave.

    Deleting a lone letter such as the "d" in "beli d toko" leaves two
    adjacent spaces (or a leading/trailing space). This step runs after the
    whitespace-normalising steps, so the gaps are re-collapsed here to keep
    the returned text normalised.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        ``text`` without single-letter words, with whitespace runs collapsed
        to one space and no leading/trailing whitespace.
    """
    without_singles = re.sub(r"\b[a-zA-Z]\b", "", text)
    # split()/join collapses the gaps left by removed letters and trims both ends
    return " ".join(without_singles.split())

# Remove single-letter words from every review in 'Teks'.
data['Teks'] = data['Teks'].map(remove_singl_char)

# NLTK word tokenize
def word_tokenize_wrapper(text):
    """Return the result of NLTK's ``word_tokenize`` for ``text``."""
    tokens = word_tokenize(text)
    return tokens

# Tokenize every cleaned review; the token lists go into a new 'sentence_tokens' column.
data['sentence_tokens'] = data['Teks'].map(word_tokenize_wrapper)

# Filtering

In [5]:
from nltk.corpus import stopwords

# ---------------------------- manually added stopwords ---------------------------------
# Informal words and abbreviations appended to the NLTK list.
# Duplicates (e.g. 'rt') are harmless because the final container is a set.
# NOTE(review): '&amp' can only match a token that still contains '&';
# punctuation is stripped earlier in this notebook, so it looks unreachable.
additional_stopwords = [
    "yg", "dg", "rt", "dgn", "ny", "d", 'klo',
    'kalo', 'amp', 'biar', 'bikin', 'bilang',
    'gak', 'ga', 'krn', 'nya', 'nih', 'sih',
    'si', 'tau', 'tdk', 'tuh', 'utk', 'ya',
    'jd', 'jgn', 'sdh', 'aja', 'n', 't',
    'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
    '&amp', 'yah',
]

# ----------------------- NLTK Indonesian stopwords + additions -------------------------
# Despite its name, list_stopwords ends up as a set (not a list or dictionary).
list_stopwords = set(stopwords.words('indonesian')).union(additional_stopwords)


# remove stopwords from a list of tokens
def stopwords_removal(words):
    """Return the tokens of ``words`` that are not in ``list_stopwords``, in order."""
    kept = []
    for token in words:
        if token in list_stopwords:
            continue
        kept.append(token)
    return kept

# Filter stopwords out of every token list; results go into 'sentence_tokens_WSW'.
data['sentence_tokens_WSW'] = data['sentence_tokens'].map(stopwords_removal)

In [6]:
# Bare expression: shows the frame, now including the 'sentence_tokens' and
# 'sentence_tokens_WSW' columns, as the cell output.
data

Unnamed: 0_level_0,Teks,Label,sentence_tokens,sentence_tokens_WSW
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
R1,penjualnya ramah dan pengiriman cepat kereeen,Positif,"[penjualnya, ramah, dan, pengiriman, cepat, ke...","[penjualnya, ramah, pengiriman, cepat, kereeen]"
R2,dijual harga murah tapi tidak murahan wajib di...,Positif,"[dijual, harga, murah, tapi, tidak, murahan, w...","[dijual, harga, murah, murahan, wajib, dibeli]"
R3,sebelll mending cari penjual lain aja,Negatif,"[sebelll, mending, cari, penjual, lain, aja]","[sebelll, mending, cari, penjual]"
R4,jualannya murahan tapi harga mahal,Negatif,"[jualannya, murahan, tapi, harga, mahal]","[jualannya, murahan, harga, mahal]"
R5,lama dikirimnya padahal udah dari lama ordernya,Negatif,"[lama, dikirimnya, padahal, udah, dari, lama, ...","[dikirimnya, udah, ordernya]"


# Normalization

In [7]:
# Lookup table: elongated spelling -> normalised spelling.
normalizad_word_dict = {
    "kereeen": "keren",
    "sebelll": "sebel",
}

def normalized_term(document):
    """Map each token through ``normalizad_word_dict``; unknown tokens pass through unchanged."""
    normalized = []
    for token in document:
        normalized.append(normalizad_word_dict.get(token, token))
    return normalized

# Normalise every stopword-filtered token list and store it in 'data_normalized'.
data['data_normalized'] = data['sentence_tokens_WSW'].map(normalized_term)

data['data_normalized']



ID
R1     [penjualnya, ramah, pengiriman, cepat, keren]
R2    [dijual, harga, murah, murahan, wajib, dibeli]
R3                   [sebel, mending, cari, penjual]
R4                [jualannya, murahan, harga, mahal]
R5                      [dikirimnya, udah, ordernya]
Name: data_normalized, dtype: object

# Stemmer

In [8]:
# import Sastrawi package
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Create the Sastrawi stemmer once at cell level; stemmed_wrapper below reads
# the module-level `stemmer` object.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stem a single term
def stemmed_wrapper(term):
    """Return ``stemmer.stem(term)`` using the module-level Sastrawi stemmer."""
    stemmed = stemmer.stem(term)
    return stemmed

# Collect every distinct term of the normalised documents (first-seen order).
# ' ' is only a placeholder until the stem is filled in below.
term_dict = {}

for document in data['data_normalized']:
    for term in document:
        term_dict.setdefault(term, ' ')

print(len(term_dict))
print("------------------------")

# Stem each distinct term exactly once and reuse the result for the
# comparison, the report line and the lookup table (previously the stemmer
# was called up to three times per term). Only values are reassigned, so
# iterating over the dict while updating it is safe.
for term in term_dict:
    stemmed = stemmed_wrapper(term)
    if term != stemmed:
        print(term, ":", stemmed)
    term_dict[term] = stemmed


print(term_dict)
print("------------------------")


# look up the precomputed stem of each term
def get_stemmed_term(document):
    """Return the ``term_dict`` entry for each token of ``document`` (KeyError if a token is missing)."""
    stems = []
    for token in document:
        stems.append(term_dict[token])
    return stems

# Replace every normalised token by its stem and store the lists in 'data_tokens_stemmed'.
data['data_tokens_stemmed'] = data['data_normalized'].map(get_stemmed_term)
print(data['data_tokens_stemmed'])

20
------------------------
penjualnya : jual
pengiriman : kirim
dijual : jual
murahan : murah
dibeli : beli
penjual : jual
jualannya : jual
dikirimnya : kirim
ordernya : order
{'penjualnya': 'jual', 'ramah': 'ramah', 'pengiriman': 'kirim', 'cepat': 'cepat', 'keren': 'keren', 'dijual': 'jual', 'harga': 'harga', 'murah': 'murah', 'murahan': 'murah', 'wajib': 'wajib', 'dibeli': 'beli', 'sebel': 'sebel', 'mending': 'mending', 'cari': 'cari', 'penjual': 'jual', 'jualannya': 'jual', 'mahal': 'mahal', 'dikirimnya': 'kirim', 'udah': 'udah', 'ordernya': 'order'}
------------------------
ID
R1          [jual, ramah, kirim, cepat, keren]
R2    [jual, harga, murah, murah, wajib, beli]
R3                [sebel, mending, cari, jual]
R4                 [jual, murah, harga, mahal]
R5                        [kirim, udah, order]
Name: data_tokens_stemmed, dtype: object


# TF-IDF

In [15]:

# Re-join each stemmed token list into one space-separated string for the vectorizer.
data["join"] = data["data_tokens_stemmed"].map(" ".join)

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Number of terms to use: the top `max_features` terms,
# ordered by term frequency across the whole corpus.
max_features = 1000

# Feature Engineering
print ("------- TF-IDF on Tweet data -------")

# NOTE(review): per the scikit-learn docs, binary=True should make the tf part
# of tf-idf 0/1 -- confirm that is what this ranking intends.
tf_idf = TfidfVectorizer(binary=True, max_features=max_features)
tfidf_sparse = tf_idf.fit_transform(data["join"])
# Dense array of the fitted matrix; its type and shape are printed below.
tfidf_mat = tfidf_sparse.toarray()

print("TF-IDF ", type(tfidf_mat), tfidf_mat.shape)

------- TF-IDF on Tweet data -------
TF-IDF  <class 'numpy.ndarray'> (5, 15)


In [17]:
# Vocabulary terms, indexed the same way as the columns of tfidf_mat.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# fall back to it only on older releases that lack get_feature_names_out().
if hasattr(tf_idf, "get_feature_names_out"):
    terms = list(tf_idf.get_feature_names_out())
else:
    terms = tf_idf.get_feature_names()

# sum the TF-IDF weight of each term over all documents
sums = tfidf_mat.sum(axis=0)

# Pair each term with its summed weight. A dedicated name is used on purpose:
# assigning this list to `data` would overwrite the DataFrame of the same
# name and make every earlier cell fail when re-run.
term_scores = list(zip(terms, sums))

ranking = pd.DataFrame(term_scores, columns=['term','rank'])
ranking.sort_values('rank', ascending=False)

Unnamed: 0,term,rank
4,jual,1.23638
3,harga,0.922598
9,murah,0.922598
6,kirim,0.900528
7,mahal,0.617893
10,order,0.614189
13,udah,0.614189
1,cari,0.549036
8,mending,0.549036
12,sebel,0.549036
