### Import Libraries

In [1]:
import nltk
import sqlite3
import pandas as pd
from gensim.models import Word2Vec
import time
from datetime import timedelta

### Read database

In [2]:
# Create database connection.
cnx = sqlite3.connect('DBnews.db')

t_comments = pd.read_sql_query("SELECT * FROM t_comments", cnx)

In [3]:
t_comments

Unnamed: 0,comment_id,comment,baseline,correct_comment,comment_ptr
0,4207096,"WKWKWK, GAK SEKALIAN PAK, SELAWAT KE DPR... KA...",-1,,1
1,4207096,Mantab.. ini br dakwah yg sejati.. kl d tempat...,1,,2
2,4207096,Salut buat Gus Miftah,1,,3
3,4207096,yg nyinyir gak pernah lihat dan baca atw menge...,1,,4
4,4207096,Ada adabnya sholawat. Lebih baik ajak ke majel...,1,,5
...,...,...,...,...,...
598092,4416660,212 nggak pernah ngadain acara beginian ya.......,,,598093
598093,4416660,TBG... Sayank.. salah pilih kendraaan..�,,,598094
598094,4416660,Saya bersama TGB pilih Jokowi�,,,598095
598095,4416660,"Hijrah dari hoax ke non hoax, dari ngawur ke b...",,,598096


### Case Folding

In [4]:
t_comments['comment'] = t_comments['comment'].str.lower()

In [5]:
t_comments

Unnamed: 0,comment_id,comment,baseline,correct_comment,comment_ptr
0,4207096,"wkwkwk, gak sekalian pak, selawat ke dpr... ka...",-1,,1
1,4207096,mantab.. ini br dakwah yg sejati.. kl d tempat...,1,,2
2,4207096,salut buat gus miftah,1,,3
3,4207096,yg nyinyir gak pernah lihat dan baca atw menge...,1,,4
4,4207096,ada adabnya sholawat. lebih baik ajak ke majel...,1,,5
...,...,...,...,...,...
598092,4416660,212 nggak pernah ngadain acara beginian ya.......,,,598093
598093,4416660,tbg... sayank.. salah pilih kendraaan..�,,,598094
598094,4416660,saya bersama tgb pilih jokowi�,,,598095
598095,4416660,"hijrah dari hoax ke non hoax, dari ngawur ke b...",,,598096


### Punctuation Removal

In [6]:
t_comments['comment'] = t_comments['comment'].str.replace('[^\w\s]','')

In [7]:
t_comments

Unnamed: 0,comment_id,comment,baseline,correct_comment,comment_ptr
0,4207096,wkwkwk gak sekalian pak selawat ke dpr kan ban...,-1,,1
1,4207096,mantab ini br dakwah yg sejati kl d tempat umu...,1,,2
2,4207096,salut buat gus miftah,1,,3
3,4207096,yg nyinyir gak pernah lihat dan baca atw menge...,1,,4
4,4207096,ada adabnya sholawat lebih baik ajak ke majeli...,1,,5
...,...,...,...,...,...
598092,4416660,212 nggak pernah ngadain acara beginian yateru...,,,598093
598093,4416660,tbg sayank salah pilih kendraaan,,,598094
598094,4416660,saya bersama tgb pilih jokowi,,,598095
598095,4416660,hijrah dari hoax ke non hoax dari ngawur ke be...,,,598096


### Tokenizing

In [8]:
t_comments['comment'] = t_comments.apply(lambda row: nltk.word_tokenize(row['comment']), axis=1)

In [9]:
t_comments

Unnamed: 0,comment_id,comment,baseline,correct_comment,comment_ptr
0,4207096,"[wkwkwk, gak, sekalian, pak, selawat, ke, dpr,...",-1,,1
1,4207096,"[mantab, ini, br, dakwah, yg, sejati, kl, d, t...",1,,2
2,4207096,"[salut, buat, gus, miftah]",1,,3
3,4207096,"[yg, nyinyir, gak, pernah, lihat, dan, baca, a...",1,,4
4,4207096,"[ada, adabnya, sholawat, lebih, baik, ajak, ke...",1,,5
...,...,...,...,...,...
598092,4416660,"[212, nggak, pernah, ngadain, acara, beginian,...",,,598093
598093,4416660,"[tbg, sayank, salah, pilih, kendraaan]",,,598094
598094,4416660,"[saya, bersama, tgb, pilih, jokowi]",,,598095
598095,4416660,"[hijrah, dari, hoax, ke, non, hoax, dari, ngaw...",,,598096


In [10]:
# get 10000 comments
t_comments = t_comments[:1000]

# get tokenized comment
tok_comments = t_comments['comment']

### Preprocessing the Corpus

In [11]:
import csv

korpus = []

with open("korpus.csv", encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter=',') # change contents to floats
    for row in reader: # each row is a list
        korpus.append(row)

In [12]:
korpus

[['Persija',
  'Jakarta',
  'siap',
  'tempur',
  'menghadapi',
  'Selangor',
  'FA',
  'dalam',
  'laga',
  'ujicoba',
  '.'],
 ['Persija',
  'ingin',
  'kembali',
  'ke',
  'bentuk',
  'terbaik',
  'sebelum',
  'Liga',
  '1',
  'kembali',
  'digulirkan',
  '.'],
 ['Persija',
  'akan',
  'melawan',
  'Selangor',
  'FA',
  'di',
  'Stadion',
  'Patriot',
  'Candrabhaga',
  'malam',
  '.'],
 ['Bagi',
  'Selangor',
  ',',
  'ini',
  'adalah',
  'ujicoba',
  'keduanya',
  'di',
  'Indonesia',
  'setelah',
  'sebelumnya',
  'menjajal',
  'Madura',
  'United',
  '.'],
 ['Pada',
  'laga',
  'ujicoba',
  'di',
  'Stadion',
  'Gelora',
  'Ratu',
  'Pamelingan',
  ',',
  'Evan',
  'Dimas',
  'dkk',
  'kalah',
  'telak',
  'dengan',
  'skor',
  '1-4',
  'dari',
  'Madura',
  'United',
  '.'],
 ['Namun',
  'yang',
  'jelas',
  ',',
  'Persija',
  'mesti',
  'memperlihatkan',
  'performa',
  'apik',
  'menjelang',
  'laga',
  'melawan',
  'Borneo',
  'pada',
  '12',
  'September',
  '.'],
 ['Kami'

### Train the corpus

In [13]:
print('Training Word2Vec Model...')
start_time = time.time()

wv = Word2Vec(sentences=korpus, size=100, alpha=0.025, window=5, min_count=5,
                 max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
                 sg=0, hs=0, negative=5, cbow_mean=1)
wv.save('word2vec.model')

finish_time = time.time()
print('Finished. Elapsed time: {}'.format(timedelta(seconds=finish_time-start_time)))

Training Word2Vec Model...
Finished. Elapsed time: 0:00:27.856682


In [14]:
# most similar to 'yg' based on corpus
wv.wv.most_similar("yg")

[('anda', 0.6072468757629395),
 ('kalian', 0.541106104850769),
 ('kamu', 0.5317510962486267),
 ('Harus', 0.5300275087356567),
 ('Yang', 0.5289517045021057),
 ('baiknya', 0.526628851890564),
 ('yang', 0.521325945854187),
 ('tdk', 0.5197136998176575),
 ('Semuanya', 0.5016289949417114),
 ('toh', 0.4994879961013794)]

In [15]:
# most similar to 'gak' based on corpus
wv.wv.most_similar("gak")

[('enggak', 0.8394542336463928),
 ('ndak', 0.8165795207023621),
 ('ga', 0.7791200876235962),
 ('Nggak', 0.7657572031021118),
 ('emang', 0.7528153657913208),
 ('nggak', 0.7525509595870972),
 ('tuh', 0.7350001335144043),
 ('kulitnya', 0.7234979867935181),
 ('banget', 0.7208560109138489),
 ('gitu', 0.7177690267562866)]

### Slang corpus

In [16]:
korpus_slang = pd.read_csv("colloquial-indonesian-lexicon.csv")

In [17]:
korpus_slang

Unnamed: 0,slang,formal,In-dictionary,context,category1,category2,category3
0,woww,wow,1,wow,elongasi,0,0
1,aminn,amin,1,Selamat ulang tahun kakak tulus semoga panjang...,elongasi,0,0
2,met,selamat,1,Met hari netaas kak!? Wish you all the best @t...,abreviasi,0,0
3,netaas,menetas,1,Met hari netaas kak!? Wish you all the best @t...,afiksasi,elongasi,0
4,keberpa,keberapa,0,Birthday yg keberpa kak?,abreviasi,0,0
...,...,...,...,...,...,...,...
15001,gataunya,enggak taunya,0,Ini kaya nenek2 ya beb gataunya agnezz @yugime...,akronim,0,0
15002,gtau,enggak tau,0,Stidaknya mrka may berkarya Dan berusaha yg tr...,akronim,abreviasi,0
15003,gatau,enggak tau,0,Ih gatau malu,akronim,0,0
15004,fans2,fan-fan,0,Jkt48 adalah tempat di mana sesama fans saling...,reduplikasi,naturalisasi,0


In [18]:
#Filter data slang
korpus_slang = korpus_slang[korpus_slang.columns[0:2]]

In [19]:
korpus_slang

Unnamed: 0,slang,formal
0,woww,wow
1,aminn,amin
2,met,selamat
3,netaas,menetas
4,keberpa,keberapa
...,...,...
15001,gataunya,enggak taunya
15002,gtau,enggak tau
15003,gatau,enggak tau
15004,fans2,fan-fan


In [20]:

#Menggunakan perbandingan dengan word2vec yang ada dan edit distance untuk membuat kata menjadi kata formal
print("Processing data...")
start_time = time.time()

corr_comments = []
slang_words = []
reco_words = []
corr_word = 0
tot_word = 0
for comment in tok_comments:
    corr_comment = []
    slang_word = []
    for sub_comment in comment:
        if sub_comment in wv.wv.vocab:
            sim_slang = korpus_slang[korpus_slang['slang'] == sub_comment]
            if len(sim_slang) != 0:
                mode_word = sim_slang.formal.mode()[0]
                corr_word = corr_word + 1
                corr_comment.append(mode_word)
            else:
                corr_word = corr_word + 1
                corr_comment.append(sub_comment)
        else:
            comment_found = False
            sim_slang = korpus_slang[korpus_slang['slang'] == sub_comment]
            if len(sim_slang) != 0:
                mode_word = sim_slang.formal.mode()[0]
                corr_word = corr_word + 1
                corr_comment.append(mode_word)
                comment_found = True
            else:
                rat_words = []
                for slang in korpus_slang['slang']:
                    rat_word = nltk.edit_distance(sub_comment, slang)
                    rat_words.append(rat_word)
                min_rat = rat_words.index(min(rat_words))
                value = str(korpus_slang.formal.loc[min_rat])
                corr_words = corr_word + 1
                corr_comment.append(value)
                slang_words.append(sub_comment + " = " + value)
                comment_found = True
            if comment_found ==  False:
                corr_comment.append(sub_comment)
                slang_words.append(sub_comment + " = " + "Not Found")
        tot_word = tot_word + 1
    corr_comments.append(corr_comment)
print("Total kata yang diperbaiki : " + str(corr_word) + " dari total semua kata : " + str(tot_word))
finish_time = time.time()
print('Elapsed time: {}'.format(timedelta(seconds=finish_time-start_time)))

Processing data...
Total kata yang diperbaiki : 13855 dari total semua kata : 15675
Elapsed time: 0:12:54.299179


In [21]:
corr_comments

[['wow',
  'enggak',
  'sekalian',
  'pak',
  'selawat',
  'ke',
  'dari',
  'kan',
  'banyak',
  'koruptor',
  'disana'],
 ['mantab',
  'ini',
  'baru',
  'dakwah',
  'yang',
  'sejati',
  'kalo',
  'di',
  'tempat',
  'umum',
  'mah',
  'dah',
  'biasa',
  'ini',
  'luar',
  'biasa'],
 ['salut', 'buat', 'gus', 'meminta'],
 ['yang',
  'nyinyir',
  'enggak',
  'pernah',
  'lihat',
  'dan',
  'baca',
  'atau',
  'mengetahui',
  'sejarah',
  'dakwah',
  'maling',
  'ulang',
  'kali',
  'bagaimana',
  'ada',
  'cara',
  'demi',
  'kebaikan',
  'asal',
  'jangan',
  'saja',
  'dengan',
  'cara',
  'berzina',
  'lakukan',
  'dengan',
  'porsi',
  'dan',
  'semakin',
  'yang',
  'tepat'],
 ['ada',
  'apa-apanya',
  'sholawat',
  'lebih',
  'baik',
  'ajak',
  'ke',
  'majelis',
  'ilmu'],
 ['setiap',
  'ulama',
  'punya',
  'jalan',
  'dan',
  'cara',
  'dakwah',
  'masing-masing',
  'jika',
  'kamu',
  'melihat',
  'terbuka',
  'lawan',
  'dengan',
  'ambekan',
  'jika',
  'tidak',
  'mampu

In [22]:
slang_words

['wkwkwk = wow',
 'dpr = dari',
 'miftah = meminta',
 'walisongo = maling',
 'sunan = ulang',
 'jagasmua = bagaimana',
 'ajk = saja',
 'sikon = semakin',
 'adabnya = apa-apanya',
 'kemunkaran = terbuka',
 'tanganmukekuatanmu = ambekan',
 'mulutmu = mulut-mulut',
 'hatimu = hati-hati',
 'selemah = salihah',
 'disisipin = disini',
 'umpanin = menyarankan',
 'nocannya = menontonnya',
 'miftah = meminta',
 'fpi = tapi',
 'hti = hati',
 'beneranditerima = benaran',
 'kalangankl = jangan',
 'uas = kak',
 'guslanjutkaninilah = lanjutkan',
 'jadunya = jaman dulunya',
 'wkwkwkmantap = mantap',
 'swt = buat',
 'mantaaappppbuat = mantap',
 'dpr = dari',
 'josshhh = loh',
 'gratisan = gratis',
 'nope = apa',
 'batas2 = bahas-bahas',
 'kereennnnnnn = keren',
 'snagat = ingat',
 'katimbang = kembang',
 'sebenar = sekadar',
 'gusjustru = justru',
 'bedebah = benaran',
 'ahok = hendak',
 'subhanallah = subhanallah',
 'swt = buat',
 'jenengan = senangi',
 'salutterharu = terharu',
 'cacianluar = carany