In [11]:
import pandas as pd
import numpy as np
import nltk
import Sastrawi
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from nltk.tag import CRFTagger
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

In [12]:
def normalisasi(tweet):
    normal_tw = tweet.lower() #lowercase
    normal_tw = re.sub('\s+', ' ', normal_tw) # remove extra space
    normal_tw = re.sub("@[^\s]*", "", normal_tw)
    normal_tw = re.sub('[0-9]*[+-:]*[0-9]+', '', normal_tw)
    normal_tw = normal_tw.strip() #trim depan belakang normal_tw = re.sub(r'[^\w\s]','',normal_tw) #buang punctuation
    normal_regex = re.compile(r"(.)\1{1,}", re.IGNORECASE) #regex huruf yang berulang kaya haiiii (untuk fitur unigram) normal_tw = normal_regex.sub(r"\1\1", normal_tw) #buang huruf yang berulang
    return normal_tw

In [13]:
def remove_stopwords(tweet):
    stopwords = pd.read_csv("dataset/stopwords.csv")
    special_list = ['username', 'url', 'sensitive-no']
    token = nltk.word_tokenize(tweet)
    token_afterremoval = []
    for k in token:
        if k not in stopwords and k not in special_list: 
            token_afterremoval.append(k)
    
    str_clean = ' '.join(token_afterremoval)
    return str_clean

In [14]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

def Stemming(tweet):
    token = nltk.word_tokenize(tweet)
    stem_kalimat = []
    for k in token:
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()
        stem_kata = stemmer.stem(k.encode('ascii', 'ignore'))
        stem_kalimat.append(stem_kata)
    stem_kalimat_str = ' '.join(stem_kalimat)
    return stem_kalimat_str

In [15]:
def pra_pemrosesan(list_tweet):
    tweet_clean = []
    for tw in list_tweet:
        normal_tweet = normalisasi(tw)
        nosw_tweet = remove_stopwords(normal_tweet)
#         stem_tweet = Stemming(nosw_tweet)
#         tweet_clean.append(stem_tweet)
        tweet_clean.append(nosw_tweet)
    return tweet_clean

In [8]:
def EkstraksiBoW(tweet):
    unigram = CountVectorizer(ngram_range=(1,1), max_features=2000)
    unigram_matrix = unigram.fit_transform(np.array(tweet)).todense()
    nama_fitur = unigram.get_feature_names()
    return unigram_matrix, nama_fitur

In [16]:
raw_data = pd.read_csv("dataset/train_set.csv", delimiter=",", encoding="Latin-1")
raw_data.head()

Unnamed: 0,id,sentimen,tweet
0,1,1,oks kak semangat ya kalian kalian
1,2,0,sekarang harus kaya orang bodoh lagi bodoh sangat
2,3,1,"Begitu diumumkan lulus 100%, mereka semua suju..."
3,4,0,[USERNAME] [USERNAME] Katanya Bapak Reformasi ...
4,5,0,macet macetan perut kosong akhirnya mampir dah...


In [17]:
raw_tweet = raw_data['tweet']
clean_tweet = pra_pemrosesan(raw_tweet)
clean_tweet

label = raw_data['sentimen'].tolist()

In [31]:
unigram_feat, feat_name = EkstraksiBoW(clean_tweet)
print(unigram_feat)
print(feat_name)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
['_â', 'aamiin', 'abal', 'abang', 'abis', 'acara', 'aceh', 'ad', 'ada', 'adalah', 'adanya', 'adek', 'adem', 'adik', 'adil', 'admin', 'aduh', 'ae', 'agak', 'agama', 'agar', 'agung', 'ah', 'ahok', 'ahsan', 'aib', 'aing', 'air', 'aj', 'aja', 'ajah', 'ajak', 'ajar', 'aje', 'akan', 'akhir', 'akhirat', 'akhirnya', 'akrab', 'aktif', 'aktivis', 'aku', 'akun', 'akunya', 'al', 'ala', 'alah', 'alam', 'alamat', 'alasan', 'alay', 'album', 'alhamdulilah', 'alhamdulillah', 'ali', 'alias', 'all', 'allah', 'alloh', 'always', 'ama', 'aman', 'amanah', 'amat', 'ambil', 'amin', 'amit', 'amp', 'ampe', 'ampun', 'an', 'anak', 'anaknya', 'and', 'anda', 'aneh', 'anggap', 'anggota', 'anjing', 'anjir', 'antara', 'anti', 'antri', 'ap', 'apa', 'apaan', 'apakah', 'apalagi', 'apan', 'apapun', 'api', 'aplikasi', 'aq', 'arab', 'arah', 'are', 'army', 'arti', 'artinya', 'artis', 'asal', 'asik', 'asing',

In [18]:
raw_data_tester = pd.read_csv("dataset/test_set.csv", delimiter=",", encoding="Latin-1")
raw_data_tester.head()

Unnamed: 0,test_ID,tweet
0,0,Jadi wanita jangan suka menghancurkan hubungan...
1,1,sombong apanya kalau sms saja dibls terus
2,2,apadah kamu :p cie cie baik kamu cie bebe cie ...
3,3,tdrlah besok medical check up semoga lancar â?...
4,4,crew serbu bsm seru (at bank syariah mandiri b...


In [19]:
raw_tweet_tester = raw_data_tester['tweet']
clean_tweet_tester = pra_pemrosesan(raw_tweet_tester)
clean_tweet_tester

['jadi wanita jangan suka menghancurkan hubungan orang . jgn bangga berhasil merusak kebahagian orang . silahkan saja , tapi ga berkah bahagianya nanti hehe .',
 'sombong apanya kalau sms saja dibls terus',
 'apadah kamu : p cie cie baik kamu cie bebe cie kiwkiw ; )',
 'tdrlah besok medical check up semoga lancar â ? º wml',
 'crew serbu bsm seru ( at bank syariah mandiri bekasi ) [ pic ] â ? ?',
 "sian amat ditelantarin : '| tapi semua akan indah pada waktunya nge : ' ) loh",
 'dan dirikanlah sembahyang tunaikanlah zakat dan tatlah kepada rasul supaya kamu diberi rahmat ( )',
 'kamu pikir saya bandar bayarin makan ! evil dead pokoknya star trek keren tapi',
 'tidak-kreatif ambil kutipan orang wkwk kalau suka ya ungkapin saja dari pada ditikung ama',
 'ya jangan dibahas di twiter juga kali ven teman saya tuh wkwk yap buktinya dia masih sayang sama mantanya han',
 'oke beb later on',
 'jln jatibaru , bagian dari wilayah tn abang.pengaturan wilayah tgg jwb dan wwnang gub.tng abng soal ru

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Create a vectorizer to convert tweets to a numeric matrix of TF-IDF features
vectorizer = TfidfVectorizer(max_features = 6600)

# Create and train a Classifier using the train set
clf = LogisticRegression()
model = clf.fit(vectorizer.fit_transform(clean_tweet), label)

# Predictions on the test set
predictions = clf.predict(vectorizer.transform(clean_tweet_tester))

id_target = raw_data_tester['test_ID']
df_out = pd.DataFrame({'id':id_target, 'sentimen':predictions})
df_out.to_csv('submission_tfidf.csv', index=False, header=False)

submission = pd.read_csv("submission_tfidf.csv", delimiter=",", encoding="Latin-1")
submission

Unnamed: 0,0,0.1
0,1,0
1,2,1
2,3,1
3,4,1
4,5,1
5,6,1
6,7,1
7,8,0
8,9,1
9,10,1


In [66]:
from sklearn.naive_bayes import MultinomialNB

vectorizer = TfidfVectorizer(max_features = 5500)

clf = MultinomialNB()
model = clf.fit(vectorizer.fit_transform(clean_tweet), label)

# predictions = clf.predict(clean_tweet_tester)
predictions = clf.predict(vectorizer.transform(clean_tweet_tester))
print(predictions)

id_target = raw_data_tester['test_ID']
df_out = pd.DataFrame({'id':id_target, 'sentimen':predictions})
df_out.to_csv('submission_multinomial_unigram.csv', index=False, header=False)

submission = pd.read_csv("submission_multinomial_unigram.csv", delimiter=",", encoding="Latin-1")
submission

[1 0 1 ... 0 1 0]


Unnamed: 0,0,1
0,1,0
1,2,1
2,3,1
3,4,1
4,5,1
5,6,1
6,7,1
7,8,0
8,9,1
9,10,1


In [70]:
from sklearn.naive_bayes import MultinomialNB

vectorizer = TfidfVectorizer(max_features = 5500)

clf = MultinomialNB()
model =clf.fit(vectorizer.fit_transform(clean_tweet), label)

predictions = model.predict(vectorizer.transform(clean_tweet_tester))

print(predictions)

id_target = raw_data_tester['test_ID']
df_out = pd.DataFrame({'id':id_target, 'sentimen':predictions})
df_out.to_csv('submission_tfidf_multinomial.csv', index=False, header=False)

submission = pd.read_csv("submission_tfidf_multinomial.csv", delimiter=",", encoding="Latin-1")
submission

[1 0 1 ... 0 1 0]


Unnamed: 0,0,1
0,1,0
1,2,1
2,3,1
3,4,1
4,5,1
5,6,1
6,7,1
7,8,0
8,9,1
9,10,1


In [98]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm

vectorizer = TfidfVectorizer(max_features = 6600)

SVM = svm.SVC(C=1.0, kernel='linear', degree=4)
model = SVM.fit(vectorizer.fit_transform(clean_tweet), label)

predictions = model.predict(vectorizer.transform(clean_tweet_tester))
print(predictions)

id_target = raw_data_tester['test_ID']
df_out = pd.DataFrame({'id':id_target, 'sentimen':predictions})
df_out.to_csv('submission_tfidf_svm.csv', index=False, header=False)

submission = pd.read_csv("submission_tfidf_svm.csv", delimiter=",", encoding="Latin-1")
submission

[0 0 1 ... 0 1 0]


Unnamed: 0,0,0.1
0,1,0
1,2,1
2,3,1
3,4,1
4,5,1
5,6,1
6,7,1
7,8,0
8,9,1
9,10,1


In [86]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm

cv = CountVectorizer(ngram_range=(1,1), max_features=6000)
unigram = cv.fit(np.array(clean_tweet))

matrix_train = unigram.transform(clean_tweet)
matrix_tester = unigram.transform(clean_tweet_tester)

SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
model = SVM.fit(matrix_train, label)

predictions = model.predict(matrix_tester)
print(predictions)

id_target = raw_data_tester['test_ID']
df_out = pd.DataFrame({'id':id_target, 'sentimen':predictions})
df_out.to_csv('submission_unigram_svm.csv', index=False, header=False)

submission = pd.read_csv("submission_unigram_svm.csv", delimiter=",", encoding="Latin-1")
submission

[1 0 1 ... 0 1 0]


Unnamed: 0,0,1
0,1,0
1,2,1
2,3,1
3,4,1
4,5,1
5,6,1
6,7,1
7,8,0
8,9,1
9,10,1
