In [39]:
import pandas as pd
from spacy.lang.id import Indonesian
from spacy.lang.id.stop_words import STOP_WORDS

In [40]:
nlp = Indonesian()  # use directly

In [41]:
df = pd.read_csv("clean_jokowi.csv", encoding = "ISO-8859-1", names = ['tweet', 'sentiment'], na_values= ' ')
df.head()


Unnamed: 0,tweet,sentiment
,sentiment,label
0.0,PARAHNYA DIKASI TEPUK TANGAN LAGI BOS,0.0
1.0,Hai Berjumpa lagi dengan KontraS yang akan kem...,1.0
2.0,Bohong lagi bohong lagi Mikrofon dimatikan dul...,0.0
3.0,Prawobo sentil Jokowi Ugal2an Lalu Bung Adian ...,0.0


In [42]:
stopwords = list(STOP_WORDS)
stopwords

['itu',
 'menjadi',
 'semisalnya',
 'berakhir',
 'sekadarnya',
 'seketika',
 'aku',
 'lama',
 'ibarat',
 'masalah',
 'memperkirakan',
 'tentulah',
 'anda',
 'akhir',
 'tahun',
 'sebesar',
 'tepat',
 'begitulah',
 'nah',
 'jawaban',
 'lainnya',
 'panjang',
 'setinggi',
 'seringnya',
 'sama',
 'kemudian',
 'bekerja',
 'menegaskan',
 'mengingatkan',
 'jelas',
 'bersiap-siap',
 'gunakan',
 'tersebutlah',
 'haruslah',
 'boleh',
 'ataupun',
 'ditunjukkan',
 'ditanya',
 'dimisalkan',
 'seolah',
 'ternyata',
 'terdahulu',
 'kenapa',
 'masing-masing',
 'disini',
 'dulu',
 'berawal',
 'sebisanya',
 'bilakah',
 'memastikan',
 'sebaiknya',
 'sekiranya',
 'ada',
 'sayalah',
 'tanpa',
 'kamulah',
 'kamilah',
 'diminta',
 'betul',
 'tandasnya',
 'sendirinya',
 'menginginkan',
 'segala',
 'walaupun',
 'diinginkan',
 'bukan',
 'mengapa',
 'tersampaikan',
 'kembali',
 'perlukah',
 'sesudah',
 'bagai',
 'katanya',
 'wahai',
 'bagi',
 'sebuah',
 'jadi',
 'kitalah',
 'pula',
 'sesuatu',
 'kecil',
 'beginik

###### Getting Lemma and Stop words

In [43]:
docx = nlp("Budi membaca sebuah Buku")

In [44]:
# Lemmatizing of tokens
for word in docx:
    print(word.text,"Lemma =>",word.lemma_)

Budi Lemma => Budi
membaca Lemma => baca
sebuah Lemma => sebuah
Buku Lemma => Buku


In [45]:
# Filtering out Stopwords and Punctuations
for word in docx:
    if word.is_stop == False and not word.is_punct:
        if word.is_stop != True and not word.is_punct:
            print(word)

Budi
membaca
Buku


In [46]:
# Stop words and Punctuation In List Comprehension
[ word for word in docx if word.is_stop == False and not word.is_punct ]

[Budi, membaca, Buku]

In [47]:
# Use the punctuations of string module
import string
punctuations = string.punctuation

In [48]:
# Creating a Spacy Parser
parser = Indonesian()

In [49]:
def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
    return mytokens

#### Machine Learning With SKlearn

In [50]:
# ML Packages
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score 
from sklearn.base import TransformerMixin 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.ensemble import VotingClassifier
from sklearn import model_selection

  from numpy.core.umath_tests import inner1d


In [51]:
#Custom transformer using spaCy 
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    def fit(self, X, y=None, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}
class DenseTransformer(TransformerMixin):

    def transform(self, X, y=None, **fit_params):
        return X.todense()

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self

# Basic function to clean the text 
def clean_text(text):     
    return text.strip().lower()

In [52]:
# Vectorization
vectorizer = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1)) 
classifier = LinearSVC()
nbClassifier = GaussianNB()
mNbClassifier = MultinomialNB()
svmClassifier = svm.SVC()

votingClassifier = VotingClassifier(estimators=[('LinearSVC', classifier),
                                                ('MultinomialNaiveBayes', mNbClassifier), 
                                                ('SVM', svmClassifier)])

In [53]:
# Using Tfidf
tfvectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [54]:
# Splitting Data Set
from sklearn.model_selection import train_test_split
# Features and Labels
X = df['tweet']
ylabels = df['sentiment']

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.2, random_state=42)

In [56]:
# Create the  pipeline to clean, tokenize, vectorize, and classify 
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vectorizer),
                 ('to_dense', DenseTransformer()),
                 ('classifier', mNbClassifier)])

In [57]:
# Fit our data
pipe.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('cleaner', <__main__.predictors object at 0x1a189d9cc0>), ('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngra...bject at 0x1a189d9a58>), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [58]:
# Predicting with a test dataset
sample_prediction = pipe.predict(X_test)

In [59]:
# Prediction Results
# 1 = Positive review
# 0 = Negative review
for (sample,pred) in zip(X_test,sample_prediction):
    print(sample,"Prediction=>",pred)

Untuk kesekian kali nya jubir jubir jokowi di buat tak berkutik dan menahan rasa malu dan Aktor Protagonis yg melakukan Prediction=> 0.0
2 9jokowilanjut Prediction=> 1.0
TetapIndonesia JokowiLagi Prediction=> 1.0
cek gt Gara gara Iklan Videotron Jokowi Amin Dikejar Syahroni Sidang dugaan pelangga Prediction=> 0.0
Ssstt Ssstt HATTRICK Prediction=> 0.0
Kpop bagus Via valen bagus Nela karisma bagus Jokowi bagus Prabowo bagus Toko bagus Yang jelek cuman tulisan tangan saya Prediction=> 1.0
Yok bisa yok Siaran Pers Peluncuran Laporan Evaluasi 4 Tahun Pemerintahan JK di sektor HAM oleh KontraS Jumat 9 Ok Prediction=> 1.0
Ini logika yg merusak pak karena nyalahin padahal maksudnya nenbak gubernur DKI sebelumnya kan Waspada Prediction=> 0.0
JokowiLagi Prediction=> 1.0
JokowiAmin jokowimarufamin Prediction=> 0.0
Saya menyampaikan selamat kepada sahabat saya Bapak Prabowo Subianto yang merayakan ulang tahun yang ke 67 pada hari ini Ter Prediction=> 0.0
Di Sumbawa saya saksikan bantuan pemerintah

In [60]:
# Accuracy
print("Accuracy: ",pipe.score(X_test,y_test))
print("Accuracy: ",pipe.score(X_test,sample_prediction))

Accuracy:  0.7441860465116279
Accuracy:  1.0


In [61]:
# Accuracy
print("Accuracy: ",pipe.score(X_train,y_train))

Accuracy:  0.9883720930232558


In [63]:
# Another random review
# pipe.predict(["jokowi Siapa pun Cawapres intinya Jokowi harus 2 periode"])
pipe.predict(["Memilih Jokowi sebagai pemimpin dengan alasan tak ingin Prabowo berkuasa adalah hal paling menyedihkan yg pernah gw dengar"])

array(['0.0'], dtype='<U5')

In [97]:
import csv
import random
def loadCsv(filename):
    lines = csv.reader(open(filename, "r"))
    dataset = list(lines)
    for i in range(len(dataset)):
        dataset[i] = [float(x) for x in dataset[i]]
    return dataset
 
def splitDataset(dataset, splitRatio):
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))
    return [trainSet, copy]

In [76]:
import math
def separateByClass(dataset):
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if (vector[-1] not in separated):
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    return separated

In [77]:
import math
def mean(numbers):
    return sum(numbers)/float(len(numbers))

def stdev(numbers):
    avg = mean(numbers)
    variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)
    return math.sqrt(variance)

numbers = [1,2,3,4,5]
print(f"Summary of {numbers}: mean={mean(numbers)}, stdev={stdev(numbers)}")

Summary of [1, 2, 3, 4, 5]: mean=3.0, stdev=1.5811388300841898


In [78]:
def summarize(dataset):
    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
    del summaries[-1]
    return summaries

In [79]:
dataset = [[1,20,0], [2,21,1], [3,22,0]]
summary = summarize(dataset)
print(f'Attribute summaries: {summary}')

Attribute summaries: [(2.0, 1.0), (21.0, 1.0)]


In [80]:
def summarizeByClass(dataset):
    separated = separateByClass(dataset)
    summaries = {}
    for classValue, instances in separated.items():
        summaries[classValue] = summarize(instances)
    return summaries

In [81]:
dataset = [[1,20,1], [2,21,0], [3,22,1], [4,22,0]]
summary = summarizeByClass(dataset)
print(f'Summary by class value: {summary}')

Summary by class value: {1: [(2.0, 1.4142135623730951), (21.0, 1.4142135623730951)], 0: [(3.0, 1.4142135623730951), (21.5, 0.7071067811865476)]}


In [82]:
def calculateProbability(x, mean, stdev):
    exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
    return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent

In [83]:
x = 71.5
mean = 73
stdev = 6.2
probability = calculateProbability(x, mean, stdev)
print(f'Probability of belonging to this class: {probability}')

Probability of belonging to this class: 0.06248965759370005


In [84]:
def calculateClassProbabilities(summaries, inputVector):
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = inputVector[i]
            probabilities[classValue] *= calculateProbability(x, mean, stdev)
    return probabilities

In [85]:
summaries = {0:[(1, 0.5)], 1:[(20, 5.0)]}
inputVector = [1.1, '?']
probabilities = calculateClassProbabilities(summaries, inputVector)
print(f'Probabilities for each class: {probabilities}')

Probabilities for each class: {0: 0.7820853879509118, 1: 6.298736258150442e-05}


In [86]:
def predict(summaries, inputVector):
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

In [87]:
summaries = {'A':[(1, 0.5)], 'B':[(20, 5.0)]}
inputVector = [1.1, '?']
result = predict(summaries, inputVector)
print(f'Prediction: {result}')

Prediction: A


In [88]:
def getPredictions(summaries, testSet):
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i])
        predictions.append(result)
    return predictions

In [89]:
summaries = {'A':[(1, 0.5)], 'B':[(20, 5.0)]}
testSet = [[1.1, '?'], [19.1, '?']]
predictions = getPredictions(summaries, testSet)
print(f'Predictions: {predictions}')

Predictions: ['A', 'B']


In [90]:
def getAccuracy(testSet, predictions):
    correct = 0
    for x in range(len(testSet)):
        if testSet[x][-1] == predictions[x]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0

In [91]:
testSet = [[1,1,1,'a'], [2,2,2,'a'], [3,3,3,'b']]
predictions = ['a', 'a', 'a']
accuracy = getAccuracy(testSet, predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 66.66666666666666


In [92]:
tfidfResult = vectorizer.fit_transform(df.tweet)
tfCsv = pd.DataFrame(tfidfResult.toarray())
# dense = tfidfResult.todense()

tfCsv

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,869,870,871,872,873,874,875,876,877,878
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
9,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
