In [1]:
import pandas as pd
import numpy as np
import re
from string import punctuation
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
from sklearn.preprocessing import OneHotEncoder

#### Mempersiapkan Dataset

In [2]:
# Load the tab-separated corpus; header=None means the first line is data,
# so the two column names are supplied explicitly.
dataset = pd.read_csv(
    'train_preprocess.tsv.txt',
    sep="\t",
    header=None,
    names=["text", "label"],
)
dataset

Unnamed: 0,text,label
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral
2,lokasi strategis di jalan sumatera bandung . t...,positive
3,betapa bahagia nya diri ini saat unboxing pake...,positive
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative
...,...,...
10995,tidak kecewa,positive
10996,enak rasa masakan nya apalagi kepiting yang me...,positive
10997,hormati partai-partai yang telah berkoalisi,neutral
10998,"pagi pagi di tol pasteur sudah macet parah , b...",negative


In [3]:
dataset.isna().sum()  # count missing values per column (author's note: all clear)

text     0
label    0
dtype: int64

#### Normalisasi Teks

In [4]:
def lowercasing(paragraph):
    """Return ``paragraph`` converted to lowercase via its ``lower()`` method."""
    lowered = paragraph.lower()
    return lowered

def menghilangkan_tandabaca(paragraph):
    """Remove every ASCII punctuation character from ``paragraph``.

    Args:
        paragraph: Text to clean.

    Returns:
        A copy of ``paragraph`` with all characters listed in
        ``string.punctuation`` deleted; other characters are untouched.
    """
    # ``punctuation`` contains regex metacharacters (``\``, ``]``, ``^``, ``-``).
    # Interpolated raw into a character class, ``\]`` is read as an escaped ``]``
    # and the backslash itself is never matched, so escape the set first.
    pattern = f'[{re.escape(punctuation)}]'
    return re.sub(pattern, '', paragraph)

In [5]:
def text_normalization(paragraph):
    """Normalize raw text for tokenization.

    Applies, in order: lowercasing, punctuation removal, and collapsing each
    run of space characters into a single space.
    """
    cleaned = menghilangkan_tandabaca(lowercasing(paragraph))
    return re.sub(r"[ ]+", r' ', cleaned)

In [49]:
# Seed the split so a fresh kernel run reproduces the same partition, and
# stratify on the label so both parts keep the corpus's class proportions.
train_data, test_data = train_test_split(
    dataset,
    random_state=0,
    stratify=dataset['label'],
)

In [50]:
# Build new frames with the normalized text instead of assigning into the
# frames returned by train_test_split (column assignment on those slices can
# raise pandas' SettingWithCopyWarning).
train_data = train_data.assign(text=train_data['text'].apply(text_normalization))
test_data = test_data.assign(text=test_data['text'].apply(text_normalization))

#### Feature Extraction

In [8]:
# Vocabulary cap left disabled (previously tried: max_features = 100000).
# The tokenizer is configured with '<UNK>' as its out-of-vocabulary token.
tokenizer = Tokenizer(oov_token='<UNK>')

In [9]:
# Fit the tokenizer on the training texts only; the test texts are not passed here.
tokenizer.fit_on_texts(train_data['text'])

In [10]:
# Encode both splits with the same fitted tokenizer.
train_data_tf = tokenizer.texts_to_sequences(train_data['text'])
test_data_tf = tokenizer.texts_to_sequences(test_data['text'])

In [51]:
# Sanity check: number of encoded samples in the training and test splits.
for encoded_split in (train_data_tf, test_data_tf):
    print(len(encoded_split))

8800
2200


In [12]:
# Pad the training sequences at the end; the resulting width is then imposed
# on the test sequences so both matrices have the same number of columns.
train_padded = pad_sequences(sequences=train_data_tf, padding='post')
max_len = train_padded.shape[1]
test_padded = pad_sequences(sequences=test_data_tf, padding='post', maxlen=max_len)

In [13]:
# Encoder used to turn the label column into one-hot vectors.
onehot = OneHotEncoder()

In [14]:
# Fit the encoder on the training labels and encode them in one step.
# The result is densified with .toarray() before being passed to model.fit.
labels = onehot.fit_transform(train_data[['label']])

#### Prepare Train & Test Dataset

#### Training Model

In [15]:
# Only the import is needed here: the model is instantiated in the cell that
# adds its layers, so creating an empty Sequential() first was redundant.
from tensorflow.keras.models import Sequential

In [16]:
# Tokenizer indices start at 1 and 0 is the padding value, so the largest
# index equals the vocabulary length. Embedding's input_dim must be
# "largest index + 1"; without the + 1 the last word is out of range.
max_features = len(tokenizer.index_word) + 1
batch_size = 64
output_dim = 64  # size of each embedding vector
labels_tmp = 32  # NOTE(review): not referenced anywhere else in the visible notebook
input_len = train_padded.shape[1]  # padded sequence length fed to the model

In [17]:
# CNN text classifier: embedding -> 1D convolution -> global max pooling ->
# small dense layer -> 3-way softmax output.
model = Sequential([
    layers.Embedding(input_dim=max_features, output_dim=output_dim, input_length=input_len),
    layers.Conv1D(128, 5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(10, activation='relu'),
    layers.Dense(3, activation='softmax'),
])

In [18]:
# Configure training: categorical cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'],)

In [19]:
# Largest row count that is an exact multiple of batch_size; rows beyond it
# are left out of training so every batch is full.
full_batches = train_padded.shape[0] // batch_size
max_index_train = full_batches * batch_size

In [20]:
# Train on the batch-aligned prefix of the data; 15% of it is held out by
# Keras for validation.
x_train = train_padded[:max_index_train]
y_train = labels.toarray()[:max_index_train]
model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=3,
    shuffle=True,
    validation_split=0.15,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x136fdf44cd0>

#### Model Testing and Evaluation

In [31]:
from sklearn.metrics import classification_report

In [33]:
# (rows, padded length) of the test matrix.
test_padded.shape

(2750, 91)

In [52]:
# Score the padded test set; the displayed output has one row of three class scores per sample.
prediction=model.predict(test_padded,batch_size=64)
prediction



array([[0.28021032, 0.05269283, 0.66709685],
       [0.25682852, 0.0469746 , 0.69619685],
       [0.28109404, 0.05517664, 0.6637293 ],
       ...,
       [0.39716232, 0.09381261, 0.5090251 ],
       [0.34028855, 0.07192573, 0.5877858 ],
       [0.4241759 , 0.10377859, 0.47204554]], dtype=float32)

In [53]:
# Decode the score rows back to label values and count them.
# NOTE(review): inverse_transform receives softmax scores rather than exact
# one-hot rows — confirm the encoder decodes these as the highest-scoring class.
len(onehot.inverse_transform(prediction))

2750

In [54]:
# Ground-truth labels of the current test split (kept as a one-column frame).
test_data[["label"]]

Unnamed: 0,label
6803,negative
1528,positive
2344,positive
6201,positive
1910,neutral
...,...
9188,positive
3050,negative
3558,positive
9832,positive


In [56]:
# Per-class precision / recall / F1 for the hold-out split.
# NOTE(review): the recorded execution counts show the split cell (In [49]) was
# re-run after test_padded was built (In [12]) and the model was trained (In [20]),
# so y_true and y_pred may come from different splits. Restart the kernel and run
# all cells in order before trusting these numbers.
print(classification_report(y_true=test_data[["label"]],y_pred=onehot.inverse_transform(prediction)))

              precision    recall  f1-score   support

    negative       0.28      0.11      0.16       840
     neutral       0.00      0.00      0.00       307
    positive       0.58      0.87      0.69      1603

    accuracy                           0.54      2750
   macro avg       0.29      0.33      0.28      2750
weighted avg       0.42      0.54      0.45      2750



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### K-Fold Cross-Validation

In [57]:
# Everything above ran smoothly, so the same steps are copied into a k-fold loop.
from sklearn.model_selection import KFold

In [58]:
# K-fold cross-validation: each fold rebuilds the label encoder, tokenizer and
# CNN from scratch on its training part and reports metrics on the held-out part.
# The loop variables are plain globals, so after the loop they hold the objects
# from the final fold.
kfold=KFold(random_state=0,shuffle=True)
batch_size=64
for train,test in kfold.split(dataset):
    # KFold yields positional indices, so rows are selected with iloc.
    # .copy() gives independent frames so the column assignment below does not
    # write into a slice of `dataset`.
    train_data=dataset.iloc[train].copy()
    test_data=dataset.iloc[test].copy()
    onehot=OneHotEncoder()
    label=onehot.fit_transform(train_data[["label"]])

    # Normalize the text of both parts with the shared helper.
    train_data['text'] = train_data['text'].apply(text_normalization)
    test_data['text'] = test_data['text'].apply(text_normalization)

    # The vocabulary is learned from the training part only.
    tokenizer=Tokenizer(oov_token="UNK")
    tokenizer.fit_on_texts(train_data["text"])
    train_data_tf=tokenizer.texts_to_sequences(train_data["text"])
    test_data_tf=tokenizer.texts_to_sequences(test_data["text"])

    # The test part is padded to the width of the training matrix.
    train_data_pad=pad_sequences(train_data_tf,padding="post")
    test_data_pad=pad_sequences(test_data_tf,padding="post",maxlen=train_data_pad.shape[1])

    # Token indices run from 1 to len(vocabulary) and 0 is the padding value,
    # so the embedding table needs len(vocabulary) + 1 rows.
    vocab_size=len(tokenizer.index_word)+1

    model=Sequential()
    model.add(layers.Embedding(vocab_size,64,input_length=train_data_pad.shape[1]))
    model.add(layers.Conv1D(128,5,activation="relu"))
    model.add(layers.GlobalMaxPool1D())
    model.add(layers.Dense(10,activation="relu"))
    # Output width follows the number of one-hot columns produced for this fold.
    model.add(layers.Dense(label.shape[1],activation="softmax"))

    model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'],)

    # Train on the largest prefix whose length is a multiple of batch_size.
    max_index_train=(train_data_pad.shape[0]//batch_size)*batch_size
    model.fit(x=train_data_pad[:max_index_train],y=label.toarray()[:max_index_train],batch_size=batch_size,epochs=3,shuffle=True)

    # Decode the predicted scores back to label values and report this fold.
    prediction=model.predict(test_data_pad,batch_size=batch_size)
    prediction=onehot.inverse_transform(prediction)
    print(classification_report(y_true=test_data[["label"]],y_pred=prediction))

Epoch 1/3
Epoch 2/3
Epoch 3/3
              precision    recall  f1-score   support

    negative       0.83      0.82      0.83       688
     neutral       0.82      0.73      0.77       240
    positive       0.90      0.93      0.92      1272

    accuracy                           0.87      2200
   macro avg       0.85      0.83      0.84      2200
weighted avg       0.87      0.87      0.87      2200

Epoch 1/3
Epoch 2/3
Epoch 3/3
              precision    recall  f1-score   support

    negative       0.81      0.84      0.82       690
     neutral       0.81      0.77      0.79       236
    positive       0.92      0.91      0.92      1274

    accuracy                           0.88      2200
   macro avg       0.85      0.84      0.84      2200
weighted avg       0.88      0.88      0.88      2200

Epoch 1/3
Epoch 2/3
Epoch 3/3
              precision    recall  f1-score   support

    negative       0.81      0.87      0.84       720
     neutral       0.66      0.65      

In [59]:
# Report for the variables left over from the loop: test_data and prediction
# are overwritten on every iteration, so this covers the final fold only.
print(classification_report(y_true=test_data[["label"]],y_pred=prediction))

              precision    recall  f1-score   support

    negative       0.82      0.85      0.83       660
     neutral       0.83      0.77      0.80       202
    positive       0.93      0.92      0.93      1338

    accuracy                           0.89      2200
   macro avg       0.86      0.85      0.85      2200
weighted avg       0.89      0.89      0.89      2200



In [65]:
# Same report as a nested dict (output_dict=True) so individual metrics can be read by key.
class_report=classification_report(y_true=test_data[["label"]],y_pred=prediction,output_dict=True)

In [67]:
# Display the report dictionary. The earlier `class_report.im` was an
# incomplete attribute access, which raises AttributeError on a dict.
class_report

{'negative': {'precision': 0.8165938864628821,
  'recall': 0.85,
  'f1-score': 0.8329621380846326,
  'support': 660},
 'neutral': {'precision': 0.8297872340425532,
  'recall': 0.7722772277227723,
  'f1-score': 0.7999999999999999,
  'support': 202},
 'positive': {'precision': 0.929811320754717,
  'recall': 0.9207772795216741,
  'f1-score': 0.9252722493428465,
  'support': 1338},
 'accuracy': 0.8859090909090909,
 'macro avg': {'precision': 0.8587308137533841,
  'recall': 0.8476848357481487,
  'f1-score': 0.8527447958091597,
  'support': 2200},
 'weighted avg': {'precision': 0.8866620606872315,
  'recall': 0.8859090909090909,
  'f1-score': 0.8860769457984483,
  'support': 2200}}

In [None]:
import json

# Re-shape the report so it is keyed by metric first and by row (class label
# or average) second.
metric_names = ('precision', 'recall', 'f1-score', 'support')
summary_rows = ['accuracy', 'macro avg', 'weighted avg']

# Per-class rows come first, in report order, followed by the two averaged rows.
row_order = [row for row in class_report if row not in summary_rows]
row_order += ['macro avg', 'weighted avg']

formatted_report = {
    metric: {row: class_report[row][metric] for row in row_order}
    for metric in metric_names
}

# Accuracy is a single number, so it sits at the top level of the result.
formatted_report['accuracy'] = class_report['accuracy']

# Serialize to JSON text.
formatted_report_json = json.dumps(formatted_report, indent=4)

# Write the JSON text to disk.
with open('formatted_classification_report.json', 'w') as json_file:
    json_file.write(formatted_report_json)


In [None]:
import pickle

# Persist the fitted label encoder and tokenizer. The `with` blocks close each
# file handle (and flush the data) even if pickling fails.
with open("pickle/onehot.pkl", 'wb') as onehot_file:
    pickle.dump(obj=onehot, file=onehot_file)
with open("pickle/tokenizer.pkl", 'wb') as tokenizer_file:
    pickle.dump(obj=tokenizer, file=tokenizer_file)

In [None]:
from tensorflow import keras
# Save the current `model` (the one left over from the final fold of the loop)
# to h5/model.h5. `keras` is imported above but not referenced in this cell.
model.save("h5/model.h5")

In [None]:
maxlen=train_data_pad.shape[1]  # padded sequence length; must be reused when padding inputs at inference time

In [None]:
# output_dict=True returns the report as a nested dict. The default return
# value is a formatted text table, which would be written to a .json file as
# one long escaped string.
report_cnn=classification_report(y_true=test_data[["label"]],y_pred=prediction,output_dict=True)

In [None]:
import json

# Write the report to disk; the `with` block closes the file handle so the
# content is flushed even if serialization raises.
with open("json/report_cnn.json", "w") as report_file:
    json.dump(report_cnn, report_file, indent=4)

In [None]:
# (rows, padded length) of the padded training matrix left over from the final fold.
train_data_pad.shape