In [1]:
# Load library
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text

# Constants
# Relative directory from which the training and validation CSV files are read.
DATA_PATH = "data/twitter_sentiment"

# Checking Data

As an initial check, take a look at the raw data itself.

In [2]:
# Read the training CSV. Column names are supplied explicitly, so the first
# line of the file is treated as data rather than as a header.
df_train = pd.read_csv(
    f"{DATA_PATH}/twitter_training.csv",
    names=["unk_number", "game", "sentiment", "tweet"],
)
df_train

Unnamed: 0,unk_number,game,sentiment,tweet
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


In [3]:
# Read the validation CSV with the same explicit column names as the
# training file; the first line of the file is treated as data.
df_val = pd.read_csv(
    f"{DATA_PATH}/twitter_validation.csv",
    names=["unk_number", "game", "sentiment", "tweet"],
)
df_val

Unnamed: 0,unk_number,game,sentiment,tweet
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...
...,...,...,...,...
995,4891,GrandTheftAuto(GTA),Irrelevant,⭐️ Toronto is the arts and culture capital of ...
996,4359,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
997,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...
998,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.


We're going to focus on the sentiment and tweet columns, and we're also going to drop rows that contain NaN values.

In [4]:
# Narrow the training frame to the text and its label, then discard any
# row in which either of the two is missing.
df_train = df_train.loc[:, ["tweet", "sentiment"]].dropna()

In [5]:
# Keep only the model input (tweet) and the target (sentiment), then drop rows
# with missing values so the validation frame is cleaned the same way as the
# training frame. A NaN tweet or label would otherwise reach the label mapping
# and the model unfiltered.
df_val = df_val[["tweet", "sentiment"]].dropna()
df_val

Unnamed: 0,tweet,sentiment
0,I mentioned on Facebook that I was struggling ...,Irrelevant
1,BBC News - Amazon boss Jeff Bezos rejects clai...,Neutral
2,@Microsoft Why do I pay for WORD when it funct...,Negative
3,"CSGO matchmaking is so full of closet hacking,...",Negative
4,Now the President is slapping Americans in the...,Neutral
...,...,...
995,⭐️ Toronto is the arts and culture capital of ...,Irrelevant
996,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...,Irrelevant
997,Today sucked so it’s time to drink wine n play...,Positive
998,Bought a fraction of Microsoft today. Small wins.,Positive


Let's look at which types of "sentiment" we're going to predict.

In [6]:
# List the distinct sentiment labels present in the training frame.
df_train["sentiment"].unique()

array(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype=object)

There are 4 types of "sentiment" that we're going to predict based on the tweet.

# Model creation
To create a model, you can choose between 2 approaches: build one from scratch, or take someone else's pretrained model and adapt it to our own problem (fine-tuning).  
In this case we're going to use the fine-tuning method.

In later cases you will probably use the TFHub or Transformers library for fine-tuning.
Jika menggunakan TFHub, kalian bisa melihat cara penggunaannya di bawah ini.

Kita akan menggunakan model yang telah dibuat seseorang dari link TFHub https://tfhub.dev/google/universal-sentence-encoder-multilingual/3

In [7]:
# Load the multilingual Universal Sentence Encoder (v3) from TF Hub as a Keras layer.
# trainable=False keeps the encoder weights frozen during training.
embed = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3", trainable=False)

In [8]:
# Embed two single-word inputs and inspect the shape of the result.
embed(["halo", "semua"]).shape

TensorShape([2, 512])

Dari hasil percobaan memasukkan suatu kata, kita bisa tahu bahwa akan diberikan data berukuran 512 untuk setiap kata.

In [9]:
# Embed one multi-word sentence and inspect the shape of the result.
embed(["halo nama saya kaenova"]).shape

TensorShape([1, 512])

Pada kalimat pun akan diberikan data berukuran 512

Kita akan menggunakan model embed ini untuk mengubah kalimat menjadi suatu data yang berukuran 512 dan disesuaikan dengan permasalahan yang dimiliki. 
Pada kasus ini, kita harus mengklasifikasi ke 4 kategori

In [10]:
# Classification head: maps the 512-dimensional input to 4 outputs.
# No activation is set, so the layer emits raw (unnormalized) scores.
classification_head = tf.keras.layers.Dense(4, input_shape=(512,))

Kita coba gabungkan embed dan classification_head

In [11]:
# Push one sample sentence through the encoder and then the classification
# head in a single composed call, and display the resulting tensor.
x = classification_head(embed(["halo nama saya kaenova"]))
x

<tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[0.00657169, 0.10108936, 0.03909671, 0.0846651 ]], dtype=float32)>

Jadi dehhh untuk percobaan modelnya. Sekarang kita harus wrap ini ke dalam model baru

In [12]:
# Build the final model: the sentence encoder followed by the classification head.
model = tf.keras.Sequential()
model.add(embed)
model.add(classification_head)

In [13]:
# Try the assembled model on a small batch of two strings.
model(["halo kak kae!", "bro help me"])





<tf.Tensor: shape=(2, 4), dtype=float32, numpy=
array([[-0.05643825,  0.04471326, -0.01465607,  0.03812785],
       [-0.03114427,  0.00118975,  0.07174732,  0.09019516]],
      dtype=float32)>

# Feature Engineering

Berarti yang kita masukan ke dalam model adalah  
text -> angka  
  
Masalahnya, sentimen kita pada csv masih dalam bentuk tulisan. Sebelum dimasukkan ke model, kita harus mengubahnya menjadi angka.

In [14]:
# Show the distinct label strings in the training frame's sentiment column.
df_train['sentiment'].unique()

array(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype=object)

In [15]:
# Map each sentiment label to an integer class id, numbered in listing order.
sentiment_dictionary = {
    label: class_id
    for class_id, label in enumerate(["Positive", "Neutral", "Negative", "Irrelevant"])
}

In [16]:
# Build the model-ready training frame: a new frame whose sentiment column
# holds the integer class id looked up for each label. The lookup raises
# KeyError on a label that is missing from sentiment_dictionary.
df_train_model = df_train.assign(
    sentiment=df_train["sentiment"].map(lambda label: sentiment_dictionary[label])
)
df_train_model

Unnamed: 0,tweet,sentiment
0,im getting on borderlands and i will murder yo...,0
1,I am coming to the borders and I will kill you...,0
2,im getting on borderlands and i will kill you ...,0
3,im coming on borderlands and i will murder you...,0
4,im getting on borderlands 2 and i will murder ...,0
...,...,...
74677,Just realized that the Windows partition of my...,0
74678,Just realized that my Mac window partition is ...,0
74679,Just realized the windows partition of my Mac ...,0
74680,Just realized between the windows partition of...,0


In [17]:
# Build the model-ready validation frame: a new frame whose sentiment column
# holds the integer class id looked up for each label. The lookup raises
# KeyError on a label that is missing from sentiment_dictionary.
df_val_model = df_val.assign(
    sentiment=df_val["sentiment"].map(lambda label: sentiment_dictionary[label])
)
df_val_model

Unnamed: 0,tweet,sentiment
0,I mentioned on Facebook that I was struggling ...,3
1,BBC News - Amazon boss Jeff Bezos rejects clai...,1
2,@Microsoft Why do I pay for WORD when it funct...,2
3,"CSGO matchmaking is so full of closet hacking,...",2
4,Now the President is slapping Americans in the...,1
...,...,...
995,⭐️ Toronto is the arts and culture capital of ...,3
996,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...,3
997,Today sucked so it’s time to drink wine n play...,0
998,Bought a fraction of Microsoft today. Small wins.,0


# Model Training

Nah data yang dipunya sudah sesuai dengan yang model bisa mengerti. Sekarang tinggal di train dengan loss yang disesuaikan dengan kebutuhan

In [18]:
# Be careful when choosing the loss: it has to match the data at hand.
# The model produces 4 outputs per sample, which are compared against a
# single integer label from the dataset (a classification setup).
#
# The appropriate choice is sparse categorical cross-entropy:
# https://keras.io/api/losses/probabilistic_losses/#sparsecategoricalcrossentropy-function
# from_logits=True because the final Dense layer has no activation.
# TODO: teach how to read the Keras loss function documentation
model.compile(
    optimizer="adam",
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

In [19]:
# Train the baseline model for 2 epochs on the training frame, evaluating on
# the validation frame after each epoch.
result = model.fit(
    x=df_train_model["tweet"].values,
    y=df_train_model["sentiment"].values,
    validation_data=(
        df_val_model["tweet"].values,
        df_val_model["sentiment"].values,
    ),
    batch_size=32,
    epochs=2,
)

Epoch 1/2
Epoch 2/2


In [20]:
# Only 2 layers are involved so far:
# the CNN embedding layer (not trained) and the classification head (the only part being trained).

# Let's Change the model so it has more parameters

In [21]:
# Build a deeper classifier on top of the same sentence encoder:
# 512 -> Dense 256 (ReLU) -> Dropout 0.5 -> Dense 128 (ReLU) -> Dense 4 (raw scores).
model_new = tf.keras.Sequential()
model_new.add(embed)
model_new.add(tf.keras.layers.Dense(256, activation="relu", input_shape=(512,)))
model_new.add(tf.keras.layers.Dropout(0.5))
model_new.add(tf.keras.layers.Dense(128, activation="relu"))
model_new.add(tf.keras.layers.Dense(4))

# Same optimizer, loss and metric as the baseline; from_logits=True because
# the last Dense layer has no activation.
model_new.compile(
    optimizer="adam",
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

In [22]:
# Call the new model on a single input string to check that the forward pass runs.
model_new(["test"])





<tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[0.02340199, 0.02422091, 0.00795818, 0.01294736]], dtype=float32)>

In [23]:
# First training round for the deeper model: 2 epochs, evaluated on the
# validation frame after each epoch.
result_new = model_new.fit(
    x=df_train_model["tweet"].values,
    y=df_train_model["sentiment"].values,
    validation_data=(
        df_val_model["tweet"].values,
        df_val_model["sentiment"].values,
    ),
    batch_size=32,
    epochs=2,
)

Epoch 1/2
Epoch 2/2


In [24]:
# Train for another 2 epochs, continuing from the current weights.
# result_new is reassigned, so it holds only this round's return value.
result_new = model_new.fit(
    x=df_train_model["tweet"].values,
    y=df_train_model["sentiment"].values,
    validation_data=(
        df_val_model["tweet"].values,
        df_val_model["sentiment"].values,
    ),
    batch_size=32,
    epochs=2,
)

Epoch 1/2
Epoch 2/2


In [25]:
# Train for another 2 epochs, continuing from the current weights.
# result_new is reassigned, so it holds only this round's return value.
result_new = model_new.fit(
    x=df_train_model["tweet"].values,
    y=df_train_model["sentiment"].values,
    validation_data=(
        df_val_model["tweet"].values,
        df_val_model["sentiment"].values,
    ),
    batch_size=32,
    epochs=2,
)

Epoch 1/2
Epoch 2/2


In [26]:
# Train for another 2 epochs, continuing from the current weights.
# result_new is reassigned, so it holds only this round's return value.
result_new = model_new.fit(
    x=df_train_model["tweet"].values,
    y=df_train_model["sentiment"].values,
    validation_data=(
        df_val_model["tweet"].values,
        df_val_model["sentiment"].values,
    ),
    batch_size=32,
    epochs=2,
)

Epoch 1/2
Epoch 2/2


In [27]:
# Train for another 2 epochs, continuing from the current weights.
# result_new is reassigned, so it holds only this round's return value.
result_new = model_new.fit(
    x=df_train_model["tweet"].values,
    y=df_train_model["sentiment"].values,
    validation_data=(
        df_val_model["tweet"].values,
        df_val_model["sentiment"].values,
    ),
    batch_size=32,
    epochs=2,
)

Epoch 1/2
Epoch 2/2


We can stop here.  
Kenapa? Karena terlihat loss training terus mengecil, tetapi loss validation mulai membesar, yang menandakan model mulai overfitting.