In [1]:
import numpy as np
import nltk
import pandas as pd

In [2]:
# Download the NLTK 'stopwords' corpus; it is read by stopwords.words() in the next cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Karunia
[nltk_data]     Perjuangan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
from nltk.corpus import stopwords
# Start from NLTK's Indonesian stopword list, copied into a plain list so that
# extra words can be appended to it afterwards.
idn_stopwords = list(stopwords.words('indonesian'))


In [4]:
# Extra informal / abbreviated tokens to treat as stopwords on top of the NLTK list.
# NOTE: '&amp' can never match inside preprocess(), because remove_nonaplhanumeric()
# strips '&' before remove_stopword() runs; 'amp' is the entry that catches it.
extra_stopwords = ["yg", "dg", "rt", "dgn", "ny", "d", 'klo',
                   'kalo', 'amp', 'biar', 'bikin', 'bilang',
                   'gak', 'ga', 'gue', 'gw', 'krn', 'nya', 'nih', 'sih',
                   'si', 'tau', 'tdk', 'tuh', 'utk', 'ya', 'kau', 'ku', 'terus', 'trs',
                   'jd', 'jgn', 'sdh', 'aja', 'n', 't', 'tp', 'tpi', 'bgt',
                   'nyg', 'hehe', 'pen', 'u', 'nan', 'loh',
                   '&amp', 'yah']

# Append only the words that are not present yet, so re-running this cell does not
# keep growing idn_stopwords with duplicates (the original listed 'rt' twice and
# re-appended the whole list on every execution).
for word in extra_stopwords:
    if word not in idn_stopwords:
        idn_stopwords.append(word)

In [5]:
def check_label(label1:str,label2:str) -> int:
    """Return 1 when both labels compare equal, otherwise 0."""
    return 1 if label1 == label2 else 0


In [6]:
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

def remove_unnecessary_char(text):
  """Replace dataset placeholder tokens with a space and collapse space runs.

  The literal markers [USERNAME], [URL] and [SENSITIVE-NO] are each replaced by a
  single space, after which any run of two or more spaces is squeezed to one.
  Leading/trailing single spaces are left in place.

  Args:
    text: input string.

  Returns:
    The cleaned string.
  """
  # Raw strings: the bracket escapes are regex escapes, not Python string escapes
  # (non-raw "\[" is an invalid escape sequence and warns on recent Python versions).
  text = re.sub(r"\[USERNAME\]", " ", text)
  text = re.sub(r"\[URL\]", " ", text)
  text = re.sub(r"\[SENSITIVE-NO\]", " ", text)
  text = re.sub(r"  +", " ", text)
  return text

def preprocess_tweet(text):
  """Strip tweet-specific noise: newlines, @mentions, URLs and slashes.

  Each removed element is replaced by a space and runs of two or more spaces are
  collapsed at the end. The result is not stripped, so a single leading or
  trailing space may remain.

  Args:
    text: raw tweet text.

  Returns:
    The cleaned tweet text.
  """
  # All patterns are raw strings so that regex escapes such as "\w" / "\s" / "\."
  # are not parsed as (invalid) Python string escapes.
  text = re.sub(r'\n', ' ', text)  # Remove every newline
  text = re.sub(r'^(@\w+ ?)+', ' ', text)  # Remove the run of mentions at the start
  text = re.sub(r'@\w+', ' ', text)  # Remove every remaining username
  # Remove every URL. The third alternative is kept for backward compatibility; it
  # only adds the malformed "htt://" prefix on top of what "https?://" matches.
  text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))', ' ', text)
  text = re.sub(r'/', ' ', text)  # Treat slashes as word separators
  text = re.sub(r'  +', ' ', text)  # Remove extra spaces
  return text
    
def remove_nonaplhanumeric(text):
  """Replace every run of characters outside 0-9, a-z and A-Z with one space."""
  non_alnum_run = re.compile('[^0-9a-zA-Z]+')
  return non_alnum_run.sub(' ', text)

def remove_stopword(text):
  """Drop every space-separated token that appears in idn_stopwords.

  The text is split on single spaces, stopword tokens are removed, the remaining
  tokens are re-joined with single spaces, space runs are collapsed and the
  result is stripped.

  Args:
    text: input string (expected to be lower-cased already, since the comparison
      against idn_stopwords is an exact match).

  Returns:
    The text without stopwords.
  """
  # Build a set once per call: membership tests become O(1) instead of a linear
  # scan of the whole stopword list for every token.
  stopword_set = set(idn_stopwords)
  kept = [word for word in text.split(' ') if word not in stopword_set]
  text = re.sub('  +', ' ', ' '.join(kept))
  return text.strip()
  

def sastrawi_stemming(text):
  """Stem Indonesian text with the Sastrawi stemmer.

  The stemmer is created on the first call and cached on the function object, so
  later calls reuse it instead of constructing a new StemmerFactory and stemmer
  for every single tweet.

  Args:
    text: input string.

  Returns:
    The stemmed string as returned by the stemmer's stem() method.
  """
  stemmer = getattr(sastrawi_stemming, "_stemmer", None)
  if stemmer is None:
    stemmer = StemmerFactory().create_stemmer()
    sastrawi_stemming._stemmer = stemmer
  return stemmer.stem(text)

In [7]:
def preprocess(text):
  """Run the full text-cleaning pipeline on one tweet.

  Steps, in order: tweet noise removal, placeholder removal, lower-casing,
  non-alphanumeric removal, stopword removal and stemming.
  """
  cleaned = remove_unnecessary_char(preprocess_tweet(text))
  normalized = remove_nonaplhanumeric(cleaned.lower())
  return sastrawi_stemming(remove_stopword(normalized))

Memberi indeks nilai 0 atau 1 pada setiap jenis label

In [8]:
SAMPLE_SEED = 42          # fixed seed so the balanced subsample is the same on every run
SAMPLES_PER_LABEL = 500   # number of tweets drawn from each emotion label

train_df_unbalanced = pd.read_csv('./emot_emotion-twitter/train_preprocess.csv')

# Balance the classes by drawing the same number of rows from every label.
# GroupBy.sample keeps all columns (including 'label') and accepts a seed, unlike
# the unseeded groupby(...).apply(lambda x: x.sample(500)) it replaces.
train_df = (
    train_df_unbalanced
    .groupby('label')
    .sample(n=SAMPLES_PER_LABEL, random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)
train_df.value_counts('label')

label
anger      500
fear       500
happy      500
love       500
sadness    500
dtype: int64

In [9]:
# NOTE: this overwrites the raw text in place, so re-running the cell applies
# preprocess() to already-preprocessed tweets.
train_df['tweet'] = train_df['tweet'].apply(preprocess)

# One binary (0/1) indicator column per emotion, used as the target of the
# corresponding one-vs-rest classifier. A vectorised comparison replaces the
# row-wise DataFrame.apply and yields the same integer values.
for emotion in ['happy','anger','fear','love','sadness']:
    train_df[emotion] = (train_df['label'] == emotion).astype('int64')

In [10]:
def build_freqs(df):
    """Count how often each token occurs under each label.

    Args:
        df: DataFrame with a 'label' column and a 'tweet' column of strings.

    Returns:
        dict mapping (label, token) to the number of occurrences, where tokens
        come from splitting each tweet on single spaces.
    """
    counts = {}
    for label, tweet in zip(df['label'], df['tweet']):
        for token in tweet.split(' '):
            key = (label, token)
            counts[key] = counts.get(key, 0) + 1
    return counts

# (label, token) -> occurrence count over the balanced training set; read as a
# global lookup table by extract_features().
freqs = build_freqs(train_df)



In [11]:
# (label, token) pairs that occur more than 30 times
[pair for pair, count in freqs.items() if count > 30]

[('anger', 'salah'),
 ('anger', 'udah'),
 ('anger', 'orang'),
 ('anger', 'gitu'),
 ('anger', 'lu'),
 ('fear', 'rem'),
 ('fear', 'gimana'),
 ('fear', 'jalan'),
 ('fear', 'takut'),
 ('fear', 'orang'),
 ('fear', 'ngeri'),
 ('fear', 'pake'),
 ('fear', 'banget'),
 ('fear', 'gitu'),
 ('fear', 'udah'),
 ('fear', 'pas'),
 ('fear', 'sakit'),
 ('fear', 'rumah'),
 ('fear', 'temen'),
 ('fear', 'salah'),
 ('fear', 'sampe'),
 ('fear', 'hati'),
 ('fear', 'liat'),
 ('fear', 'w'),
 ('fear', 'gua'),
 ('happy', 'orang'),
 ('happy', 'happy'),
 ('happy', 'moga'),
 ('happy', 'bahagia'),
 ('happy', 'alhamdulillah'),
 ('happy', 'udah'),
 ('happy', 'banget'),
 ('happy', 'anak'),
 ('happy', 'suka'),
 ('happy', 'kasih'),
 ('happy', 'selamat'),
 ('love', 'udah'),
 ('love', 'gua'),
 ('love', 'jatuh'),
 ('love', 'cinta'),
 ('love', 'sayang'),
 ('love', 'suka'),
 ('love', 'love'),
 ('love', 'orang'),
 ('love', 'banget'),
 ('love', 'i'),
 ('love', 'you'),
 ('love', 'hati'),
 ('love', 'kasih'),
 ('love', 'senyum'),
 (

# Regresi Logistik

In [12]:
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), applied element-wise."""
    return 1/(1+np.exp(-z))

def gradient_descent(X,y,theta,alpha,iteration):
    """Fit logistic-regression parameters with batch gradient descent.

    Args:
        X: feature matrix with one row per sample.
        y: targets (0/1) with the same shape as ``X @ theta``.
        theta: initial parameters; not modified in place.
        alpha: learning rate.
        iteration: number of gradient steps; 0 returns ``theta`` unchanged.

    Returns:
        The parameters after ``iteration`` update steps.

    Raises:
        ValueError: if ``X @ theta`` and ``y`` have different shapes, which would
            otherwise broadcast silently into a wrong residual.

    Side effects:
        Prints the cross-entropy loss of the returned parameters, then the
        parameters themselves.
    """
    m = len(y)

    if np.shape(X @ theta) != np.shape(y):
        raise ValueError(
            f"shape mismatch: X @ theta has shape {np.shape(X @ theta)} "
            f"but y has shape {np.shape(y)}"
        )

    for _ in range(iteration):
        h = sigmoid(X @ theta)
        theta = theta - alpha/m * X.T @ (h-y)

    # The loss is evaluated once, for the parameters actually returned. This also
    # keeps J defined when iteration == 0 and avoids recomputing it on every step.
    # Clipping only affects saturated predictions and prevents log(0).
    eps = 1e-15
    h = np.clip(sigmoid(X @ theta), eps, 1 - eps)
    J = (-1/m) * (y.T @ np.log(h) + (1-y).T @ np.log(1-h))
    print(f"Final Loss: {J}")
    print(theta)
    return theta

Feature extraction

In [13]:
def extract_features(tweet):
    """Turn a tweet into a (1, 6) feature row.

    Column 0 is the bias term (1). Columns 1-5 hold, for happy, anger, fear, love
    and sadness respectively, the sum over the tweet's preprocessed tokens of the
    (emotion, token) counts stored in the global ``freqs`` table.
    """
    tokens = preprocess(tweet).split(' ')

    features = np.zeros((1,6)) # Bias + 5 emotions (Happy,Anger,Fear,Love,Sadness)
    features[0,0] = 1

    emotion_columns = enumerate(('happy','anger','fear','love','sadness'), start=1)
    for column, emotion in emotion_columns:
        features[0,column] = sum(freqs.get((emotion,token),0) for token in tokens)

    assert features.shape == (1,6)
    return features

In [14]:
# Test: feature row (bias + five per-emotion frequency sums) for a sample tweet
extract_features("kamu jelek banget deh")

array([[ 1., 66., 34., 75., 89., 47.]])

Training Model

In [15]:
# Feature matrix: one (bias + 5 emotion scores) row per training tweet.
# Iterating over the column positionally avoids the label-based lookup
# train_df['tweet'][i], which only works while the index is a default 0..n-1 range.
# NOTE: train_df['tweet'] was already run through preprocess(), and
# extract_features() calls preprocess() again on it.
X = np.zeros((len(train_df),6))
for i, tweet in enumerate(train_df['tweet']):
    X[i,:] = extract_features(tweet)




In [16]:
# Sanity check: one row per training tweet, six feature columns
X.shape

(2500, 6)

In [53]:
EMOTIONS = ['happy','anger','fear','love','sadness']
LEARNING_RATE = 1e-7   # small step size: the features are raw frequency sums, not scaled
N_ITERATIONS = 5000

# One zero-initialised parameter vector per emotion (one-vs-rest classifiers).
[theta_happy,theta_anger,theta_fear,theta_love,theta_sadness] = [np.zeros((6,1)) for _ in EMOTIONS]
# list_theta keeps the initial (all-zero) vectors; gradient_descent returns new arrays.
list_theta = [theta_happy,theta_anger,theta_fear,theta_love,theta_sadness]

# Binary target column vectors, in the same emotion order.
[Y_happy,Y_anger,Y_fear,Y_love,Y_sadness] = [train_df[emotion].values.reshape(-1,1) for emotion in EMOTIONS]
list_Y = [Y_happy,Y_anger,Y_fear,Y_love,Y_sadness]

# Train each classifier; the trained parameters replace the theta_* variables.
[theta_happy,theta_anger,theta_fear,theta_love,theta_sadness] = [
    gradient_descent(X,Y,theta,LEARNING_RATE,N_ITERATIONS)
    for theta,Y in zip(list_theta,list_Y)
]


Final Loss: [[0.45231885]]
[[-3.03360230e-05]
 [ 1.96562408e-03]
 [-2.30020817e-03]
 [-4.02073332e-03]
 [-3.28024441e-03]
 [-1.90637626e-03]]
Final Loss: [[0.43673554]]
[[-2.96929028e-05]
 [-2.39247924e-03]
 [ 2.42719573e-03]
 [-3.38929593e-03]
 [-4.48728670e-03]
 [-1.74003053e-03]]
Final Loss: [[0.38331063]]
[[-6.28525873e-05]
 [-4.12806088e-03]
 [-3.64019281e-03]
 [ 5.71363199e-03]
 [-5.33884065e-03]
 [-3.56748712e-03]]
Final Loss: [[0.34671125]]
[[-7.06752269e-05]
 [-4.16589876e-03]
 [-4.76806607e-03]
 [-5.86872419e-03]
 [ 5.43801489e-03]
 [-4.04873258e-03]]
Final Loss: [[0.49383138]]
[[-3.82714487e-05]
 [-2.10429727e-03]
 [-1.82670907e-03]
 [-3.39857020e-03]
 [-2.77815290e-03]
 [ 1.34629703e-03]]


In [54]:
# Inspect the trained parameters of the 'fear' classifier
print(theta_fear)

[[-6.28525873e-05]
 [-4.12806088e-03]
 [-3.64019281e-03]
 [ 5.71363199e-03]
 [-5.33884065e-03]
 [-3.56748712e-03]]


In [55]:
def predict_emotion(tweet):
    """Predict the emotion label of a tweet.

    The tweet is scored by each of the five one-vs-rest logistic classifiers
    (global theta_* vectors) and the label with the highest probability wins.

    Args:
        tweet: raw tweet text.

    Returns:
        One of 'happy', 'anger', 'fear', 'love', 'sadness'.
    """
    labels = ['happy','anger','fear','love','sadness']
    thetas = [theta_happy,theta_anger,theta_fear,theta_love,theta_sadness]

    x = extract_features(tweet)
    probs = [sigmoid(x @ theta) for theta in thetas]
    # No softmax needed: it is a monotonic transform of the probabilities, so it
    # cannot change which classifier has the highest score.
    return labels[int(np.argmax(probs))]

Testing on the Validation Dataset

In [56]:
# Load the validation split; note it is stored under the name test_df.
test_df = pd.read_csv('./emot_emotion-twitter/valid_preprocess.csv')

In [57]:
#accuracy
def calculate_accuracy(df):
    """Return the fraction of rows whose predicted label equals the true label.

    Args:
        df: DataFrame with a 'tweet' column (raw text) and a 'label' column.

    Returns:
        Accuracy as a float in [0, 1].

    Raises:
        ZeroDivisionError: if ``df`` has no rows.

    Side effects:
        Adds (or overwrites) a 'predicted_label' column on ``df``.
    """
    df['predicted_label'] = df['tweet'].apply(predict_emotion)
    # Vectorised comparison instead of counting matches row by row with iterrows().
    correct = int((df['label'] == df['predicted_label']).sum())
    return correct/len(df)

In [58]:
# Overall accuracy; this call also writes the 'predicted_label' column into test_df,
# which the crosstab below reads.
calculate_accuracy(test_df)

0.5136363636363637

In [59]:
# Row-normalised confusion table: each row (true label) sums to 1 across predicted labels.
pd.crosstab(test_df['label'],test_df['predicted_label'],normalize='index')

predicted_label,anger,fear,happy,love,sadness
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
anger,0.463636,0.309091,0.027273,0.054545,0.145455
fear,0.076923,0.692308,0.015385,0.046154,0.169231
happy,0.04902,0.186275,0.382353,0.147059,0.235294
love,0.0,0.0625,0.03125,0.84375,0.0625
sadness,0.121212,0.232323,0.050505,0.222222,0.373737
