In [None]:
from tensorflow.keras import layers
from tqdm import tqdm

import GetOldTweets3 as got

import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd 
import numpy as np 
import copy
import os

In [None]:
# Load the emotion training set (decoded as latin-1) and preview its first rows.
data = pd.read_csv("./data/data_train.csv",encoding='latin-1')
data.head()

In [None]:
# List the distinct values found in the 'Emotion' column.
print(data['Emotion'].unique())

In [None]:
# Work on an independent deep copy so the loaded frame is left untouched.
data_ = data.copy(deep=True)

# One frame per emotion label.
netral_data = data_.loc[data_['Emotion'] == 'neutral']
sadness_data = data_.loc[data_['Emotion'] == 'sadness']
fear_data = data_.loc[data_['Emotion'] == 'fear']
anger_data = data_.loc[data_['Emotion'] == 'anger']
joy_data = data_.loc[data_['Emotion'] == 'joy']

# Stack the per-emotion frames row-wise, in the order listed.
sub_data = pd.concat(
    [netral_data, sadness_data, fear_data, anger_data, joy_data],
    axis=0,
)

# Data exploration

In [None]:
# Group the rows by emotion label (the GroupBy object is not used again in the visible cells).
data_target=data.groupby('Emotion')

In [None]:
# Number of rows per emotion label.
data['Emotion'].value_counts()

## Date

At what time do people like to tweet? Is there a clear link between the time of tweeting and the emotion of the content?

In [None]:
# Preview the first rows of the emotion frame.
data.head()

## Words

Word distribution.

In [None]:
from nltk.tokenize import word_tokenize

import warnings
import string
import nltk
import re

# Besides the stop-word list, word_tokenize (imported in this cell) needs the
# Punkt tokenizer models. Newer NLTK releases look the models up under
# 'punkt_tab'; nltk.download reports an unknown id without raising.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Data preprocessing

In [None]:
# Split the emotion frame by label and report the size of each subset.
netral_data = data_[data_.Emotion=='neutral']
print(f"neutral data shape : {netral_data.shape}")

sadness_data = data_[data_.Emotion=='sadness']
print(f"sadness data shape : {sadness_data.shape}")

fear_data = data_[data_.Emotion=='fear']
print(f"fear data shape : {fear_data.shape}")

anger_data = data_[data_.Emotion=='anger']
print(f"anger data shape : {anger_data.shape}")

joy_data = data_[data_.Emotion=='joy']
print(f"joy data shape : {joy_data.shape}")

# Stack the five subsets row-wise and report the shape of the frame that was
# just assembled (this used to print the shape of the unrelated `data` frame).
emo_data = pd.concat([netral_data, sadness_data, fear_data, anger_data, joy_data], axis=0)
print(emo_data.shape)

emo_data.head()

---

In [None]:
# The column names are assigned by hand in the next cell, i.e. the CSV is
# expected to ship without a header row (Sentiment140 layout — confirm against
# the file). header=None keeps the first tweet as data instead of letting
# pandas consume it as column names.
data = pd.read_csv(
    "./data/training.1600000.processed.noemoticon.csv",
    encoding='latin-1',
    header=None,
)

In [None]:
# Column names for the sentiment CSV, applied positionally to the loaded frame.
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "TweetText"]
data.columns = DATASET_COLUMNS

In [None]:
# Independent deep copy of the sentiment frame.
data_ = data.copy(deep=True)

# Keep at most the first 80000 rows of each target value (4, 2 and 0).
positif_data = data_.loc[data_['target'] == 4].head(80000)
netural_data = data_.loc[data_['target'] == 2].head(80000)
negative_data = data_.loc[data_['target'] == 0].head(80000)

# Stack the three subsets row-wise, in the order listed.
sub_data = pd.concat(
    [positif_data, netural_data, negative_data],
    axis=0,
)

# Data exploration

In [None]:
# Group the rows by target value (the GroupBy object is not used again in the visible cells).
data_target=data.groupby('target')

In [None]:
# Number of rows per target value.
data['target'].value_counts()

## Date

At what time do people like to tweet? Is there a clear link between the time of tweeting and the emotion of the content?

In [None]:
# Preview the first rows of the sentiment frame.
data.head()

In [None]:
# Keep only the label and timestamp columns for the time-of-day analysis.
# Selecting the columns directly avoids rebinding `data_` (a DataFrame in the
# earlier cells) to a dict, which broke re-running those cells.
df = data[['target', 'date']].copy()
df.head()

In [None]:
# lets ensure the 'date' column is in date format
df['date'] = pd.to_datetime(df['date'])

In [None]:
# Hour of day (0-23) of every tweet. The vectorised .dt accessor replaces a
# Python loop that looked up each row by label, which was slow on the full
# frame and only worked with a default 0..n-1 index.
df['hour'] = df['date'].dt.hour
df.head()

In [None]:
# Count tweets per (target, hour) in one vectorised pass instead of a
# row-by-row Python loop. The result keeps the original layout: a dict keyed
# by the target as a string ('0', '2', '4'), each holding 24 hourly counts.
# Reindexing guarantees all three labels and all 24 hours are present (0 where
# nothing was observed); any other target value is ignored.
hour_counts = pd.crosstab(df['target'], df['hour']).reindex(
    index=[0, 2, 4], columns=range(24), fill_value=0
)
hour_data = {str(label): hour_counts.loc[label].tolist() for label in (0, 2, 4)}

In [None]:
# Transpose the per-label lists into one [count_0, count_2, count_4] row per hour.
hour_data = [
    [count_0, count_2, count_4]
    for count_0, count_2, count_4 in zip(hour_data['0'], hour_data['2'], hour_data['4'])
]

## Words

Word distribution.

In [None]:
# Independent copy of the balanced subset without the metadata columns.
metadata_columns = ['ids', 'date', 'flag', 'user']
newdata = sub_data.copy(deep=True).drop(columns=metadata_columns)

In [None]:
from nltk.tokenize import word_tokenize

import warnings
import string
import nltk
import re

# Besides the stop-word list, word_tokenize (imported in this cell) needs the
# Punkt tokenizer models. Newer NLTK releases look the models up under
# 'punkt_tab'; nltk.download reports an unknown id without raising.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Data preprocessing

In [None]:
# Rows labelled 4 and rows labelled 0, with their sizes.
positif_data = data[data.target==4]
print(positif_data.shape)

negative_data = data[data.target==0]
print(negative_data.shape)

# Stack both classes row-wise and report the shape of the combined frame
# (this used to print the shape of the full `data` frame instead).
deep_data = pd.concat([positif_data,negative_data],axis = 0)
print(deep_data.shape)

deep_data.head()

In [None]:
# Build the cleaned text column:
#   1. drop the literal "@" of mentions,
#   2. strip URLs (everything from "http" up to the next whitespace),
#   3. replace every non-letter character with a space.
# regex=True is spelled out because pandas 2.x treats the pattern as a literal
# string by default, which would leave URLs and punctuation in place.
data['Clean_TweetText'] = (
    data['TweetText']
    .str.replace("@", "", regex=False)
    .str.replace(r"http\S+", "", regex=True)
    .str.replace("[^a-zA-Z]", " ", regex=True)
)

stopwords=nltk.corpus.stopwords.words('english')
# Hash-based lookup: the membership test below runs once per word of every
# tweet, and scanning the list each time is needlessly slow.
_stopword_set = frozenset(stopwords)

def remove_stopwords(text):
    """Return `text` with every English stop word removed.

    The text is split on whitespace, words found in the NLTK English stop-word
    list are dropped (case-sensitive comparison), and the remaining words are
    joined back with single spaces.
    """
    clean_text=' '.join([word for word in text.split() if word not in _stopword_set])
    return clean_text

In [None]:
# Lower-case every cleaned tweet, then strip its stop words.
data['Clean_TweetText'] = [
    remove_stopwords(tweet.lower()) for tweet in data['Clean_TweetText']
]
data.head()

In [None]:
# Text Tokenization and Normalization
data['Clean_TweetText'] = data['Clean_TweetText'].apply(lambda x: word_tokenize(x))
data.head()

In [None]:
# Now let’s stitch these tokens back together
data['Clean_TweetText'] = data['Clean_TweetText'].apply(lambda x: ' '.join([w for w in x]))
# Removing small words
data['Clean_TweetText'] = data['Clean_TweetText'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
data.head()

# Models

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from imblearn.under_sampling import RandomUnderSampler

from tensorflow.keras import layers
from tensorflow.keras import Model
from keras.utils import plot_model

import tensorflow as tf

In [None]:
# Bag-of-words counts for the emotion texts (English stop words removed).
count_vectorizer = CountVectorizer(stop_words='english')
emo_texts = emo_data['Text']
emo_cv = count_vectorizer.fit_transform(emo_texts)
print(emo_cv.shape)

In [None]:
# Target of 1000 samples for each of the five emotion classes.
undersampler_ratio = dict.fromkeys(
    ["neutral", "joy", "sadness", "anger", "fear"], 1000
)

# Randomly under-sample the count matrix and its labels with a fixed seed.
rus = RandomUnderSampler(sampling_strategy=undersampler_ratio, random_state=42)
emo_X, emo_y = rus.fit_resample(emo_cv, emo_data['Emotion'])

for resampled in (emo_X, emo_y):
    print(resampled.shape)

In [None]:
print(emo_data['Emotion'].unique())

# Position of each emotion in the one-hot vector.
emo_class_index = {"neutral": 0, "sadness": 1, "fear": 2, "anger": 3, "joy": 4}

# Fail loudly on an unknown label. Silently stopping at the first one would
# leave fewer label rows than feature rows in emo_X.
unexpected_emotions = set(emo_y) - set(emo_class_index)
if unexpected_emotions:
    raise ValueError(f"Unexpected emotion label(s): {sorted(unexpected_emotions)}")

# Row i of the identity matrix is the one-hot vector of class i.
emo_class_ids = np.asarray([emo_class_index[value] for value in emo_y], dtype=int)
onehot_label = np.eye(len(emo_class_index), dtype=int)[emo_class_ids]

In [None]:
# Densify the resampled count matrix before splitting.
emo_X = emo_X.toarray()

# 67/33 train/test split with a fixed seed.
emo_split = train_test_split(emo_X, onehot_label, test_size=.33, random_state=42)
emo_X_train, emo_X_test, emo_y_train, emo_y_test = emo_split

print("\n".join([
    "",
    f"X_train shape : {emo_X_train.shape}",
    f"y_train shape : {emo_y_train.shape}",
    "",
    f"X_test shape : {emo_X_test.shape}",
    f"y_test shape : {emo_y_test.shape}",
    "",
]))

# Depression: one-hot encoding of y

In [None]:
# Bag-of-words counts for the cleaned sentiment tweets (English stop words removed).
count_vectorizer = CountVectorizer(stop_words='english')
clean_tweets = data['Clean_TweetText']
deep_cv = count_vectorizer.fit_transform(clean_tweets)
deep_cv.shape

In [None]:
# Target of 2500 samples for each of the two target values (0 and 4).
undersampler_ratio = dict.fromkeys([0, 4], 2500)

# Randomly under-sample the count matrix and its labels with a fixed seed.
rus = RandomUnderSampler(sampling_strategy=undersampler_ratio, random_state=42)
deep_X, deep_y = rus.fit_resample(deep_cv, data['target'])

for resampled in (deep_X, deep_y):
    print(resampled.shape)

In [None]:
# Position of each target value in the one-hot vector: 0 -> [1, 0], 4 -> [0, 1].
deep_class_index = {0: 0, 4: 1}

# Fail loudly on an unknown target. Silently stopping at the first one would
# leave fewer label rows than feature rows in deep_X.
unexpected_targets = set(deep_y) - set(deep_class_index)
if unexpected_targets:
    raise ValueError(f"Unexpected target value(s): {sorted(unexpected_targets)}")

# Row i of the identity matrix is the one-hot vector of class i.
deep_class_ids = np.asarray([deep_class_index[value] for value in deep_y], dtype=int)
deep_y_onehot = np.eye(len(deep_class_index), dtype=int)[deep_class_ids]
print(deep_y_onehot.shape)

In [None]:
# Densify the resampled count matrix before splitting.
deep_X = deep_X.toarray()

# 67/33 train/test split with a fixed seed.
deep_split = train_test_split(deep_X, deep_y_onehot, test_size=.33, random_state=42)
deep_X_train, deep_X_test, deep_y_train, deep_y_test = deep_split

print("\n".join([
    "",
    f"X_train shape : {deep_X_train.shape}",
    f"y_train shape : {deep_y_train.shape}",
    "",
    f"X_test shape : {deep_X_test.shape}",
    f"y_test shape : {deep_y_test.shape}",
    "",
]))

In [None]:
# Width of one bag-of-words vector. It is read from the training matrix rather
# than hard-coded (it used to be the literal 9887), so the model keeps matching
# the data when the CountVectorizer vocabulary changes. shape[-1] is the
# feature axis both before and after the (-1, 1, features) reshape.
emo_feature_dim = emo_X_train.shape[-1]
emo_input_layer = layers.Input(shape=(1, emo_feature_dim), name="emotion_model_input")

# Five convolutional stages: Conv1D -> Conv1D -> BatchNorm -> MaxPool, with a
# Dropout after the third stage. Each entry is
# (filters of first conv, filters of second conv, add dropout?).
emo_conv_stages = (
    (4096, 2048, False),
    (1024, 1024, False),
    (512, 512, True),
    (512, 512, False),
    (256, 256, False),
)

x1 = emo_input_layer
for first_filters, second_filters, add_dropout in emo_conv_stages:
    x1 = layers.Conv1D(first_filters, 3, padding='same', activation='relu')(x1)
    x1 = layers.Conv1D(second_filters, 3, padding='same', activation='relu')(x1)
    x1 = layers.BatchNormalization()(x1)
    x1 = layers.MaxPool1D(pool_size=(2), strides=2, padding='same')(x1)
    if add_dropout:
        x1 = layers.Dropout(0.5)(x1)

# Auxiliary 5-way softmax head taken straight from the convolutional features.
temp_x1 = layers.Flatten()(x1)
temp_emo_y = layers.Dense(5, activation='softmax', name="Before_GRU_emo")(temp_x1)

# Recurrent branch. The Reshape to (1, 256) relies on the sequence axis still
# having length 1 after the pooling layers.
x1 = layers.Reshape((1, 256))(x1)
x1 = layers.GRU(256)(x1)
x1 = layers.Dropout(0.5)(x1)

# NOTE(review): these three Dense layers have no activation, so together they
# are a single linear map — confirm this is intended.
x1 = layers.Flatten()(x1)
x1 = layers.Dense(50)(x1)
x1 = layers.Dense(30)(x1)
x1 = layers.Dense(15)(x1)

# Main 5-way softmax head; the model exposes both heads as outputs.
emo_y = layers.Dense(5, activation='softmax', name="final")(x1)
emo_model = Model(inputs=emo_input_layer, outputs=[emo_y, temp_emo_y])

In [None]:
# Print the layer-by-layer summary of the emotion model.
emo_model.summary()

In [None]:
# deep_input_layer = layers.Input(shape=(1, 546404), name="deep_model_input")
# x = layers.Conv1D(2048, 3, padding='same', activation='relu')(deep_input_layer)

# x = layers.Conv1D(1024, 3, padding='same', activation='relu')(x)
# x = layers.Conv1D(1024, 3, padding='same', activation='relu')(x)
# x = layers.BatchNormalization()(x)
# x = layers.MaxPool1D(pool_size=(2), strides=2, padding='same')(x)

# x = layers.Conv1D(1024, 3, padding='same', activation='relu')(x)
# x = layers.Conv1D(1024, 3, padding='same', activation='relu')(x)
# x = layers.BatchNormalization()(x)
# x = layers.MaxPool1D(pool_size=(2), strides=2, padding='same')(x)

# x = layers.Conv1D(512, 3, padding='same', activation='relu')(x)
# x = layers.Conv1D(512, 3, padding='same', activation='relu')(x)
# x = layers.BatchNormalization()(x)
# x = layers.MaxPool1D(pool_size=(2), strides=2, padding='same')(x)

# x = layers.Conv1D(512, 3, padding='same', activation='relu')(x)
# x = layers.Conv1D(512, 3, padding='same', activation='relu')(x)
# x = layers.BatchNormalization()(x)
# x = layers.MaxPool1D(pool_size=(2), strides=2, padding='same')(x)

# x = layers.Conv1D(256, 3, padding='same', activation='relu')(x)
# x = layers.Conv1D(256, 3, padding='same', activation='relu')(x)
# x = layers.BatchNormalization()(x)
# x = layers.MaxPool1D(pool_size=(2), strides=2, padding='same')(x)

# x = layers.Conv1D(256, 3, padding='same', activation='relu')(x)
# x = layers.Conv1D(256, 3, padding='same', activation='relu')(x)
# x = layers.BatchNormalization()(x)
# x = layers.MaxPool1D(pool_size=(2), strides=2, padding='same')(x)
# temp_deep_y = layers.Dense(2, activation='softmax', name="Before_GRU_deep")(x)

# x = layers.Reshape((1, 256))(x)
# x = layers.GRU(256)(x)

# x = layers.Reshape((4, 64))(x)
# x = layers.GRU(64)(x)

# x = layers.Dense(50)(x)
# x = layers.Dense(30)(x)
# x = layers.Dense(15)(x)
# deep_y = layers.Dense(2, activation='softmax')(x)
# deep_model = Model(inputs=deep_input_layer, outputs=[deep_y, temp_deep_y])

In [None]:
# deep_model.summary()

In [None]:
# combined_model = Model(inputs=[deep_input_layer, emo_input_layer], outputs=[temp_deep_y, deep_y, temp_emo_y, emo_y])
# combined_model.summary()

In [None]:
# Adam optimiser. `learning_rate` is the supported argument name; the old
# `lr` alias is deprecated and rejected by recent Keras releases.
optim = tf.keras.optimizers.Adam(
    learning_rate=0.003
)

# The same loss and metric are applied to each of the model's outputs.
emo_model.compile(
    loss='categorical_crossentropy',
    optimizer=optim,
    metrics=['accuracy']
)

# Write the architecture diagram (with tensor shapes) to a PNG file.
plot_model(emo_model, show_shapes=True, to_file='model_visualization.png')

In [None]:
# Import the callbacks from the same Keras implementation the model is built
# with (tensorflow.keras) instead of the standalone `keras` package.
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

outDir = './cheakpoint/'
# ModelCheckpoint fills the placeholder with the monitored validation accuracy.
model_names = outDir + 'weights-{val_final_accuracy:.4f}.h5'
def get_callbacks(patience = 50):
    """Return the list of callbacks used while training the emotion model.

    The list holds a single ModelCheckpoint that watches the validation
    accuracy of the "final" output and saves the model each time it improves.

    Args:
        patience: currently unused; kept so existing callers keep working
            (EarlyStopping is imported but not wired in).

    Returns:
        A list containing the ModelCheckpoint callback.
    """
    # Create the output directory up front so saving cannot fail on a fresh checkout.
    os.makedirs(outDir, exist_ok=True)

    # The deprecated `period=1` argument is dropped; checking once per epoch
    # is the default behaviour.
    model_checkpoint = ModelCheckpoint(model_names, monitor='val_final_accuracy', verbose=1, save_best_only=True)
    callbacks = [model_checkpoint]

    return callbacks

In [None]:
# Add a length-1 sequence axis: (samples, features) -> (samples, 1, features).
# The feature width is read from the arrays instead of the hard-coded 9887;
# shape[-1] also makes the cell safe to re-run on already reshaped arrays.
emo_X_train = emo_X_train.reshape(-1, 1, emo_X_train.shape[-1])
emo_X_test = emo_X_test.reshape(-1, 1, emo_X_test.shape[-1])

In [None]:
# A cell only displays its last expression, so show both shapes as one tuple
# instead of silently discarding the first.
emo_X_test.shape, emo_y_test.shape

In [None]:
callbacks = get_callbacks()

# Train for 100 epochs, validating on the held-out split after every epoch.
# get_callbacks() already returns a list, so it is passed as-is rather than
# wrapped in a second list.
history = emo_model.fit(
    emo_X_train, emo_y_train,
    shuffle=True,
    batch_size=128,
    epochs=100,
    validation_data=(emo_X_test, emo_y_test),
    callbacks=callbacks
)

In [None]:
def _resolve_metric_key(history, metric):
    """Return the key under which `metric` was logged for the training data.

    A model with several named outputs logs per-output names such as
    "final_accuracy" instead of plain "accuracy". The plain name is preferred;
    otherwise the first non-validation key ending in "_<metric>" is used.
    """
    if metric in history:
        return metric
    suffix = '_' + metric
    for key in history:
        if key.endswith(suffix) and not key.startswith('val_'):
            return key
    raise KeyError(f"No '{metric}' entry found in history (keys: {sorted(history)})")


def plot_model__hist(hist):
    """Show the training/validation loss and accuracy curves of a fit run.

    Args:
        hist: object returned by `Model.fit`; its `history` dict must contain
            'loss', 'val_loss' and an accuracy metric with its 'val_' twin.

    The figures are only displayed, not written to disk.
    """
    history = hist.history
    acc_key = _resolve_metric_key(history, 'accuracy')

    # Loss per epoch.
    plt.figure(figsize=(6,6))
    plt.style.use("ggplot")
    plt.plot(history['loss'], color='b', label="Training loss")
    plt.plot(history['val_loss'], color='r', label="Validation loss")
    plt.legend()
    plt.show()

    # Accuracy per epoch.
    plt.figure(figsize=(6,6))
    plt.style.use("ggplot")
    plt.plot(history[acc_key], color='b', label="Training accuracy")
    plt.plot(history['val_' + acc_key], color='r',label="Validation accuracy")
    plt.legend(loc = "lower right")
    plt.show()

In [None]:
plot_model__hist(history)

# emo_model has two outputs, so evaluate() reports a total loss plus a loss and
# an accuracy per output — more values than `loss, acc = ...` can unpack.
# Ask for a dict and pick the entries by name; the accuracy of the "final"
# output is used when no plain 'accuracy' entry exists.
eval_results = emo_model.evaluate(emo_X_test, emo_y_test, verbose=2, return_dict=True)
loss = eval_results['loss']
acc = eval_results['accuracy'] if 'accuracy' in eval_results else eval_results['final_accuracy']
print("multi_model의 정확도: {:5.2f}%".format(100*acc))
print("multi_model의 Loss: {}".format(loss))

# Model Test (Confusion Matrix)

In [None]:
from tensorflow import keras

# Look for checkpoints in `outDir`, the directory the ModelCheckpoint callback
# writes to; a separately spelled path pointed at a different folder.
checkpoint_files = sorted(
    file_name for file_name in os.listdir(outDir) if file_name.endswith('.h5')
)
if not checkpoint_files:
    raise FileNotFoundError(f"No .h5 checkpoint found in {outDir}")

# File names embed the validation accuracy with four decimals, so the
# alphabetically last file is the one with the highest accuracy.
model_path = os.path.join(outDir, checkpoint_files[-1])

recon_model = keras.models.load_model(model_path)
print(model_path)