In [1]:
from tensorflow.keras import layers, Model
from tqdm import tqdm

import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd 
import numpy as np 
import copy
import os

In [2]:
data = pd.read_csv("./data/train.csv",encoding='latin-1')
data = data.iloc[:20000]
data.head()

Unnamed: 0,tweets,labels
0,sending solidarity whoever doctor manage incre...,Stressed
1,need see hair amp beard gat book appointment b...,Anxious
2,next time meet someone new dont ask ask love,Normal
3,surprise someone love give la senza gift box r...,Lonely
4,raise hand junhoes ocean lotion life rent free...,Normal


In [3]:
print(data['labels'].unique())

['Stressed' 'Anxious' 'Normal' 'Lonely']


In [4]:
data_ = copy.deepcopy(data)

stressed_data = data_[data_.labels=='Stressed']
anxious_data  = data_[data_.labels=='Anxious']
normal_data   = data_[data_.labels=='Normal']
lonely_data   = data_[data_.labels=='Lonely']
sub_data = pd.concat([stressed_data, anxious_data, normal_data, lonely_data],axis=0)

# Data explore

In [5]:
data_target=data.groupby('labels')

In [6]:
data['labels'].value_counts()

Anxious     5502
Normal      5355
Lonely      4604
Stressed    4539
Name: labels, dtype: int64

## Words

Words distribution.

In [7]:
from nltk.tokenize import word_tokenize

import warnings
import string
import nltk
import re

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bahk_insung/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Data preprocess

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras import layers

import tensorflow as tf

In [9]:
count_vectorizer = CountVectorizer(stop_words='english') 
cv = count_vectorizer.fit_transform(data['tweets'])
cv.shape

(20000, 18296)

In [10]:
print(data['labels'].unique())

['Stressed' 'Anxious' 'Normal' 'Lonely']


In [11]:
y_label_onehot = list()

for value in data['labels']:
    if value == "Stressed":
        y_label_onehot.append([1, 0, 0, 0])

    elif value == "Anxious":
        y_label_onehot.append([0, 1, 0, 0])

    elif value == "Normal":
        y_label_onehot.append([0, 0, 1, 0])

    elif value == "Lonely":
        y_label_onehot.append([0, 0, 0, 1])

    else:
        break

X = cv.toarray()
y_label_onehot = np.array(y_label_onehot)
print(y_label_onehot.shape)

(20000, 4)


In [12]:
X = X.reshape(-1, 1, 18296)

X_train, X_test, y_train, y_test = train_test_split(X, y_label_onehot, test_size=.2, random_state=42)
print(f'''
X_train shape : {X_train.shape}
y_train shape : {y_train.shape}

X_test shape : {X_test.shape}
y_test shape : {y_test.shape}
''')


X_train shape : (16000, 1, 18296)
y_train shape : (16000, 4)

X_test shape : (4000, 1, 18296)
y_test shape : (4000, 4)



In [14]:
emo_input_layer = layers.Input(shape=(1, 18296), name="emotion_model_input")

gru_x = layers.GRU(9000)(emo_input_layer)
gru_x = layers.Reshape(1, 9000)
gru_x = layers.Conv1D(4096, 3, padding='same', activation='relu')(gru_x)
gru_x = layers.Conv1D(4096, 3, padding='same', activation='relu')(gru_x)
gru_x = layers.BatchNormalization()(gru_x)
gru_x = layers.MaxPool1D(pool_size=(2), strides=2, padding='same')(gru_x)

x1 = layers.Conv1D(1024, 3, padding='same', activation='relu')(emo_input_layer)
x1 = layers.Conv1D(1024, 3, padding='same', activation='relu')(x1)
x1 = layers.BatchNormalization()(x1)
x1 = layers.MaxPool1D(pool_size=(2), strides=2, padding='same')(x1)

x1 = layers.Conv1D(512, 3, padding='same', activation='relu', kernel_regularizer=tf.keras.regularizers.L2(0.01))(x1)
x1 = layers.Conv1D(512, 3, padding='same', activation='relu')(x1)
x1 = layers.BatchNormalization()(x1)
x1 = layers.MaxPool1D(pool_size=(2), strides=2, padding='same')(x1)
x1 = layers.Dropout(0.5)(x1)
# temp_x1 = layers.Flatten()(x1)
# temp_emo_y = layers.Dense(4, activation='softmax', name="Before_GRU_emo")(temp_x1)

x1 = layers.Conv1D(512, 3, padding='same', activation='relu')(x1)
x1 = layers.Conv1D(512, 3, padding='same', activation='relu')(x1)
x1 = layers.BatchNormalization()(x1)
x1 = layers.MaxPool1D(pool_size=(2), strides=2, padding='same')(x1)

x1 = layers.Concatenate()([gru_x, x1])

x1 = layers.Conv1D(256, 3, padding='same', activation='relu',  kernel_regularizer=tf.keras.regularizers.L2(0.01))(x1)
x1 = layers.Conv1D(256, 3, padding='same', activation='relu')(x1)
x1 = layers.BatchNormalization()(x1)
x1 = layers.MaxPool1D(pool_size=(2), strides=2, padding='same')(x1)
x1 = layers.Reshape((1, 256))(x1)
x1 = layers.GRU(256)(x1)
x1 = layers.Dropout(0.7)(x1)

x1 = layers.Flatten()(x1)
x1 = layers.Dense(50)(x1)
x1 = layers.Dense(30)(x1)
x1 = layers.Dense(15)(x1)
emo_y = layers.Dense(4, activation='softmax', name="final")(x1)
model = Model(inputs=emo_input_layer, outputs=[emo_y])

TypeError: Reshape.__init__() takes 2 positional arguments but 3 were given

In [None]:
model.summary()

In [None]:
optim = tf.optimizers.Adam(
    lr = 0.003
)

model.compile(
    loss='categorical_crossentropy',
    optimizer=optim,
    metrics=['accuracy']
)

In [None]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

outDir = './cheakpoint/' 
model_names = outDir + 'weights-{val_final_accuracy:.4f}.h5'
def get_callbacks(patience = 50):
    model_checkpoint = ModelCheckpoint(model_names, monitor='val_final_accuracy', verbose=1, save_best_only=True, period = 1)
    callbacks = [model_checkpoint]

    return callbacks

In [None]:
callbacks = get_callbacks()

history = model.fit(
    X_train, y_train,
    shuffle=True,
    batch_size=20,
    epochs=100,
    validation_data=(X_test, y_test),
    callbacks=[callbacks]
)