In [29]:
import re
import nltk
import numpy as np
import pandas as pd
import requests
import tensorflow as tf
from nltk.corpus import stopwords
nltk.download('stopwords')
from itertools import groupby
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences, to_categorical
from tensorflow.keras import Sequential
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.layers import Embedding, Dense, Flatten, LSTM, GRU, GlobalAveragePooling1D, Bidirectional, Dropout, BatchNormalization, Input, Conv2D, MaxPool2D, Reshape
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
# Load dataset
dataset = pd.read_csv('./twitter_MBTI.csv', index_col=0)

In [31]:
# 전처리 버전 3 함수화
# Define preprocessing functions
def preprocess_text(texts):
    stop_words = set(stopwords.words('english'))
    texts = texts.apply(lambda x: re.sub(r'@\w+\s?', '', x)) # remove usernames
    texts = texts.apply(lambda x: re.sub(r'https?:\/\/\S+', '', x)) # remove links
    texts = texts.apply(lambda x: x.lower()) # convert to lowercase
    texts = texts.apply(lambda x: re.findall(r'\b\w+\b', x)) # split into individual words
#     texts = texts.apply(lambda x: [word for word in x if word not in stop_words]) # remove stop words
    return texts

In [32]:
# Preprocess text
texts = preprocess_text(dataset['text'])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
max_len = max(len(s) for s in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_len)

In [33]:
max_len

4491

In [34]:
# Save tokenizer
import pickle

# saving
with open('./models/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# loading
with open('./models/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [35]:
# Preprocess labels
label_dict = {'istj': 0, 'isfj': 1, 'infj': 2, 'intj': 3, 'istp': 4, 'isfp': 5, 'infp': 6, 'intp': 7,
              'estp': 8, 'esfp': 9, 'enfp': 10, 'entp': 11, 'estj': 12, 'esfj': 13, 'enfj': 14, 'entj': 15}
labels = [label_dict[label] for label in dataset['label']]
labels_cat = to_categorical(labels, num_classes=len(label_dict))

In [36]:
# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels_cat, test_size=0.1, random_state=41)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(7029, 4491) (7029, 16)
(782, 4491) (782, 16)


In [27]:
# Test: CNN-Transformer
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer
embed_dim = 32  # Embedding size for each token
class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([tf.keras.layers.Dense(ff_dim, activation="relu"), tf.keras.layers.Dense(embed_dim),])
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)
model_input = Input(shape=(X_train.shape[1],))
h = Embedding(len(word_index)+1, 64, input_length=max_len)(model_input)
h = Reshape((X_train.shape[1], 64, 1))(h)
h = Conv2D(filters=32, kernel_size=(2,2), padding='same', activation='relu')(h)
h = MaxPool2D((2,2))(h)
h = Conv2D(filters=32, kernel_size=(2,2), padding='same', activation='relu')(h)
h = MaxPool2D((2,2))(h)
h = Conv2D(filters=32, kernel_size=(2,2), padding='same', activation='relu')(h)
h = MaxPool2D((2,1))(h)
h = Conv2D(filters=32, kernel_size=(2,2), padding='same', activation='relu')(h)
h = MaxPool2D((2,1))(h)
h = Conv2D(filters=32, kernel_size=(2,2), padding='same', activation='relu')(h)
h = MaxPool2D((2,1))(h)
h = Conv2D(filters=32, kernel_size=(2,2), padding='same', activation='relu')(h)
h = MaxPool2D((2,1))(h)
h = Conv2D(filters=32, kernel_size=(2,2), padding='same', activation='relu')(h)
h = MaxPool2D((2,1))(h)
h = Conv2D(filters=32, kernel_size=(2,2), padding='same', activation='relu')(h)
h = MaxPool2D((2,1))(h)
h = Conv2D(filters=32, kernel_size=(2,2), padding='same', activation='relu')(h)
h = MaxPool2D((2,1))(h)
h = Conv2D(filters=32, kernel_size=(2,2), padding='same', activation='relu')(h)
h = MaxPool2D((2,1))(h)
h = TransformerBlock(embed_dim, num_heads, ff_dim)(h)
h = Flatten()(h)
# h = Dense(64, activation='relu')(h)
model_output = Dense(16, activation='softmax')(h)
model = Model(model_input, model_output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
es = EarlyStopping(monitor='val_loss', mode='min', patience=8, restore_best_weights=True)
model.fit(X_train, y_train, epochs=200, batch_size=16, validation_data=(X_test, y_test), callbacks=[es])

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 4491)]            0         
                                                                 
 embedding (Embedding)       (None, 4491, 64)          15050944  
                                                                 
 reshape (Reshape)           (None, 4491, 64, 1)       0         
                                                                 
 conv2d (Conv2D)             (None, 4491, 64, 32)      160       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 2245, 32, 32)     0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 2245, 32, 32)      4128      
                                                             

KeyboardInterrupt: 

In [28]:
model.save('./models/model_convolution_transformer.h5')

NotImplementedError: 
Layer TransformerBlock has arguments ['embed_dim', 'num_heads', 'ff_dim', 'rate']
in `__init__` and therefore must override `get_config()`.

Example:

class CustomLayer(keras.layers.Layer):
    def __init__(self, arg1, arg2):
        super().__init__()
        self.arg1 = arg1
        self.arg2 = arg2

    def get_config(self):
        config = super().get_config()
        config.update({
            "arg1": self.arg1,
            "arg2": self.arg2,
        })
        return config

In [37]:
# Test: CNN
model_input = Input(shape=(X_train.shape[1],))
h = Embedding(len(word_index)+1, 64, input_length=max_len)(model_input)
h = Reshape((X_train.shape[1], 64, 1))(h)
h = Conv2D(filters=32, kernel_size=(2,2), padding='same', activation='relu')(h)
h = MaxPool2D((2,2))(h)
h = Conv2D(filters=32, kernel_size=(2,2), padding='same', activation='relu')(h)
h = MaxPool2D((2,2))(h)
h = Conv2D(filters=32, kernel_size=(2,2), padding='same', activation='relu')(h)
h = MaxPool2D((2,1))(h)
h = Conv2D(filters=32, kernel_size=(2,2), padding='same', activation='relu')(h)
h = MaxPool2D((2,1))(h)
h = Conv2D(filters=32, kernel_size=(2,2), padding='same', activation='relu')(h)
h = MaxPool2D((2,1))(h)
h = Conv2D(filters=32, kernel_size=(2,2), padding='same', activation='relu')(h)
h = MaxPool2D((2,1))(h)
h = Conv2D(filters=32, kernel_size=(2,2), padding='same', activation='relu')(h)
h = MaxPool2D((2,1))(h)
h = Conv2D(filters=32, kernel_size=(2,2), padding='same', activation='relu')(h)
h = MaxPool2D((2,1))(h)
h = Conv2D(filters=32, kernel_size=(2,2), padding='same', activation='relu')(h)
h = MaxPool2D((2,1))(h)
h = Conv2D(filters=32, kernel_size=(2,2), padding='same', activation='relu')(h)
h = MaxPool2D((2,1))(h)

h = Flatten()(h)
# h = Dense(64, activation='relu')(h)
model_output = Dense(16, activation='softmax')(h)
model = Model(model_input, model_output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
es = EarlyStopping(monitor='val_loss', mode='min', patience=8, restore_best_weights=True)
model.fit(X_train, y_train, epochs=200, batch_size=16, validation_data=(X_test, y_test), callbacks=[es])

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 4491)]            0         
                                                                 
 embedding_1 (Embedding)     (None, 4491, 64)          15050944  
                                                                 
 reshape_1 (Reshape)         (None, 4491, 64, 1)       0         
                                                                 
 conv2d_10 (Conv2D)          (None, 4491, 64, 32)      160       
                                                                 
 max_pooling2d_10 (MaxPoolin  (None, 2245, 32, 32)     0         
 g2D)                                                            
                                                                 
 conv2d_11 (Conv2D)          (None, 2245, 32, 32)      4128      
                                                           

<keras.callbacks.History at 0x1ae8061b088>

In [38]:
model.save('./models/model_conv_net.h5')

In [None]:
# Test: CNN
model_input = Input(shape=(X_train.shape[1],))
h = Embedding(len(word_index)+1, 64, input_length=max_len)(model_input)
h = Reshape((X_train.shape[1], 64, 1))(h)
h = Conv2D(filters=32, kernel_size=(2,2), padding='same', activation='relu')(h)
h = MaxPool2D((2,2))(h)
h = Conv2D(filters=32, kernel_size=(2,2), padding='same', activation='relu')(h)
h = MaxPool2D((2,2))(h)
h = Conv2D(filters=32, kernel_size=(2,2), padding='same', activation='relu')(h)
h = MaxPool2D((2,1))(h)
h = Conv2D(filters=32, kernel_size=(2,2), padding='same', activation='relu')(h)
h = MaxPool2D((2,1))(h)
h = Conv2D(filters=32, kernel_size=(2,2), padding='same', activation='relu')(h)
h = MaxPool2D((2,1))(h)
h = Conv2D(filters=32, kernel_size=(2,2), padding='same', activation='relu')(h)
h = MaxPool2D((2,1))(h)
h = Conv2D(filters=32, kernel_size=(2,2), padding='same', activation='relu')(h)
h = MaxPool2D((2,1))(h)
h = Conv2D(filters=32, kernel_size=(2,2), padding='same', activation='relu')(h)
h = MaxPool2D((2,1))(h)
h = Conv2D(filters=32, kernel_size=(2,2), padding='same', activation='relu')(h)
h = MaxPool2D((2,1))(h)
h = Conv2D(filters=32, kernel_size=(2,2), padding='same', activation='relu')(h)
h = MaxPool2D((2,1))(h)
h = Flatten()(h)
# h = Dense(64, activation='relu')(h)
model_output = Dense(16, activation='softmax')(h)
model_cnn = Model(model_input, model_output)
model_cnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model_cnn.summary()
es = EarlyStopping(monitor='val_loss', mode='min', patience=8, restore_best_weights=True)
model_cnn.fit(X_train, y_train, epochs=200, batch_size=16, validation_data=(X_test, y_test), callbacks=[es])

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 4491)]            0         
                                                                 
 embedding_2 (Embedding)     (None, 4491, 64)          15050944  
                                                                 
 reshape_2 (Reshape)         (None, 4491, 64, 1)       0         
                                                                 
 conv2d_20 (Conv2D)          (None, 4491, 64, 32)      160       
                                                                 
 max_pooling2d_20 (MaxPoolin  (None, 2245, 32, 32)     0         
 g2D)                                                            
                                                                 
 conv2d_21 (Conv2D)          (None, 2245, 32, 32)      4128      
                                                           

In [None]:
# Test: CNN
model_input = Input(shape=(X_train.shape[1],))
h = Embedding(len(word_index)+1, 128, input_length=max_len)(model_input)
h = Reshape((X_train.shape[1], 128, 1))(h)
h = Conv2D(filters=64, kernel_size=(2,2), padding='same', activation='tanh')(h)
h = MaxPool2D((2,2))(h)
h = Conv2D(filters=64, kernel_size=(2,2), padding='same', activation='tanh')(h)
h = MaxPool2D((2,2))(h)
h = Conv2D(filters=64, kernel_size=(2,2), padding='same', activation='tanh')(h)
h = MaxPool2D((2,2))(h)
h = Conv2D(filters=64, kernel_size=(2,2), padding='same', activation='tanh')(h)
h = MaxPool2D((2,2))(h)
h = Flatten()(h)
model_output = Dense(16, activation='softmax')(h)
model_cnn = Model(model_input, model_output)
model_cnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model_cnn.summary()
es = EarlyStopping(monitor='val_loss', mode='min', patience=8, restore_best_weights=True)
model_cnn.fit(X_train, y_train, epochs=200, batch_size=64, validation_data=(X_test, y_test), callbacks=[es])

In [None]:
# Define the model
model = Sequential()
model.add(Embedding(len(word_index) + 1, 128, input_length=max_len))
model.add(Bidirectional(LSTM(32, return_sequences=False)))
model.add(Dense(16, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
# Train the model
es = EarlyStopping(monitor='val_loss', mode='min', patience=8, restore_best_weights=True)
model.fit(X_train, y_train, epochs=200, batch_size=64, validation_data=(X_test, y_test), callbacks=[es])

In [None]:
# Define the model
model = Sequential()
model.add(Embedding(len(word_index) + 1, 128, input_length=max_len))
model.add(Bidirectional(GRU(64, return_sequences=True)))
model.add(BatchNormalization())
model.add(Bidirectional(GRU(64, return_sequences=True)))
model.add(GlobalAveragePooling1D())
model.add(Dense(16, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
# Train the model
es = EarlyStopping(monitor='val_loss', mode='min', patience=8, restore_best_weights=True)
model.fit(X_train, y_train, epochs=200, batch_size=32, validation_data=(X_test, y_test), callbacks=[es])

In [None]:
# Define the model
model = Sequential()
model.add(Embedding(len(word_index) + 1, 128, input_length=max_len))
model.add(Bidirectional(GRU(32, return_sequences=True)))
model.add(GlobalAveragePooling1D())
model.add(Dense(16, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
# Train the model # GRU 32 with bidirectional layer, Embedding 128
es = EarlyStopping(monitor='val_loss', mode='min', patience=8, restore_best_weights=True)
model.fit(X_train, y_train, epochs=200, batch_size=64, validation_data=(X_test, y_test), callbacks=[es])

In [None]:
# Save the model
model.save('./models/model.h5')

In [None]:
# 사용방법 아래 참고

In [None]:
model = load_model('./models/model.h5')

In [None]:
# Example usage
text = "I'm fucking excited about this project! Hell Yeah!! Do it now! teala bitcoin doge"
prediction = predict_mbti(text)
prediction_sorted = sorted(prediction.items(), key=lambda x: x[1], reverse=True)
for k, v in prediction.items():
    print(f"{k}: {v:.4f}")
print("\nTop MBTI result: %s, %.2f%%" %(prediction_sorted[0][0], prediction_sorted[0][1]*100.))

In [None]:
# Example usage
text = "Dude Bro great really very interesting. emoji thumbs up me myself girl pretty no no well great"
prediction = predict_mbti(text)
prediction_sorted = sorted(prediction.items(), key=lambda x: x[1], reverse=True)
for k, v in prediction.items():
    print(f"{k}: {v:.4f}")
print("\nTop MBTI result: %s, %.2f%%" %(prediction_sorted[0][0], prediction_sorted[0][1]*100.))

In [None]:
# Define the translation function
def translate_text(text, source_language='ko', target_language='en'):
    url = 'https://translate.googleapis.com/translate_a/single?client=gtx&sl={}&tl={}&dt=t&q={}'
    response = requests.get(url.format(source_language, target_language, text)).json()
    try:
        translation = response[0][0][0]
    except (IndexError, TypeError):
        print(f"Translation failed for text: {text}")
        translation = ""
    return translation

# Define function to make predictions
def predict_mbti(text):
    texts = preprocess_text(pd.Series([text]))
    sequences = tokenizer.texts_to_sequences(texts)
    padded_sequences = pad_sequences(sequences, maxlen=max_len)
    prediction = model_cnn.predict(padded_sequences, verbose=0)[0]
    labels = label_dict.keys()
    result = {label: prediction[idx] for idx, label in enumerate(labels)}
    return result

# Define the main function to classify each user's MBTI type
def predict_mbti_kakaotalk_line_by_line(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    chat_room_name = lines[0].strip()
    saved_date = lines[1].strip()
    print(f"Chat Room: {chat_room_name}, Saved Date: {saved_date}")
    current_user = ''
    for line in lines[2:]:
        if '[' in line:
            current_user = re.search(r'\[(.*?)\]', line).group(1)
        else:
            continue
        translated_line = translate_text(line)
        prediction = predict_mbti(translated_line)
        top_k = sorted(prediction.items(), key=lambda x: x[1], reverse=True)[:3]
        print(f"{current_user}: {top_k}")


In [None]:
def predict_mbti_kakaotalk(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Get chat room name and saved date
    chat_room_name = lines[0].strip()
    saved_date = lines[1].strip()
    print(f"Chat Room: {chat_room_name}, Saved Date: {saved_date}")

    # Group messages by user
    user_messages = {}
    current_user = None
    for line in lines[2:]:
        if '[' in line:
            current_user = re.search(r'\[(.*?)\]', line).group(1)
            if current_user not in user_messages:
                user_messages[current_user] = []
        elif current_user is not None and line.strip() != '':
            user_messages[current_user].append(line.strip())

    # Translate and predict MBTI for each user
    mbti_results = {}
    for user, messages in user_messages.items():
        print(f"\nProcessing messages for user: {user}")
        print(f"Messages: {messages}")
        translated_messages = [translate_text(message) for message in messages]
        prediction = predict_mbti(' '.join(translated_messages))
        top_k = sorted(prediction.items(), key=lambda x: x[1], reverse=True)[:3]
        mbti_results[user] = top_k

    # Print MBTI results by user
    for user, results in mbti_results.items():
        print(f"{user}: {results}")

In [None]:
predict_mbti_kakaotalk('./KakaoTalk_20230504_1223_50_096_group.txt')

In [None]:
predict_mbti_kakaotalk('./KakaoTalk_20230504_1304_13_543_group.txt')

In [None]:
# (1) 성능 개선의 방법
# 전처리 고도화
# 모델 고도화
# Transformer, BERT (GPT알고리즘) 모형 사용하기
# 기존 학습되어 있는 언어모델 레이어 불러와서 사용하기

# (2) 카카오톡 대화 내용 넣기 (완료)
# 번역 (완료)
# 복수의 문장이 들어가서 MBTI 출력되게 구현하기 (완료)
# 정규화 개선해야함.