# 데모

## 라이브러리 import 및 설정

In [1]:
# Notebook setup: auto-reload imported modules before each cell runs and
# render matplotlib figures inline.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from matplotlib import rcParams, pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import re
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM, Dropout, Flatten
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.optimizers import Adam
import warnings 
warnings.filterwarnings(action='ignore')

In [3]:
# Record the TensorFlow version this notebook was run with.
print(tf.__version__)

2.3.0


In [4]:
# Pin TensorFlow to the first physical GPU when the machine has one.
gpus = tf.config.experimental.list_physical_devices('GPU')
if not gpus:
    print('No GPU detected')
else:
    try:
        # Expose only the first GPU to TensorFlow.
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
        # Visible devices can only be changed before the GPUs are initialised.
        print(e)

1 Physical GPUs, 1 Logical GPU


In [5]:
# Notebook-wide display defaults for figures and DataFrames.
rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
# Use the fully qualified option name: the bare 'max_columns' pattern also
# matches 'styler.render.max_columns' on newer pandas and raises OptionError.
pd.set_option('display.max_columns', 100)
pd.set_option("display.precision", 4)
warnings.simplefilter('ignore')

## GloVe 임베딩 로드

http://nlp.stanford.edu/data/glove.6B.zip 를 다운받아 `./dataset/glove.6B/` 에 압축을 푼다 (아래 셀의 `glove_file` 경로 참고).

In [6]:
# --- Directory layout ---
data_dir = Path('./open')
feature_dir = Path('./build/feature')
val_dir = Path('./ensemble/build/val')
tst_dir = Path('./ensemble/build/tst')
sub_dir = Path('./open/sub')

# Create every output directory up front (no error if it already exists).
dirs = [feature_dir, val_dir, tst_dir, sub_dir]
for d in dirs:
    d.mkdir(parents=True, exist_ok=True)

# --- Input files ---
trn_file = data_dir.joinpath('train_processed_500.csv')
tst_file = data_dir.joinpath('test_x.csv')
sample_file = data_dir.joinpath('sample_submission.csv')
glove_file = './dataset/glove.6B/glove.6B.100d.txt'

# --- Experiment constants ---
target_col = 'author'
n_fold = 5
n_class = 5
seed = 42

In [7]:
# Every artefact written by this run is keyed by "<algorithm>_<feature>".
algo_name, feature_name = 'lstm', 'glove'
model_name = f'{algo_name}_{feature_name}'

# Output locations: feature dump, out-of-fold / test predictions, submission.
feature_file = feature_dir.joinpath(f'{feature_name}.csv')
p_val_file = val_dir.joinpath(f'{model_name}.val.csv')
p_tst_file = tst_dir.joinpath(f'{model_name}.tst.csv')
sub_file = sub_dir.joinpath(f'{model_name}.csv')

In [8]:
# Parse the GloVe text file into a {token: vector} lookup table.
embeddings_index = {}
with open(glove_file, encoding = 'UTF8') as f:
    for line in f:
        # Each row is "<token> <v1> <v2> ..."; split off the token only once.
        word, coefs = line.split(maxsplit=1)
        embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")

print(f'Found {len(embeddings_index)} word vectors.')

Found 400001 word vectors.


## 학습데이터 로드

In [9]:
# Load the training set; the first CSV column is used as the index.
train = pd.read_csv(trn_file, index_col=0)
train.head()

Unnamed: 0_level_0,text,author,length
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,He was almost choking. There was so much so mu...,3,235
1,Your sister asked for it I suppose,2,34
2,She was engaged one day as she walked in peru...,1,312
3,The captain was in the porch keeping himself c...,4,305
4,Have mercy gentlemen odin flung up his hands. ...,3,215


In [10]:
# Load the test set; the first CSV column is used as the index.
test = pd.read_csv(tst_file, index_col=0)
test.head()

Unnamed: 0_level_0,text
index,Unnamed: 1_level_1
0,“Not at all. I think she is one of the most ch...
1,"""No,"" replied he, with sudden consciousness, ""..."
2,As the lady had stated her intention of scream...
3,“And then suddenly in the silence I heard a so...
4,His conviction remained unchanged. So far as I...


## Preprocessing

In [11]:
def alpha_num(text):
    """Return `text` with every character removed that is not an ASCII letter, digit, or space."""
    disallowed = re.compile(r'[^A-Za-z0-9 ]')
    return disallowed.sub('', text)


def remove_stopwords(text):
    """
    Drop stop words from a whitespace-separated string.

    Tokens are compared case-insensitively against the stop-word list; the
    surviving tokens keep their original casing and are re-joined with single
    spaces. An empty or all-stop-word input yields "".
    """
    # Membership is checked against the frozenset built below (resolved at call
    # time), so each lookup is O(1) instead of a linear scan of the list.
    return " ".join(tok for tok in text.split() if tok.lower() not in _STOPWORD_SET)


# NOTE: the preprocessing cell applies alpha_num() before remove_stopwords(),
# which strips apostrophes, so the contraction entries ("he'd", "it's", ...)
# cannot match text that went through that pipeline.
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "odin",
             "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", 
             "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", 
             "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", 
             "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", 
             "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", 
             "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", 
             "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", 
             "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", 
             "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", 
             "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

# Set view of `stopwords` used by remove_stopwords(); rebuilt whenever this cell is re-run.
_STOPWORD_SET = frozenset(stopwords)

In [12]:
# Normalise both splits identically: lower-case, strip non-alphanumerics, drop stop words.
for _frame in (train, test):
    _lowered = _frame['text'].str.lower()
    _frame['text'] = _lowered.apply(alpha_num).apply(remove_stopwords)

In [13]:
# Drop training rows whose 'length' column is below 20.
idx_under_20 = train[train['length'] < 20].index
train = train.drop(idx_under_20)
# reset_index() keeps the old index as a regular column (it shows up as
# 'index' in the describe() output) and renumbers the rows from 0.
train = train.reset_index()
train.describe()

Unnamed: 0,index,author,length
count,181013.0,181013.0,181013.0
mean,91514.587,2.0491,164.5379
std,53025.4182,1.2863,178.6952
min,0.0,0.0,20.0
25%,45490.0,1.0,65.0
50%,91441.0,2.0,113.0
75%,137458.0,3.0,189.0
max,183467.0,4.0,2454.0


In [14]:
# Pull out plain numpy arrays: train/test texts and the author labels.
trn = train['text'].values
tst = test['text'].values
y = train['author'].values
print(trn.shape, tst.shape, y.shape)

(181013,) (19617,) (181013,)


In [15]:
# Build the text -> token-id layer: vocabulary capped at 100,000 tokens and
# output sequences of length 500. The vocabulary is learned from the
# training texts only, fed in batches of 128.
vectorizer = TextVectorization(max_tokens=100000, output_sequence_length=500)
text_ds = tf.data.Dataset.from_tensor_slices(trn).batch(128)
vectorizer.adapt(text_ds)

In [16]:
# Peek at the first ten vocabulary entries.
vectorizer.get_vocabulary()[:10]

['', '[UNK]', 'not', 'one', 'no', 'said', 'mr', 'upon', 'will', 'now']

In [17]:
# Map every vocabulary token to its integer id, i.e. its position in the vocabulary.
voc = vectorizer.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}

In [18]:
# The matrix has two more rows than the vocabulary has entries.
num_tokens = len(voc) + 2
embedding_dim = 100
hits = misses = 0

# Prepare embedding matrix. Rows of tokens that GloVe does not know are left
# all-zero (per the original note, this includes "padding" and "OOV").
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        misses += 1
        continue
    embedding_matrix[i] = embedding_vector
    hits += 1
print(f"Converted {hits} words ({misses} misses)")

Converted 29920 words (17233 misses)


In [19]:
# Frozen (non-trainable) embedding layer initialised from the GloVe matrix.
# This single layer instance is shared by every model built in this notebook.
embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=Constant(embedding_matrix),
    trainable=False,
)

## HyperOpt 하이퍼 파라미터 튜닝

In [20]:
from hyperopt import Trials, STATUS_OK, tpe, fmin, hp, space_eval
from sklearn.model_selection import train_test_split


In [21]:
def data(batch_size, time_steps, trn, y):
    """
    Split features and labels into a training and a validation part.

    Parameters
    ----------
    batch_size, time_steps :
        Accepted for backward compatibility; the split does not depend on them.
    trn :
        Feature array to split.
    y :
        Labels aligned with `trn`.

    Returns
    -------
    tuple
        (X_trn, X_val, y_trn, y_val) from an 80/20 split seeded with the
        notebook-level `seed`.
    """
    # The previous body split an undefined name `X`; the features arrive as `trn`.
    X_trn, X_val, y_trn, y_val = train_test_split(trn, y, test_size=.2, random_state=seed)
    return X_trn, X_val, y_trn, y_val

In [22]:
# Conditional sub-spaces: the 'layers' tag tells the model builder whether an
# optional second LSTM / Dense layer (and its own hyper-parameters) was sampled.
_lstm_layer_options = [
    {'layers': 'one'},
    {
        'layers': 'two',
        'lstm2_nodes': hp.choice('units_lstm2', [20, 30, 40, 50]),
        'lstm2_dropouts': hp.uniform('dos_lstm2', 0, 1),
    },
]
_dense_layer_options = [
    {'layers': 'one'},
    {
        'layers': 'two',
        'dense2_nodes': hp.choice('units_dense', [10, 20, 30, 40]),
    },
]

# Full hyperopt search space; plain values are constants passed through unchanged.
search_space = dict(
    batch_size=hp.choice('bs', [30, 40, 50, 60, 70]),
    time_steps=hp.choice('ts', [30, 50, 60, 80, 90]),
    embedding_dim=100,
    vocab_size=10000,
    lstm1_nodes=hp.choice('units_lsmt1', [64, 80, 100, 128]),
    lstm1_dropouts=hp.uniform('dos_lstm1', 0, 1),
    lstm_layers=hp.choice('num_layers_lstm', _lstm_layer_options),
    dense_layers=hp.choice('num_layers_dense', _dense_layer_options),
    lr=hp.loguniform('lr', np.log(0.01), np.log(0.2)),
    epochs=30,
    optimizer="Adam",
)

# Configure GPU memory before any model is built.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            # Allocate GPU memory on demand instead of reserving it all up front.
            tf.config.experimental.set_memory_growth(gpu, True)
        # NOTE(review): the 1024 MB cap may be too small for the cuDNN LSTM
        # kernels (the recorded run failed with "Fail to find the dnn
        # implementation") - confirm, and raise or drop the limit if so.
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)])
    except (RuntimeError, ValueError) as e:
        # Device settings cannot be changed once the GPUs are initialised, and
        # re-running this cell conflicts with the already-set configuration.
        print(e)
else:
    # Without this guard gpus[0] raised IndexError on CPU-only machines.
    print('No GPU detected')

In [23]:
def create_model_hypopt(params):
    """
    Hyperopt objective: build, train and score one model for `params`.

    Parameters
    ----------
    params : dict
        One sample of the search space. Keys read here: "batch_size",
        "lstm1_nodes", "lstm1_dropouts", "lstm_layers", "dense_layers",
        "lr" and "epochs". Other keys (e.g. "time_steps") are not consumed.

    Returns
    -------
    dict
        {'loss': lowest validation loss over the epochs,
         'status': STATUS_OK,
         'model': the trained Keras model}.
    """
    print("Trying params:",params)
    batch_size = params["batch_size"]

    # Fixed 80/20 hold-out split; the same seed gives every trial the same split.
    X_trn, X_val, y_trn, y_val = train_test_split(trn, y, test_size=.2, random_state=seed)

    es = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=3,
                       verbose=1, mode='min', baseline=None, restore_best_weights=True)

    # Raw strings in -> token ids -> frozen GloVe embeddings.
    int_sequences_input = Input(shape=(1,), dtype=tf.string)
    vectorized_sequences = vectorizer(int_sequences_input)
    embedded_sequences = embedding_layer(vectorized_sequences)
    x = Bidirectional(LSTM(params["lstm1_nodes"],
                           dropout=params["lstm1_dropouts"],
                           return_sequences=True,
                           kernel_initializer='random_uniform'))(embedded_sequences)
    if params["lstm_layers"]["layers"] == "two":
        # The second LSTM returns only its final state, collapsing the time axis.
        x = Bidirectional(LSTM(params["lstm_layers"]["lstm2_nodes"],
                               dropout=params["lstm_layers"]["lstm2_dropouts"]))(x)
    else:
        # With a single LSTM the per-timestep outputs are flattened instead.
        x = Flatten()(x)
    if params["dense_layers"]["layers"] == 'two':
        x = Dense(params["dense_layers"]["dense2_nodes"], activation='relu')(x)

    preds = Dense(n_class, activation="softmax")(x)
    model = Model(int_sequences_input, preds)

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    optimizer = Adam(learning_rate=params["lr"])
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)

    # num_classes pins the one-hot width even if a split lacks the highest label.
    # validation_data must be a tuple (x_val, y_val), not a list.
    history = model.fit(X_trn, to_categorical(y_trn, num_classes=n_class),
                        epochs=params["epochs"], verbose=1, batch_size=batch_size,
                        validation_data=(X_val, to_categorical(y_val, num_classes=n_class)),
                        callbacks=[es])
    val_error = float(np.amin(history.history['val_loss']))
    print('Best validation error of epoch:', val_error)
    # Return the model that was actually trained (the old code returned an undefined
    # `lstm_model`). If optimising accuracy instead, negate it: hyperopt minimises 'loss'.
    return {'loss': val_error, 'status': STATUS_OK, 'model': model}

# The Trials object stores the result dict returned by every evaluation of the
# objective, so it can be analysed later (see `trials.trials`).
trials = Trials()
best = fmin(create_model_hypopt,
    space=search_space,
    algo=tpe.suggest, # Tree-structured Parzen Estimator search
    max_evals=20, # max number of evaluations of the objective function
    trials=trials)

# For hp.choice entries fmin() reports the chosen *index*; space_eval() maps
# the result back to the concrete hyper-parameter values.
hyperparams = space_eval(search_space, best)

# 1-based epoch with the lowest validation loss in the best trial, read from
# the training history Keras attaches to the model. (Keras models have no
# `best_iteration_` attribute.)
best_model = trials.best_trial['result']['model']
n_best = int(np.argmin(best_model.history.history['val_loss'])) + 1

# `params` was never defined before this point, so build it from the best
# hyper-parameters instead of calling .update() on it.
params = dict(hyperparams)
print(params)

Trying params:                                        
{'batch_size': 60, 'dense_layers': {'dense2_nodes': 20, 'layers': 'two'}, 'embedding_dim': 100, 'epochs': 30, 'lr': 0.168558372570222, 'lstm1_dropouts': 0.8530677411712584, 'lstm1_nodes': 100, 'lstm_layers': {'layers': 'two', 'lstm2_dropouts': 0.5492024034251537, 'lstm2_nodes': 30}, 'optimizer': 'Adam', 'time_steps': 90, 'vocab_size': 10000}
Epoch 1/30                                            
  0%|          | 0/20 [00:01<?, ?trial/s, best loss=?]

job exception:    Fail to find the dnn implementation.
	 [[{{node CudnnRNN}}]]
	 [[functional_1/bidirectional/forward_lstm/PartitionedCall]] [Op:__inference_train_function_15730]

Function call stack:
train_function -> train_function -> train_function




  0%|          | 0/20 [00:06<?, ?trial/s, best loss=?]


UnknownError:    Fail to find the dnn implementation.
	 [[{{node CudnnRNN}}]]
	 [[functional_1/bidirectional/forward_lstm/PartitionedCall]] [Op:__inference_train_function_15730]

Function call stack:
train_function -> train_function -> train_function


## Training

In [25]:
# Stratified, shuffled K-fold splitter; seeded so the folds are reproducible.
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

In [24]:
def get_model(**params):
    """
    Build and compile the fixed-architecture classifier.

    Three stacked bidirectional LSTMs (128 units, dropout 0.2) on top of the
    shared vectorizer and embedding layer, followed by a 64-unit ReLU layer and
    an `n_class`-way softmax. Keyword arguments are accepted but not used.
    """
    text_input = Input(shape=(1,), dtype=tf.string)
    hidden = embedding_layer(vectorizer(text_input))

    # The first two recurrent layers keep the time axis; the last one collapses it.
    for _ in range(2):
        hidden = Bidirectional(LSTM(128, dropout=0.2, return_sequences=True))(hidden)
    hidden = Bidirectional(LSTM(128, dropout=0.2,))(hidden)

    hidden = Dense(64, activation="relu")(hidden)
    class_probs = Dense(n_class, activation="softmax")(hidden)

    classifier = Model(text_input, class_probs)
    classifier.compile(loss='categorical_crossentropy',
                       optimizer=Adam(learning_rate=.01))
    return classifier

In [None]:
def get_model_hypopt(params):
    """
    Build and compile (but do not train) a model from a hyper-parameter dict.

    Parameters
    ----------
    params : dict
        Keys read here: "lstm1_nodes", "lstm1_dropouts", "lstm_layers",
        "dense_layers" and "lr". Training settings such as batch size, epoch
        count and early stopping are left to the caller.

    Returns
    -------
    The compiled Keras model.
    """
    print("Trying params:",params)

    # Raw strings in -> token ids -> frozen GloVe embeddings.
    int_sequences_input = Input(shape=(1,), dtype=tf.string)
    vectorized_sequences = vectorizer(int_sequences_input)
    embedded_sequences = embedding_layer(vectorized_sequences)
    x = Bidirectional(LSTM(params["lstm1_nodes"],
                           dropout=params["lstm1_dropouts"],
                           return_sequences=True,
                           kernel_initializer='random_uniform'))(embedded_sequences)
    if params["lstm_layers"]["layers"] == "two":
        # The second LSTM returns only its final state, collapsing the time axis.
        x = Bidirectional(LSTM(params["lstm_layers"]["lstm2_nodes"],
                               dropout=params["lstm_layers"]["lstm2_dropouts"]))(x)
    else:
        # With a single LSTM the per-timestep outputs are flattened instead.
        x = Flatten()(x)
    if params["dense_layers"]["layers"] == 'two':
        x = Dense(params["dense_layers"]["dense2_nodes"], activation='relu')(x)

    preds = Dense(n_class, activation="softmax")(x)
    model = Model(int_sequences_input, preds)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(learning_rate=params["lr"]))

    return model

In [26]:
# Out-of-fold predictions for the training set and fold-averaged test predictions.
p_val = np.zeros((trn.shape[0], n_class))
p_tst = np.zeros((tst.shape[0], n_class))
for i, (i_trn, i_val) in enumerate(cv.split(trn, y), 1):
    print(f'training model for CV #{i}')
    es = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=3,
                       verbose=1, mode='min', baseline=None, restore_best_weights=True)

    # `params` holds the tuned hyper-parameters from the HyperOpt cell.
    # (The call used to be misspelled `get_model_hyopt`, a NameError.)
    clf = get_model_hypopt(params)
    # num_classes pins the one-hot width to n_class for every fold.
    clf.fit(trn[i_trn],
            to_categorical(y[i_trn], num_classes=n_class),
            validation_data=(trn[i_val], to_categorical(y[i_val], num_classes=n_class)),
            epochs=100,
            batch_size=512,
            callbacks=[es])
    p_val[i_val, :] = clf.predict(trn[i_val])
    # Each fold contributes 1/n_fold of the final test prediction.
    p_tst += clf.predict(tst) / n_fold

training model for CV #1
Epoch 1/100


UnknownError:    Fail to find the dnn implementation.
	 [[{{node CudnnRNN}}]]
	 [[functional_3/bidirectional_2/forward_lstm_2/PartitionedCall]] [Op:__inference_train_function_32402]

Function call stack:
train_function -> train_function -> train_function


In [None]:
# Score the out-of-fold predictions: accuracy of the arg-max class and
# multi-class log loss against the one-hot encoded labels.
print(f'Accuracy (CV): {accuracy_score(y, np.argmax(p_val, axis=1)) * 100:8.4f}%')
print(f'Log Loss (CV): {log_loss(pd.get_dummies(y), p_val):8.4f}')

In [None]:
# Save the out-of-fold and test predictions as comma-separated text with 6 decimals.
np.savetxt(p_val_file, p_val, fmt='%.6f', delimiter=',')
np.savetxt(p_tst_file, p_tst, fmt='%.6f', delimiter=',')

## 시각화

In [None]:
# Layer-by-layer summary of the model trained in the last CV fold.
clf.summary()

In [None]:
# Draw the architecture graph of the last fold's model.
plot_model(clf)

## 제출 파일 생성

In [None]:
# Load the sample submission file to use as the template for the output.
sub = pd.read_csv(sample_file, index_col=0)
print(sub.shape)
sub.head()

In [None]:
# Overwrite every column of the template with the averaged test predictions.
# assumes the template's columns are in the same order as the class ids - TODO confirm
sub[sub.columns] = p_tst
sub.head()

In [None]:
# Write the submission CSV (index included).
sub.to_csv(sub_file)