# 모델

In [None]:
# problem 파일 경로 Desktop/project/code_similarity/open/code

# problem 폴더 안 예제 코드 확장자명 변경
import os
from tqdm import tqdm
import sys
import tensorflow as tf
import json
import datetime

# .txt 병합
import pandas as pd
import numpy as np
import sklearn

#keras model import
import keras
import matplotlib.pyplot as plt

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Model
from keras import Input, layers
from tensorflow.keras.layers import Embedding, LSTM, Input, Lambda,Dropout
from tensorflow.keras.callbacks import EarlyStopping
from keras import backend as K
from tensorflow.keras.optimizers import Adam,RMSprop
#model save & load
from keras.models import load_model

In [None]:
# Load the training pairs; the first CSV column becomes the row index.
train_data = pd.read_csv(
    "./drive/MyDrive/code_similarity/data/open_code_train.csv",
    encoding='utf-8',
    index_col=0,
)

In [None]:
# Remove the two problem-name columns in place; the cells below only read
# 'code1', 'code2' and 'similar'.
train_data.drop(columns=['c1_problem_name', 'c2_problem_name'], inplace=True)
train_data

In [None]:
# Sanity check: show the container type of each code column, then preview
# the first rows of the frame.
for column_name in ('code1', 'code2'):
    print(type(train_data[column_name]))
print(train_data.head())

In [None]:
# Load the test pairs; the first CSV column becomes the row index.
test_data = pd.read_csv(
    "./drive/MyDrive/code_similarity/data/open_code_test.csv",
    encoding='utf-8',
    index_col=0,
)

In [None]:
# Disabled debugging aid: uncomment to keep only the first rows of the test set.
# test_data = test_data.head()
test_data

## 토큰화

In [None]:
# Create the tokenizer with its default settings; it is fitted in the next cell.
tokenizer = Tokenizer()

In [None]:
# Build the word index from both code columns of the training set.
for column_name in ('code1', 'code2'):
    tokenizer.fit_on_texts(train_data[column_name])

In [None]:
# Turn every code string into a list of integer word indices.
sequences_1, sequences_2 = (
    tokenizer.texts_to_sequences(train_data[column_name])
    for column_name in ('code1', 'code2')
)

In [None]:
# Training targets: the 'similar' column (despite the name, this is the label
# series passed to fit(), not a validation split).
train_val = train_data['similar']
train_val.head()

In [None]:
# The tokenizer's word indices start at 1; index 0 is left for padding, so the
# embedding table needs one more row than there are words.
vocab_size = 1 + len(tokenizer.word_index)
print(vocab_size)

In [None]:
# Length of the longest tokenised sequence across both code columns.
MAX_SEQUENCE_LENGTH = max(map(len, sequences_1 + sequences_2))

# Mean sequence length; note that it is computed from sequences_1 only.
mean_length_1 = sum(len(seq) for seq in sequences_1) / len(sequences_1)
ABR__SEQUENCE_LENGTH = round(mean_length_1)

# Report the maximum, the exact mean and the rounded mean.
print('코드의 최대 길이 : {}'.format(MAX_SEQUENCE_LENGTH))
print('코드의 평균길이 : {}'.format(mean_length_1))
print('코드의 평균길이 : %d'% ABR__SEQUENCE_LENGTH)

In [None]:
# Pad/truncate every training sequence to 128 tokens. This is the length the
# model's Input layers are declared with and the length the test sequences are
# padded to; padding to MAX_SEQUENCE_LENGTH instead yields arrays whose second
# dimension the model does not accept. With pad_sequences' defaults, shorter
# sequences are zero-padded at the front and longer ones keep their last 128
# tokens.
X_train_1 = pad_sequences(sequences_1, maxlen=128)
X_train_2 = pad_sequences(sequences_2, maxlen=128)

# Both arrays should report (number_of_pairs, 128).
print(X_train_1.shape)
print(X_train_2.shape)
print(type(X_train_2))

In [None]:
# Model variables
# vocab_size, MAX_SEQUENCE_LENGTH and ABR__SEQUENCE_LENGTH are already defined
# by the tokenisation cells above, so they are not re-assigned here.
batch_size = 512      # samples per gradient update
epoch = 150           # maximum number of training epochs
emb_output_dim = 64   # size of each embedding vector
DROPOUT_RATIO = 0.25  # fraction of encoder outputs dropped during training

def exponent_neg_manhattan_distance(left, right):
    """Return exp(-L1 distance) between two batches of encodings.

    The absolute element-wise difference of ``left`` and ``right`` is summed
    along axis 1 (the summed axis is kept with size 1), negated and
    exponentiated. Because the summed distance is never negative, the result
    lies in (0, 1], reaching 1 when the two encodings are identical.
    """
    l1_distance = K.sum(K.abs(left - right), axis=1, keepdims=True)
    return K.exp(-l1_distance)

In [None]:
# Number of token ids in every model input. The Input layers and the
# Embedding's input_length both read this constant so they cannot disagree.
MODEL_INPUT_LENGTH = 128

# The visible layer: one integer sequence per code snippet of the pair.
input_tensor1 = layers.Input(shape=(MODEL_INPUT_LENGTH,), dtype='int32', name='code1')
input_tensor2 = layers.Input(shape=(MODEL_INPUT_LENGTH,), dtype='int32', name='code2')

# A single embedding layer applied to both inputs. mask_zero=True treats index
# 0 (the padding value) as masked for the layers that follow.
# NOTE(review): trainable=False freezes the embedding, and no pretrained
# weights are supplied, so the vectors stay at their random initial values —
# confirm this is intended.
embedding_layer = layers.Embedding(input_dim=vocab_size,
                                   output_dim=emb_output_dim,
                                   input_length=MODEL_INPUT_LENGTH,
                                   mask_zero=True,
                                   trainable=False)

# Embed both inputs with the same layer.
encoded_1 = embedding_layer(input_tensor1)
encoded_2 = embedding_layer(input_tensor2)

# Encode each embedded sequence into a 64-dimensional vector.
# NOTE(review): each branch creates its own LSTM instance, so the two encoders
# do not share weights. If a weight-sharing (Siamese) encoder was intended,
# one LSTM layer should be applied to both inputs — confirm.
encoded_LSTM1 = layers.LSTM(64, activation='tanh')(encoded_1)
encoded_LSTM2 = layers.LSTM(64, activation='tanh')(encoded_2)

# Dropout on each encoding.
drop1 = layers.Dropout(DROPOUT_RATIO)(encoded_LSTM1)
drop2 = layers.Dropout(DROPOUT_RATIO)(encoded_LSTM2)

# Wrap the similarity helper in a Lambda layer so it can be used inside the
# functional-API graph; the output is one similarity score per input pair.
main_malstm_distance = layers.Lambda(lambda x: exponent_neg_manhattan_distance(x[0], x[1]), output_shape=lambda x: (x[0][0], 1))([drop1,drop2])


# Two-input model whose output is the similarity score.
malstm = Model([input_tensor1, input_tensor2], outputs = main_malstm_distance)

# Train with mean squared logarithmic error and RMSprop; accuracy is reported
# as an additional metric.
malstm.compile(loss='mean_squared_logarithmic_error', optimizer=RMSprop(learning_rate=0.001), metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the model built above.
malstm.summary()

In [None]:
# Early stopping: end training once the validation loss has gone 5 epochs
# without reaching a new minimum.
early_stopping = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=5,
)

In [None]:
# Train the model on the padded code pairs. The epoch limit comes from the
# `epoch` model variable instead of a second hard-coded 150, so the setting
# lives in one place. validation_split=0.2 holds out 20% of the samples for
# validation, and early stopping may end training before the limit is reached.
history = malstm.fit(
    [X_train_1, X_train_2],
    train_val,
    batch_size=batch_size,
    epochs=epoch,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1,
)

In [None]:
# Save the trained model to Drive.
malstm.save('./drive/MyDrive/code_similarity/model_smilarity_MSLE_128_RE')

In [None]:
# Plot training and validation accuracy per epoch.
for history_key in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[history_key])
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

In [None]:
# Plot training and validation loss per epoch.
for history_key in ('loss', 'val_loss'):
    plt.plot(history.history[history_key])
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train', 'Validation'], loc='upper right')
plt.show()

In [None]:
# Integer-encode the test set with the vocabulary learned from the training
# data. The tokenizer must NOT be re-fitted here: fit_on_texts rebuilds
# word_index from the accumulated word counts, which renumbers words the model
# was trained on and creates indices beyond the vocab_size the embedding layer
# was built with. Words never seen during training are simply skipped by
# texts_to_sequences (the tokenizer has no OOV token configured).
test_sequences_1 = tokenizer.texts_to_sequences(test_data['code1'])
test_sequences_2 = tokenizer.texts_to_sequences(test_data['code2'])

# Pad/truncate to the 128-token length the model inputs are declared with.
X_test_1 = pad_sequences(test_sequences_1, maxlen=128)
X_test_2 = pad_sequences(test_sequences_2, maxlen=128)

# Both arrays should report (number_of_pairs, 128).
print(X_test_1.shape)
print(X_test_2.shape)
print(type(X_test_1))

In [None]:
# Load a saved model from Drive for inference.
# NOTE(review): this path has no `_RE` suffix, unlike the path the model
# trained in this notebook is saved to — confirm that loading this other
# checkpoint is intended.
model = load_model('./drive/MyDrive/code_similarity/model_smilarity_MSLE_128')

In [None]:
# Model inference: score every test pair with the loaded model.
preds = model.predict([X_test_1, X_test_2])

In [None]:
# Inspect the first ten raw prediction values.
preds[:10]

In [None]:
# Binarise the scores: 1 where the score is strictly above the threshold,
# otherwise 0.
threshold = 0.5
preds = (preds > threshold).astype(int)


In [None]:
# Save the submission: read the submission file, fill in the predicted labels,
# show the label distribution, then write the file back to the same path.
submission_path = './drive/MyDrive/code_similarity/data/submission.csv'
submission = pd.read_csv(submission_path)
submission['similar'] = preds
print(submission['similar'].value_counts())
submission.to_csv(submission_path, index=False)