In [None]:
# Clone the project repository into the current working directory.
!git clone https://github.com/jaaaamj0711/Coverletter-Helper.git

In [None]:
!pip install tensorflow

In [2]:
# Change the working directory to Coverletter-Helper so that relative paths used
# afterwards resolve inside it.
%cd Coverletter-Helper

/home/jupyterlab/DI_LAB/Hackerthon/Coverletter-Helper


In [2]:
import pandas as pd
import numpy as np
from string import punctuation
from sklearn.model_selection import train_test_split
from collections import Counter
import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
# Read the dataset into a list of lines (line terminators removed).
# The context manager closes the file handle as soon as the read is done instead
# of leaving it open for the lifetime of the kernel.
# NOTE(review): no encoding is passed, so the platform default applies — confirm
# that it matches how dataset.txt was written.
with open('dataset.txt', 'r') as f:
    df = f.read().splitlines()

In [None]:
# Drop every sample whose value is exactly "Unknown", then report how many remain.
headline = list(filter(lambda sample: sample != "Unknown", df))
print('노이즈값 제거 후 샘플의 개수 : {}'.format(len(headline)))

In [None]:
text = headline

# Tokenization: count every whitespace-separated token over all lines.
vocab = Counter(token for line in text for token in line.split())

# Vocabulary size: number of distinct tokens plus one (token ids are assigned
# starting from 1 in the encoding step, so id 0 is left unassigned).
vocab_size = len(vocab) + 1
print('단어 집합의 크기 : %d' % vocab_size)

In [27]:
# 인코딩
word_to_index = {word: i+1 for i, (word, _) in enumerate(vocab.most_common())}
index_to_word = {i+1: word for word, i in word_to_index.items()}

sequences = []
for line in text:
    encoded = [word_to_index[word] for word in line.split()]
    for i in range(1, len(encoded)):
        sequences.append(encoded[:i+1])

In [28]:
# Padding: left-pad each sequence with 0 up to the longest length.
# The zeros go in front so that the final column always holds the real last
# token of the sequence; the data-split step takes that last column as the
# prediction target, and right-padding would turn it into 0 for every sequence
# shorter than max_len.
max_len = max(len(s) for s in sequences)
sequences = [[0]*(max_len-len(s)) + s for s in sequences]

In [29]:
# Data split: every column except the last is the model input,
# the last column is the target.
sequences = torch.tensor(sequences)
X, y = sequences[:, :-1], sequences[:, -1]

In [30]:
# Model
class TextModel(nn.Module):
    """Embedding -> LSTM -> linear classifier that scores the next token.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of scores produced per sample.
        pad_idx: token id treated as padding by the embedding layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        # batch_first=True: the input is laid out as (batch, seq_len), so the LSTM
        # must iterate over dim 1 as time rather than over the batch dimension.
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Return one row of scores per sample.

        Args:
            text: integer tensor of token ids shaped (batch, seq_len).

        Returns:
            Tensor shaped (batch, output_dim).
        """
        embedded = self.embedding(text)
        _, (hidden, _) = self.rnn(embedded)
        # hidden is (num_layers, batch, hidden_dim); the last layer's final state
        # summarises the whole sequence, giving exactly one prediction per sample
        # instead of one per time step.
        return self.fc(hidden[-1])

# Instantiate the network (128-d embeddings, 128-d hidden state, padding id 0)
# together with its optimizer and loss function.
model = TextModel(
    vocab_size=vocab_size,
    embedding_dim=128,
    hidden_dim=128,
    output_dim=vocab_size,
    pad_idx=0,
)
optimizer = optim.Adam(params=model.parameters())
criterion = nn.CrossEntropyLoss()

In [None]:
# Training: one optimizer step per epoch, each computed on the whole of X.
epochs = 24
for epoch in range(epochs):
    # Clear the gradients accumulated by the previous step.
    optimizer.zero_grad()

    # Forward pass, then flatten scores and targets into the layout the loss takes.
    output = model(X)
    output_dim = output.size(-1)
    output = output.view(-1, output_dim)
    y = y.view(-1)

    # Backward pass and parameter update.
    loss = criterion(output, y)
    loss.backward()
    optimizer.step()

    print(f'Epoch: {epoch+1:02}, Loss: {loss.item():.4f}')

In [None]:
# Sentence generator definition
def sentence_generation(model, t, current_word, n, maxlen=627):
    """Greedily extend ``current_word`` by up to ``n`` predicted words.

    Args:
        model: torch module mapping a (1, seq_len) tensor of token ids to scores
            whose last dimension indexes the vocabulary.
        t: vocabulary, either a ``{word: index}`` mapping or an object exposing
            such a mapping as ``word_index``.
        current_word: seed text; its whitespace-separated words start the context.
        n: maximum number of words to append.
        maxlen: length the encoded context is left-padded with 0 (or truncated
            from the left) to before each prediction; ``None`` disables padding.

    Returns:
        The seed text followed by the generated words, separated by spaces.
        Generation stops early when the model predicts an id that has no word
        (for example the padding id) or when there is nothing to feed the model.
    """
    # Accept both a plain mapping and a tokenizer-like object.
    word_index = getattr(t, 'word_index', t)
    # Build the reverse lookup once instead of scanning the vocabulary per step.
    index_word = {index: word for word, index in word_index.items()}

    # Run on whatever device the model lives on (CPU when it has no parameters).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    init_word = current_word
    sentence = ''
    was_training = model.training
    model.eval()  # inference mode for layers such as dropout
    try:
        with torch.no_grad():
            for _ in range(n):
                # Words missing from the vocabulary are skipped.
                encoded = [word_index[w] for w in current_word.split() if w in word_index]
                if maxlen is not None:
                    encoded = encoded[-maxlen:]
                    encoded = [0] * (maxlen - len(encoded)) + encoded
                if not encoded:
                    break

                scores = model(torch.tensor([encoded], dtype=torch.long, device=device))
                # Flatten to (rows, vocab) and take the last row, so both
                # (1, vocab) and per-time-step outputs are handled.
                result = int(scores.reshape(-1, scores.shape[-1])[-1].argmax())

                word = index_word.get(result)
                if word is None:
                    break
                current_word = current_word + ' ' + word
                sentence = sentence + ' ' + word
    finally:
        # Leave the model in the mode it was found in.
        model.train(was_training)

    sentence = init_word + sentence
    return sentence

In [None]:
# Check the results: generate three words after each seed word.
# NOTE(review): `t` is not defined in any cell visible here — confirm which
# vocabulary/tokenizer object is meant to be passed.
for seed_word in ('개발', '분석', '데이터'):
    print(sentence_generation(model, t, seed_word, 3))