In [4]:
import pandas as pd
import numpy as np
from string import punctuation

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

import os

In [5]:
df = pd.read_csv(f'{os.getcwd()}/data/article_contents/environment_24.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,0
0,0,Skip to primary navigation Skip to main conte...
1,1,Please donate or sponsor today
2,2,Coastal ReviewA Daily News Service of the Nort...
3,3,Marine FisheriesStripped away: Wetlands left ...
4,4,Photo: Jennifer AllenThe state’s top environm...


In [6]:
print('열의 개수: ', len(df.columns))
print(df.columns)

열의 개수:  2
Index(['Unnamed: 0', '0'], dtype='object')


In [7]:
print(df['0'].isnull().values.any())

False


In [8]:
headline = []
# 헤드라인의 값들을 리스트로 저장
headline.extend(list(df['0']))
headline[:]

[' Skip to primary navigation Skip to main content Skip to footer We can’t do this without you! Enjoying Coastal Review is free',
 ' Please donate or sponsor today',
 'Coastal ReviewA Daily News Service of the North Carolina Coastal FederationMENUMENUHomeNewsTopicsNews & FeaturesBeach & Inlet ManagementClimate ChangeCoastal PolicyEducationEnergyHabitat RestorationLegislatureNews BriefsPublic HealthStormwaterTerminal GroinsTransportationRecent HeadlinesBiser urges environmental commission to hear PFAS rulesAnglers: Reporting law puts burden on them, unenforceableBiden commits $3B to replace lead water pipes nationwideCommission adopts amended rule for Jockey’s RidgeOregon Inlet Fishing Center cuts ribbon for new buildingMore ArticlesOur CoastTopicsOur CoastCulture & HistoryFoodPeoplePlacesWildlife & NatureRecent HeadlinesHatteras museum to reopen, Beaufort boat show ahead‘Save Our Sand Dunes’ recalls fight to save Jockey’s RidgeGarden tips everyone in coastal North Carolina should knowS

In [9]:
print('총 샘플의 개수 : {}'.format(len(headline)))

총 샘플의 개수 : 55


In [10]:
headline = [word for word in headline if word != "Unknown"]
print("노이즈값 제거 후 샘플의 개수 : {}".format(len(headline)))

노이즈값 제거 후 샘플의 개수 : 55


In [11]:
headline[:5]

[' Skip to primary navigation Skip to main content Skip to footer We can’t do this without you! Enjoying Coastal Review is free',
 ' Please donate or sponsor today',
 'Coastal ReviewA Daily News Service of the North Carolina Coastal FederationMENUMENUHomeNewsTopicsNews & FeaturesBeach & Inlet ManagementClimate ChangeCoastal PolicyEducationEnergyHabitat RestorationLegislatureNews BriefsPublic HealthStormwaterTerminal GroinsTransportationRecent HeadlinesBiser urges environmental commission to hear PFAS rulesAnglers: Reporting law puts burden on them, unenforceableBiden commits $3B to replace lead water pipes nationwideCommission adopts amended rule for Jockey’s RidgeOregon Inlet Fishing Center cuts ribbon for new buildingMore ArticlesOur CoastTopicsOur CoastCulture & HistoryFoodPeoplePlacesWildlife & NatureRecent HeadlinesHatteras museum to reopen, Beaufort boat show ahead‘Save Our Sand Dunes’ recalls fight to save Jockey’s RidgeGarden tips everyone in coastal North Carolina should knowS

In [12]:
def repreprocessing(raw_sentence):
    preproceseed_sentence = raw_sentence.encode("utf8").decode("ascii", "ignore")
    # 구두점 제거와 동시에 소문자화
    return "".join(word for word in preproceseed_sentence if word not in punctuation).lower()

preprocessed_headline = [repreprocessing(x) for x in headline]
preprocessed_headline[:5]

[' skip to primary navigation skip to main content skip to footer we cant do this without you enjoying coastal review is free',
 ' please donate or sponsor today',
 'coastal reviewa daily news service of the north carolina coastal federationmenumenuhomenewstopicsnews  featuresbeach  inlet managementclimate changecoastal policyeducationenergyhabitat restorationlegislaturenews briefspublic healthstormwaterterminal groinstransportationrecent headlinesbiser urges environmental commission to hear pfas rulesanglers reporting law puts burden on them unenforceablebiden commits 3b to replace lead water pipes nationwidecommission adopts amended rule for jockeys ridgeoregon inlet fishing center cuts ribbon for new buildingmore articlesour coasttopicsour coastculture  historyfoodpeopleplaceswildlife  naturerecent headlineshatteras museum to reopen beaufort boat show aheadsave our sand dunes recalls fight to save jockeys ridgegarden tips everyone in coastal north carolina should knowsunset beach a 

In [13]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocessed_headline)
vocab_size = len(tokenizer.word_index) + 1
print("단어 집합의 크기 : %d" % vocab_size)

단어 집합의 크기 : 735


In [14]:
sequences = list()

for sentence in preprocessed_headline:
    print(sentence)
    # 각 샘플에 대한 정수 인코딩
    encoded = tokenizer.texts_to_sequences([sentence])[0] 
    for i in range(1, len(encoded)):
        sequence = encoded[:i+1]
        sequences.append(sequence)

sequences[:11]

 skip to primary navigation skip to main content skip to footer we cant do this without you enjoying coastal review is free
 please donate or sponsor today
coastal reviewa daily news service of the north carolina coastal federationmenumenuhomenewstopicsnews  featuresbeach  inlet managementclimate changecoastal policyeducationenergyhabitat restorationlegislaturenews briefspublic healthstormwaterterminal groinstransportationrecent headlinesbiser urges environmental commission to hear pfas rulesanglers reporting law puts burden on them unenforceablebiden commits 3b to replace lead water pipes nationwidecommission adopts amended rule for jockeys ridgeoregon inlet fishing center cuts ribbon for new buildingmore articlesour coasttopicsour coastculture  historyfoodpeopleplaceswildlife  naturerecent headlineshatteras museum to reopen beaufort boat show aheadsave our sand dunes recalls fight to save jockeys ridgegarden tips everyone in coastal north carolina should knowsunset beach a sweet spot

[[98, 2],
 [98, 2, 127],
 [98, 2, 127, 231],
 [98, 2, 127, 231, 98],
 [98, 2, 127, 231, 98, 2],
 [98, 2, 127, 231, 98, 2, 232],
 [98, 2, 127, 231, 98, 2, 232, 233],
 [98, 2, 127, 231, 98, 2, 232, 233, 98],
 [98, 2, 127, 231, 98, 2, 232, 233, 98, 2],
 [98, 2, 127, 231, 98, 2, 232, 233, 98, 2, 234],
 [98, 2, 127, 231, 98, 2, 232, 233, 98, 2, 234, 36]]

In [15]:
index_to_word = {}
for key, value in tokenizer.word_index.items(): # 인덱스를 단어로 바꾸기 위해 index_to_word를 생성
    index_to_word[value] = key

print("빈도수 상위 582번 단어 : {}".format(index_to_word[5]))

빈도수 상위 582번 단어 : water


In [16]:
max_len = max(len(l) for l in sequences)
print("샘플의 최대 길이 : {}".format(max_len))

샘플의 최대 길이 : 178


In [17]:
sequences = pad_sequences(sequences, maxlen=max_len, padding="pre")
print(sequences[:3])

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0  98   2]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0

In [18]:
sequences = np.array(sequences)
X = sequences[:,:-1]
y = sequences[:,-1]

print(X[:3])

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0  98]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0

In [19]:
print(y[:3])

[  2 127 231]


In [20]:
y = to_categorical(y, num_classes=vocab_size)

# Make the model

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM

In [22]:
embedding_dim = 10
hidden_units = 128

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
model.add(LSTM(hidden_units))
model.add(Dense(vocab_size, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=200, verbose=2)

Epoch 1/200


KeyboardInterrupt: 

In [None]:
model.save("LSTM.keras")

In [23]:
import tensorflow as tf
model = tf.keras.models.load_model("./model/LSTM.keras")

In [24]:
def sentence_generation(model, tokenizer, current_word, n): # 모델, 토크나이저, 현재 단어, 반복할 횟수
    init_word = current_word
    sentence = ''

    # n번 반복
    for _ in range(n):
        encoded = tokenizer.texts_to_sequences([current_word])[0]
        encoded = pad_sequences([encoded], maxlen=max_len-1, padding="pre")

        # 입력한 X(현재 단어)에 대해서 y를 예측하고 y(예측한 단어)를 result에 저장.
        result = model.predict(encoded, verbose=0)
        result = np.argmax(result, axis=1)

        for word, index in tokenizer.word_index.items(): 
            # 만약 예측한 단어와 인덱스와 동일한 단어가 있다면
            if index == result:
                break

        # 현재 단어 + ' ' + 예측 단어를 현재 단어로 변경
        current_word = current_word + ' '  + word

        # 예측 단어를 문장에 저장
        sentence = sentence + ' ' + word

    sentence = init_word + sentence
    return sentence

In [27]:
print(sentence_generation(model, tokenizer, "I am Kimminsup.", 35))

I am Kimminsup. to freelearn how you can be in the sponsor spotlightbisers letter to the commission was announced thursday along with her response to a letter from the north carolina chamber president dated april 22 and asking


In [26]:
print(sentence_generation(model, tokenizer, "Trump, Donald J", 30))

Trump, Donald J to freelearn how you can be in the sponsor spotlightbisers letter to the commission was announced thursday along with her response to a letter from the north carolina chamber president
