In [52]:
import pandas as pd
import numpy as np
from string import punctuation

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

import os

In [53]:
df = pd.read_csv(f'{os.getcwd()}/data/article_contents/environment_8.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,0
0,0,All SectionsSubscribe Now 54°F \t\t\t\t\t\t\tM...
1,1,"Environment | \t\t\tEPA bans asbestos, a deadl..."
2,2,"The EPA announced Monday, March 18, a compreh..."
3,3,"AP PhotoAndrew Harnik, File\t\tBy Matthew Dal..."
4,4,"| UPDATED: March 18, 2024 at 12:15 p"


In [54]:
print('열의 개수: ', len(df.columns))
print(df.columns)

열의 개수:  2
Index(['Unnamed: 0', '0'], dtype='object')


In [55]:
print(df['0'].isnull().values.any())

False


In [56]:
headline = []
# 헤드라인의 값들을 리스트로 저장
headline.extend(list(df['0']))
headline[:]

['All SectionsSubscribe Now 54°F \t\t\t\t\t\t\tMonday, March 18th 2024\t\t\t\t\t\t\t\t\tDigital Replica Edition Home PageClose MenuNewsLatest HeadlinesColorado NewsPoliticsElection 2024Crime and Public SafetyCourtsNational NewsWorld NewsEducationHealthEnvironmentTransportationHousingNews ObituariesPhotosVideoYour HubWeatherSportsSportsSports ColumnistsDenver BroncosColorado RockiesDenver NuggetsColorado AvalancheColorado RapidsCollege SportsPrepsBettingGolfBoxing  MMASports on TVRadioSports PodcastsOlympicsBusinessBusinessReal EstateAirlinesEconomyEnergyRetailTechnologyThe KnowThe KnowFood and DrinkArtCultureMoviesTV & StreamingMusicTheaterTravelFamily FriendlyBarsBeerOutdoorsOutdoorsHikingFall ColorsCyclingFitnessRunningCampingFishingHuntingWater SportsSkiingSnowboardingWinter SportsOpinionOpinionEditorialsColumnistsLettersCartoonsEndorsementsObituariesObituariesNews ObituariesPlace an ObituaryThings To DoThings To DoEvent CalendarTelevision ListingsComicsGamesHoroscopesAsk AmyHome & 

In [57]:
print('총 샘플의 개수 : {}'.format(len(headline)))

총 샘플의 개수 : 43


In [58]:
headline = [word for word in headline if word != "Unknown"]
print("노이즈값 제거 후 샘플의 개수 : {}".format(len(headline)))

노이즈값 제거 후 샘플의 개수 : 43


In [59]:
headline[:5]

['All SectionsSubscribe Now 54°F \t\t\t\t\t\t\tMonday, March 18th 2024\t\t\t\t\t\t\t\t\tDigital Replica Edition Home PageClose MenuNewsLatest HeadlinesColorado NewsPoliticsElection 2024Crime and Public SafetyCourtsNational NewsWorld NewsEducationHealthEnvironmentTransportationHousingNews ObituariesPhotosVideoYour HubWeatherSportsSportsSports ColumnistsDenver BroncosColorado RockiesDenver NuggetsColorado AvalancheColorado RapidsCollege SportsPrepsBettingGolfBoxing  MMASports on TVRadioSports PodcastsOlympicsBusinessBusinessReal EstateAirlinesEconomyEnergyRetailTechnologyThe KnowThe KnowFood and DrinkArtCultureMoviesTV & StreamingMusicTheaterTravelFamily FriendlyBarsBeerOutdoorsOutdoorsHikingFall ColorsCyclingFitnessRunningCampingFishingHuntingWater SportsSkiingSnowboardingWinter SportsOpinionOpinionEditorialsColumnistsLettersCartoonsEndorsementsObituariesObituariesNews ObituariesPlace an ObituaryThings To DoThings To DoEvent CalendarTelevision ListingsComicsGamesHoroscopesAsk AmyHome & 

In [60]:
def repreprocessing(raw_sentence):
    preproceseed_sentence = raw_sentence.encode("utf8").decode("ascii", "ignore")
    # 구두점 제거와 동시에 소문자화
    return "".join(word for word in preproceseed_sentence if word not in punctuation).lower()

preprocessed_headline = [repreprocessing(x) for x in headline]
preprocessed_headline[:5]

['all sectionssubscribe now 54f \t\t\t\t\t\t\tmonday march 18th 2024\t\t\t\t\t\t\t\t\tdigital replica edition home pageclose menunewslatest headlinescolorado newspoliticselection 2024crime and public safetycourtsnational newsworld newseducationhealthenvironmenttransportationhousingnews obituariesphotosvideoyour hubweathersportssportssports columnistsdenver broncoscolorado rockiesdenver nuggetscolorado avalanchecolorado rapidscollege sportsprepsbettinggolfboxing  mmasports on tvradiosports podcastsolympicsbusinessbusinessreal estateairlineseconomyenergyretailtechnologythe knowthe knowfood and drinkartculturemoviestv  streamingmusictheatertravelfamily friendlybarsbeeroutdoorsoutdoorshikingfall colorscyclingfitnessrunningcampingfishinghuntingwater sportsskiingsnowboardingwinter sportsopinionopinioneditorialscolumnistsletterscartoonsendorsementsobituariesobituariesnews obituariesplace an obituarythings to dothings to doevent calendartelevision listingscomicsgameshoroscopesask amyhome  gard

In [61]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocessed_headline)
vocab_size = len(tokenizer.word_index) + 1
print("단어 집합의 크기 : %d" % vocab_size)

단어 집합의 크기 : 659


In [62]:
sequences = list()

for sentence in preprocessed_headline:
    print(sentence)
    # 각 샘플에 대한 정수 인코딩
    encoded = tokenizer.texts_to_sequences([sentence])[0] 
    for i in range(1, len(encoded)):
        sequence = encoded[:i+1]
        sequences.append(sequence)

sequences[:11]

all sectionssubscribe now 54f 							monday march 18th 2024									digital replica edition home pageclose menunewslatest headlinescolorado newspoliticselection 2024crime and public safetycourtsnational newsworld newseducationhealthenvironmenttransportationhousingnews obituariesphotosvideoyour hubweathersportssportssports columnistsdenver broncoscolorado rockiesdenver nuggetscolorado avalanchecolorado rapidscollege sportsprepsbettinggolfboxing  mmasports on tvradiosports podcastsolympicsbusinessbusinessreal estateairlineseconomyenergyretailtechnologythe knowthe knowfood and drinkartculturemoviestv  streamingmusictheatertravelfamily friendlybarsbeeroutdoorsoutdoorshikingfall colorscyclingfitnessrunningcampingfishinghuntingwater sportsskiingsnowboardingwinter sportsopinionopinioneditorialscolumnistsletterscartoonsendorsementsobituariesobituariesnews obituariesplace an obituarythings to dothings to doevent calendartelevision listingscomicsgameshoroscopesask amyhome  gardenfree and cheapca

[[72, 242],
 [72, 242, 73],
 [72, 242, 73, 243],
 [72, 242, 73, 243, 37],
 [72, 242, 73, 243, 37, 38],
 [72, 242, 73, 243, 37, 38, 108],
 [72, 242, 73, 243, 37, 38, 108, 29],
 [72, 242, 73, 243, 37, 38, 108, 29, 109],
 [72, 242, 73, 243, 37, 38, 108, 29, 109, 74],
 [72, 242, 73, 243, 37, 38, 108, 29, 109, 74, 110],
 [72, 242, 73, 243, 37, 38, 108, 29, 109, 74, 110, 111]]

In [63]:
index_to_word = {}
for key, value in tokenizer.word_index.items(): # 인덱스를 단어로 바꾸기 위해 index_to_word를 생성
    index_to_word[value] = key

print("빈도수 상위 582번 단어 : {}".format(index_to_word[582]))

빈도수 상위 582번 단어 : travelers


In [64]:
max_len = max(len(l) for l in sequences)
print("샘플의 최대 길이 : {}".format(max_len))

샘플의 최대 길이 : 320


In [65]:
sequences = pad_sequences(sequences, maxlen=max_len, padding="pre")
print(sequences[:3])

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0 

In [66]:
sequences = np.array(sequences)
X = sequences[:,:-1]
y = sequences[:,-1]

print(X[:3])

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0 

In [67]:
print(y[:3])

[242  73 243]


In [68]:
y = to_categorical(y, num_classes=vocab_size)

# Make the model

In [69]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM

In [70]:
embedding_dim = 10
hidden_units = 128

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
model.add(LSTM(hidden_units))
model.add(Dense(vocab_size, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=200, verbose=2)

Epoch 1/200


45/45 - 13s - 287ms/step - accuracy: 0.0272 - loss: 6.3503
Epoch 2/200
45/45 - 11s - 242ms/step - accuracy: 0.0307 - loss: 6.0247
Epoch 3/200
45/45 - 10s - 221ms/step - accuracy: 0.0258 - loss: 5.9614
Epoch 4/200
45/45 - 11s - 236ms/step - accuracy: 0.0321 - loss: 5.9415
Epoch 5/200
45/45 - 11s - 242ms/step - accuracy: 0.0251 - loss: 5.9177
Epoch 6/200
45/45 - 11s - 240ms/step - accuracy: 0.0328 - loss: 5.8724
Epoch 7/200
45/45 - 11s - 240ms/step - accuracy: 0.0272 - loss: 5.8164
Epoch 8/200
45/45 - 11s - 239ms/step - accuracy: 0.0342 - loss: 5.7053
Epoch 9/200
45/45 - 11s - 235ms/step - accuracy: 0.0328 - loss: 5.5608
Epoch 10/200
45/45 - 11s - 238ms/step - accuracy: 0.0293 - loss: 5.4143
Epoch 11/200
45/45 - 11s - 237ms/step - accuracy: 0.0363 - loss: 5.2752
Epoch 12/200
45/45 - 11s - 235ms/step - accuracy: 0.0432 - loss: 5.1239
Epoch 13/200
45/45 - 11s - 236ms/step - accuracy: 0.0579 - loss: 4.9612
Epoch 14/200
45/45 - 11s - 236ms/step - accuracy: 0.0662 - loss: 4.8029
Epoch 15/200


<keras.src.callbacks.history.History at 0x7f95735731f0>

In [71]:
def sentence_generation(model, tokenizer, current_word, n): # 모델, 토크나이저, 현재 단어, 반복할 횟수
    init_word = current_word
    sentence = ''

    # n번 반복
    for _ in range(n):
        encoded = tokenizer.texts_to_sequences([current_word])[0]
        encoded = pad_sequences([encoded], maxlen=max_len-1, padding="pre")

        # 입력한 X(현재 단어)에 대해서 y를 예측하고 y(예측한 단어)를 result에 저장.
        result = model.predict(encoded, verbose=0)
        result = np.argmax(result, axis=1)

        for word, index in tokenizer.word_index.items(): 
            # 만약 예측한 단어와 인덱스와 동일한 단어가 있다면
            if index == result:
                break

        # 현재 단어 + ' ' + 예측 단어를 현재 단어로 변경
        current_word = current_word + ' '  + word

        # 예측 단어를 문장에 저장
        sentence = sentence + ' ' + word

    sentence = init_word + sentence
    return sentence

In [1]:
print(sentence_generation(model, tokenizer, "We need to save our environment", 35))

NameError: name 'sentence_generation' is not defined

In [73]:
print(sentence_generation(model, tokenizer, "Trump, Donald J", 30))

Trump, Donald J the epa banned asbestos in 1989 but the rule was largely overturned by a 1991 court of appeals decision that weakened the epas authority under tsca to address risks to
