In [43]:
# keras module for building LSTM 
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku
# set seeds for reproducability
from numpy.random import seed
import tensorflow as tf
tf.random.set_seed(2)

seed(1)
from tensorflow.keras.utils import to_categorical
import pandas as pd
import numpy as np
import string, os 

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [44]:
import tkinter as tk

def display_generated_text(text):
    root = tk.Tk()
    text_widget = tk.Text(root)
    text_widget.insert(tk.END, text)
    text_widget.pack()
    root.mainloop()

In [45]:
import os
import pandas as pd

# 修改后的文件夹路径
curr_dir = "C:/Users/ASUS/OneDrive/桌面/cd/archive/"
all_headlines = []

# 遍历文件夹内的所有文件
for filename in os.listdir(curr_dir):
    # 检查文件名是否包含 'Articles'
    if 'Articles' in filename:
        # 读取 CSV 文件
        article_df = pd.read_csv(curr_dir + filename)
        # 将文章标题添加到列表中
        all_headlines.extend(list(article_df.headline.values))
         # 如果只想处理一个文件，就在这里终止循环

# 移除值为 "Unknown" 的标题
all_headlines = [h for h in all_headlines if h != "Unknown"]

# 输出标题数量
print(len(all_headlines))


8603


In [46]:
def clean_text(txt):
    txt = "".join(v for v in txt if v not in string.punctuation).lower()
    txt = txt.encode("utf8").decode("ascii",'ignore')
    return txt 

corpus = [clean_text(x) for x in all_headlines]
corpus[:10]

['finding an expansive view  of a forgotten people in niger',
 'and now  the dreaded trump curse',
 'venezuelas descent into dictatorship',
 'stain permeates basketball blue blood',
 'taking things for granted',
 'the caged beast awakens',
 'an everunfolding story',
 'oreilly thrives as settlements add up',
 'mouse infestation',
 'divide in gop now threatens trump tax plan']

In [47]:
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    ## tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    
    ## convert data to sequence of tokens 
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words

inp_sequences, total_words = get_sequence_of_tokens(corpus)
inp_sequences[:10]

[[391, 17],
 [391, 17, 5166],
 [391, 17, 5166, 523],
 [391, 17, 5166, 523, 4],
 [391, 17, 5166, 523, 4, 2],
 [391, 17, 5166, 523, 4, 2, 1601],
 [391, 17, 5166, 523, 4, 2, 1601, 134],
 [391, 17, 5166, 523, 4, 2, 1601, 134, 5],
 [391, 17, 5166, 523, 4, 2, 1601, 134, 5, 1951],
 [7, 57]]

In [48]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [49]:
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words, 10, input_length=input_len))
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(100))
    model.add(Dropout(0.1))
    
    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    return model

model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 23, 10)            112650    
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dropout_3 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 11265)             1137765   
Total params: 1,294,815
Trainable params: 1,294,815
Non-trainable params: 0
_________________________________________________________________


In [50]:
model.fit(predictors, label, epochs=101, verbose=2)

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.

In [None]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        probabilities = model.predict(token_list, verbose=0)

        # 使用argmax获取最可能的类别
        predicted = np.argmax(probabilities, axis=-1)
        
        
        output_word = ""
        for word,index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text

In [None]:
import tkinter as tk

def submit_text():
    global text
    user_input = entry.get()
    text = user_input  
    label.config(text="article :" + text)
    with open("test_article.txt", "w") as file:
        file.write(text)
    print("read the article success!")

window = tk.Tk()
window.geometry("500x300") 
entry = tk.Entry(window)
entry.pack()

button = tk.Button(window, text="submit", command=submit_text)
button.pack()
label = tk.Label(window)
label.pack()
window.mainloop()

In [None]:
import simi 
import sentiment_keywords as sk
print(simi.result)
midght_pos_words = ' '.join(sk.pos_words)
midght_neg_words = ' '.join(sk.neg_words)
pos_3words=sk.pos_words[:3]
neg_3words=sk.neg_words[:3]
print(f'the might positive words are : \n{midght_pos_words}')
print(f'the might negative words are : \n{midght_neg_words}')


success finding!
the most high similarity of the article is: 0.804
the might positive words are : 
wonderful enjoy excellent beautiful fresh enjoyed impressive typical national delightful worth ideal recommend grown nice romantic traditional stark imagine belief lucky spiritual challenging favorite ensemble
the might negative words are : 
bad worst terrible sad late average difficult hard tomb public dead heavy


In [None]:
import TF_idf
import sentiment_keywords as sk
output1 = ' '.join(TF_idf.top_2_keywords)
output2 = ' '.join(TF_idf.top_4_keywords)
output3 = ' '.join(TF_idf.top_6_keywords)
output4 = ' '.join(TF_idf.top_8_keywords)
output5 = ' '.join(TF_idf.top_10_keywords)
pos_sentiment = ' '.join(pos_3words)
neg_sentiment = ' '.join(neg_3words)

In [None]:
print (generate_text(f"{output1}", 6, model, max_sequence_len))
print (generate_text(f"{output2}", 6, model, max_sequence_len))
print (generate_text(f"{output3}", 6, model, max_sequence_len))
print (generate_text(f"{output4}", 6, model, max_sequence_len))
print (generate_text(f"{output5}", 6, model, max_sequence_len))

temple like trump so was us next may
city beijing a liberal party on the rise
beijing park street coulter dont dont tell empty
china sun new york city is a second
past people expect to the healthiest way to


In [None]:
sentence= generate_text(f"{output3}", 6, model, max_sequence_len)

In [None]:
import openai
from IPython.display import Markdown

openai.api_key = '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'
# MODEL_NAME = "gpt-4"
MODEL_NAME = "gpt-3.5-turbo"

def test_openai_api(question):
    rsp = openai.ChatCompletion.create(
    model = MODEL_NAME,
    messages=[
            {"role": "system", "content": ""},
            {"role": "user", "content": question}
        ]       
    )
    
    # 返回内容
    display(Markdown(rsp['choices'][0]['message']['content']))

pos_sentiment = midght_pos_words[:3]
neg_sentiment = midght_neg_words[:3]
question = f'Please write a shot comment on an article abou {text} '
test_openai_api(question)


Beijing's commitment to preserving its rich historical and cultural heritage while embracing modern development is truly impressive. The city's urban planning efforts showcase a seamless blend of ancient architecture and contemporary structures, creating a picturesque cityscape. The extensive parks and scenic spots not only enhance the city's beauty but also provide citizens and visitors with spaces for leisure and appreciation of Beijing's civilization. Furthermore, the emphasis on citizen conduct and volunteer service showcases the city's dedication to building a harmonious and inclusive society. Beijing's efforts in recruiting and training volunteers demonstrate its commitment to providing excellent service to its residents. Overall, Beijing's unique charm and commitment to civilization make it a city worth exploring and appreciating.

In [None]:
import openai
from IPython.display import Markdown
test_article = open('test_article.txt', 'r').read()
openai.api_key = "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"
response = openai.Completion.create(
    engine='text-davinci-002',
    prompt=f'Please write a comment on the article about {sentence} using words: {pos_sentiment}  ',
    max_tokens=300
)

generated_text = response.choices[0].text.strip()
display_generated_text(generated_text)

In [None]:
question = f'Please write a shot comment on an article about {sentence} using words : {neg_sentiment}'
test_openai_api(question)

This article is a bad depiction of the Beijing Park Street culture, as it doesn't provide any meaningful insights and just leaves readers with empty information.