In [None]:
import pandas as pd
import re
from konlpy.tag import Komoran
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import ast
import os

# Load the articles and drop the metadata columns that the rest of the
# script never reads.
df = pd.read_csv('news.csv').drop(columns=['url', 'title', 'author', 'press'])

# The 'Synonyms' column stores a Python literal as text; parse it, then invert
# the table so that each synonym points at the 'word' of its row. When a
# synonym appears in several rows, the last row wins.
synonyms_df = pd.read_csv('Synonyms_Dict.csv')
synonyms_df['Synonyms'] = synonyms_df['Synonyms'].apply(ast.literal_eval)
synonym_to_word_map = {
    synonym: word
    for word, synonyms in zip(synonyms_df['word'], synonyms_df['Synonyms'])
    for synonym in synonyms
}

# sheet_name=None loads every sheet of the workbook into a dict keyed by
# sheet name.
dict_data = pd.read_excel('dict.xlsx', sheet_name=None)
stopwords = dict_data['불용어']['stopwords'].tolist()
hanja_sheet = dict_data['한자']
hanja_changes = dict(zip(hanja_sheet['hanja'], hanja_sheet['change']))

def preprocess_text(text):
    """Clean one raw article body into a space-separated string of tokens.

    Steps, in order:
      1. Apply every ``hanja -> change`` substitution from the module-level
         ``hanja_changes`` mapping.
      2. Delete any remaining characters in the U+4E00-U+9FFF range.
      3. Delete every character that is neither a word character nor
         whitespace.
      4. Delete lowercase ASCII letters and ASCII digits.
      5. Split on whitespace and drop tokens listed in the module-level
         ``stopwords``.

    Args:
        text: The raw text. Any non-string value (for example the NaN that
            pandas produces for a missing cell) is treated as empty text.

    Returns:
        The cleaned text with tokens joined by single spaces; ``''`` when
        nothing survives or when ``text`` is not a string.
    """
    # Missing cells arrive as float NaN / None; without this guard the
    # ``.replace`` call below raises AttributeError and aborts the whole apply.
    if not isinstance(text, str):
        return ''

    for hanja, change in hanja_changes.items():
        text = text.replace(hanja, change)

    text = re.sub(r'[\u4e00-\u9fff]+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    # NOTE(review): only lowercase letters are stripped, so uppercase ASCII
    # survives (e.g. "iPhone" becomes "P") - confirm this is intended.
    text = re.sub(r'[a-z0-9]', '', text)

    # A set makes each membership test O(1) instead of a scan of the list.
    stopword_set = set(stopwords)
    # str.split() with no argument already collapses whitespace runs and
    # ignores leading/trailing whitespace, so no separate normalisation pass
    # is needed before joining with single spaces.
    return ' '.join(word for word in text.split() if word not in stopword_set)

# Clean every article body element-wise; the result is stored alongside the
# original 'content' column.
df['processed_content'] = df['content'].map(preprocess_text)

def replace_synonyms(text, synonym_map):
    """Swap each whitespace-separated token for its mapped replacement.

    Args:
        text: Input string; it is split on any run of whitespace.
        synonym_map: Mapping from a token to the token that should replace it.
            Tokens missing from the mapping are kept unchanged. Replacement is
            a single pass, so a replaced token is not looked up again.

    Returns:
        The resulting tokens joined by single spaces.
    """
    return ' '.join(synonym_map.get(token, token) for token in text.split())

# Map synonyms onto their dictionary word; the mapping is forwarded to
# replace_synonyms as its second positional argument.
df['processed_content'] = df['processed_content'].apply(
    replace_synonyms, args=(synonym_to_word_map,)
)

# Split every cleaned article into morphemes with Komoran.
komoran = Komoran()
df['tokens'] = df['processed_content'].apply(komoran.morphs)

def extract_nouns(tokens):
    """Return the noun morphemes found in ``tokens``.

    The tokens are joined with spaces and tagged again with the module-level
    ``komoran`` instance. A morpheme is kept only when it is tagged ``NNG`` or
    ``NNP``, is at least two characters long, and is not listed in the
    module-level ``stopwords``.

    Args:
        tokens: Iterable of string tokens for one document.

    Returns:
        List of the kept morphemes, in tagging order (duplicates preserved).
    """
    tagged = komoran.pos(" ".join(tokens))

    kept = []
    for morpheme, tag in tagged:
        if tag not in ('NNG', 'NNP'):
            continue
        if len(morpheme) < 2 or morpheme in stopwords:
            continue
        kept.append(morpheme)
    return kept

# Reduce every token list to its filtered nouns.
df['nouns'] = df['tokens'].map(extract_nouns)

# Split the sentiment lexicon by the sign of 'polarity'; entries whose
# polarity is exactly 0 end up in neither set.
senti_lex = pd.read_csv('SentiWord_Dict.csv')
positive_words = set(senti_lex.loc[senti_lex['polarity'] > 0, 'word'])
negative_words = set(senti_lex.loc[senti_lex['polarity'] < 0, 'word'])

def generate_datewise_wordcloud(df, sentiment_words, title_prefix, color, top_n=20,
                                output_dir="wordclouds", font_path='NanumGothic.ttf'):
    """Write one word-cloud PNG per date from that date's sentiment words.

    For every distinct value of ``df['date']`` the noun lists of that date are
    pooled, the nouns contained in ``sentiment_words`` are counted, and the
    ``top_n`` most frequent ones are rendered. Dates with no matching noun
    produce no file.

    Args:
        df: DataFrame with a 'date' column and a 'nouns' column whose cells
            are iterables of strings.
        sentiment_words: Container of words to keep (membership-tested once
            per noun, so a set is the efficient choice).
        title_prefix: Leading part of every output file name.
        color: Colormap name passed to ``WordCloud(colormap=...)``.
        top_n: Maximum number of words drawn per cloud.
        output_dir: Directory that receives the PNG files; created if missing.
        font_path: Font file passed to ``WordCloud(font_path=...)``.

    Returns:
        None. Files are written to
        ``<output_dir>/<title_prefix>_WordCloud_<date>.png``.
    """
    # The function must own its output directory: relying on a module-level
    # ``output_dir`` raises NameError whenever that global is not defined.
    os.makedirs(output_dir, exist_ok=True)

    for date, noun_lists in df.groupby('date')['nouns']:
        word_counts = Counter(
            word
            for nouns in noun_lists
            for word in nouns
            if word in sentiment_words
        )
        top_words = dict(word_counts.most_common(top_n))
        if not top_words:
            # Nothing to draw for this date.
            continue

        wc = WordCloud(
            font_path=font_path,
            background_color="white",
            colormap=color,
            width=700,
            height=500,
            prefer_horizontal=1.0,
        )
        wc.generate_from_frequencies(top_words)
        wc.to_file(os.path.join(output_dir, f"{title_prefix}_WordCloud_{date}.png"))

# Render the per-date clouds: positive words first, then negative words.
generate_datewise_wordcloud(
    df,
    sentiment_words=positive_words,
    title_prefix="Positive",
    color="Blues",
)

generate_datewise_wordcloud(
    df,
    sentiment_words=negative_words,
    title_prefix="Negative",
    color="Reds",
)