# In [None]:
import pandas as pd
import re
from konlpy.tag import Komoran
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import os

# Source tables: the news articles and the list of company names.
news_data = pd.read_csv('news.csv')
companies_data = pd.read_csv('company_names.csv')

# sheet_name=None makes read_excel return a dict of DataFrames keyed by
# sheet name, so both dictionary sheets are loaded with a single read.
dict_data = pd.read_excel('dict.xlsx', sheet_name=None)

# Stopword list taken from the 'stopwords' column of the '불용어' sheet.
stopword_sheet = dict_data['불용어']
stopwords = stopword_sheet['stopwords'].tolist()

# Mapping of each 'hanja' entry to its 'change' replacement ('한자' sheet).
hanja_sheet = dict_data['한자']
hanja_changes = hanja_sheet.set_index('hanja')['change'].to_dict()

def preprocess_text(text):
    """Normalize one article body into space-separated tokens.

    Steps, in order: apply every replacement in the module-level
    ``hanja_changes`` mapping, delete any remaining characters in the
    U+4E00-U+9FFF range, turn punctuation and lower-case ASCII letters /
    digits into spaces, collapse whitespace, and drop whitespace-separated
    tokens that appear in the module-level ``stopwords``.

    Args:
        text: Raw article text. Non-string values (for example the float
            NaN pandas produces for an empty CSV cell) are treated as
            empty text instead of raising.

    Returns:
        The cleaned text as a single string; ``''`` when nothing is left
        or when ``text`` is not a string.
    """
    # Missing cells arrive as float NaN; without this guard the first
    # ``text.replace`` below would raise AttributeError.
    if not isinstance(text, str):
        return ''

    for hanja, change in hanja_changes.items():
        text = text.replace(hanja, change)

    # Remove whatever CJK ideographs the mapping above did not replace.
    text = re.sub(r'[\u4e00-\u9fff]+', '', text)
    # Anything that is neither a word character nor whitespace becomes a space.
    text = re.sub(r'[^\w\s]', ' ', text)
    # Only lower-case ASCII letters and digits are blanked; upper-case
    # letters are left in place by this pattern.
    text = re.sub(r'[a-z0-9]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # Set lookup instead of scanning the stopword list once per token.
    stopword_set = set(stopwords)
    return ' '.join(word for word in text.split() if word not in stopword_set)

# Clean every article body with preprocess_text.
news_data['processed_content'] = news_data['content'].apply(preprocess_text)

# Extract nouns from the cleaned text, keeping only those longer than one
# character.
komoran = Komoran()
news_data['nouns'] = news_data['processed_content'].apply(
    lambda x: [noun for noun in komoran.nouns(x) if len(noun) > 1]
)

company_names = companies_data['name'].tolist()

font_path = 'NanumGothic.ttf'
# Register the local font file with matplotlib's font manager. Without this
# the family name written to rcParams below can only be resolved when the
# font happens to be installed system-wide.
fm.fontManager.addfont(font_path)
font_prop = fm.FontProperties(fname=font_path, size=12)
plt.rcParams['font.family'] = font_prop.get_name()

def visualize_keyword_network_by_date(company_name, news_data, output_dir):
    """Draw one keyword graph per date for a company and save each as a PNG.

    For every distinct ``date`` in ``news_data``, the rows whose
    ``processed_content`` contains ``company_name`` are selected, their
    ``nouns`` lists are pooled, and the 7 most frequent nouns are each
    connected to the company node. Dates without a matching row are skipped
    silently; dates with fewer than 3 distinct nouns print ``Skipping``.

    Args:
        company_name: Company name, matched as a literal substring (not as
            a regular expression).
        news_data: DataFrame with ``date``, ``processed_content`` and
            ``nouns`` columns; ``nouns`` holds an iterable of strings per row.
        output_dir: Directory that receives the images; created if missing.

    Returns:
        None. Each image is written to
        ``<output_dir>/<company_name>_<date>.png``.
    """
    os.makedirs(output_dir, exist_ok=True)

    for date, group in news_data.groupby('date'):
        # regex=False keeps names containing regex metacharacters (e.g.
        # parentheses) from being misread as patterns; na=False treats
        # missing text as "no mention" instead of poisoning the mask.
        mentions = group['processed_content'].str.contains(
            company_name, regex=False, na=False
        )
        if not mentions.any():
            continue

        all_nouns = [
            noun
            for nouns in group.loc[mentions, 'nouns']
            for noun in nouns
        ]
        noun_counts = pd.Series(all_nouns).value_counts()
        top_nouns = noun_counts.head(7).index.tolist()
        if len(top_nouns) < 3:
            print("Skipping")
            continue

        # At least 3 distinct nouns and at most one of them equals the
        # company name, so the graph always ends up with two or more edges.
        G = nx.Graph()
        for noun in top_nouns:
            # The company is the hub node; skipping it avoids a self-loop.
            if noun != company_name:
                G.add_edge(company_name, noun, weight=noun_counts[noun])

        fig = plt.figure(figsize=(10, 10))
        try:
            pos = nx.spring_layout(G, k=0.2)

            # Highlight the company node; every keyword shares one colour.
            node_colors = [
                '#F78C7B' if node == company_name else '#FAE18F'
                for node in G.nodes
            ]

            nx.draw_networkx_nodes(G, pos, node_color=node_colors, node_size=4000, alpha=0.9)
            nx.draw_networkx_edges(G, pos, edge_color='#D9D9D9', width=1.0, alpha=0.5)

            # Labels are drawn by hand so the module-level font_prop is used.
            for node, (x, y) in pos.items():
                plt.text(
                    x, y, node, fontsize=14, fontweight='bold', color='black',
                    fontproperties=font_prop,
                    horizontalalignment='center', verticalalignment='center'
                )

            plt.axis('off')

            # NOTE(review): date is interpolated verbatim into the file name;
            # assumes it contains no path separators -- confirm.
            output_path = os.path.join(output_dir, f'{company_name}_{date}.png')
            plt.savefig(output_path, format='png', bbox_inches='tight', dpi=300)
        finally:
            # Always release the figure, even when drawing or saving fails.
            plt.close(fig)
        print(f"[{date}] Saved: {output_path}")

# Render the per-date keyword networks for every company in the list.
output_directory = './keyword_networks'
for company_name in company_names:
    visualize_keyword_network_by_date(
        company_name=company_name,
        news_data=news_data,
        output_dir=output_directory,
    )