In [1]:
pip install pandas scikit-learn matplotlib

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import matplotlib as mpl

# Font setup: use a Hangul-capable font so the Korean labels render, and draw the
# minus sign as an ASCII hyphen instead of the Unicode minus glyph.
# NOTE(review): 'Malgun Gothic' is assumed to be installed on the machine running this notebook.
mpl.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})

# Files to analyse: display name -> CSV path (paths are relative to the working directory).
file_paths = dict([
    ("이준석", "lee_junseok_tagged_filtered_ver2.csv"),
    ("이재명", "lee_jaemyung_tagged_filtered.csv"),
    ("김문수", "kim_moonsu_tagged_filtered.csv"),
])


# Maps each display name to one long string containing every extracted word from that file.
documents = {}

# For every file, strip the trailing "(...)" annotation from each token of the 'tagged'
# column and merge all sentences into a single document per name.
# NOTE(review): tokens are presumed to look like "word(TAG)" — confirm against the CSVs.
# No stop-word filtering happens here; the file names suggest it was done upstream.
for name, path in file_paths.items():
    df = pd.read_csv(path)
    docs = []
    for sentence in df['tagged'].dropna():
        tokens = []
        for token in sentence.split():
            # Only tokens carrying a parenthesised annotation are kept.
            if '(' in token and ')' in token:
                # Split at the LAST "(" so parentheses inside the word itself survive.
                word = token.rsplit("(", 1)[0]
                # Previously `word` was computed and then discarded, leaving every
                # document empty. Skip tokens that are nothing but an annotation.
                if word:
                    tokens.append(word)
        docs.append(" ".join(tokens))
    documents[name] = " ".join(docs)

# TF-IDF analysis: each name's merged text is one document in the corpus.
names = list(documents)
corpus = list(documents.values())

# Keep only the 100 terms with the highest corpus-wide term frequency.
# NOTE(review): TfidfVectorizer's default token_pattern only keeps tokens of two or more
# word characters, so one-syllable words are dropped — confirm this is intended.
vectorizer = TfidfVectorizer(max_features=100)
X = vectorizer.fit_transform(corpus)

# Dense table: one row per name, one column per term.
df_tfidf = pd.DataFrame(
    data=X.toarray(),
    index=names,
    columns=vectorizer.get_feature_names_out(),
)

# Visualization: one horizontal bar chart per row of df_tfidf, showing that row's
# ten highest-scoring terms.
n_panels = len(df_tfidf.index)

# The y-axis is deliberately NOT shared: each panel has its own set of terms, and a shared
# categorical axis would merge all of them into one axis labelled only on the first panel.
# squeeze=False keeps `axes` an array even when there is a single panel.
fig, axes = plt.subplots(1, n_panels, figsize=(6 * n_panels, 6), squeeze=False)
axes = axes.ravel()

for ax, name in zip(axes, df_tfidf.index):
    top_terms = df_tfidf.loc[name].sort_values(ascending=False).head(10)
    # barh draws the first entry at the bottom, so reverse to put the top score first.
    ax.barh(top_terms.index[::-1], top_terms.values[::-1])
    ax.set_title(f"{name} - TF-IDF 상위 단어")
    ax.set_xlabel("TF-IDF 점수")

plt.tight_layout()
plt.show()


KeyboardInterrupt: 