In [None]:
!pip install --upgrade --force-reinstall numpy
!pip install bertopic gensim scikit-learn sentence-transformers tqdm openpyxl

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from bertopic import BERTopic
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from sentence_transformers import SentenceTransformer
from google.colab import files

In [None]:
from google.colab import files
uploaded = files.upload()

import io
import pandas as pd

# Fail loudly when nothing was uploaded (e.g. the dialog was cancelled).
# Otherwise `df` would stay undefined, or keep a stale value from an earlier
# run, and a later cell would fail with a much less obvious error.
if not uploaded:
    raise ValueError("업로드된 파일이 없습니다. .xlsx 또는 .csv 파일을 업로드하세요.")

# Parse the uploaded bytes into `df`, choosing the reader by file extension.
# When several files are uploaded each one overwrites `df`, so the last wins.
for filename in uploaded:
    # Compare case-insensitively so "DATA.XLSX" / "data.CSV" are accepted too.
    lowered_name = filename.lower()
    if lowered_name.endswith(".xlsx"):
        df = pd.read_excel(io.BytesIO(uploaded[filename]))
    elif lowered_name.endswith(".csv"):
        df = pd.read_csv(io.BytesIO(uploaded[filename]))
    else:
        raise ValueError("지원되지 않는 파일 형식입니다. .xlsx 또는 .csv를 사용하세요.")

In [None]:
# Tokens that clean_text() removes. Matching is done on exact whitespace-split
# tokens, so an entry only matches when the whole token equals it.
# NOTE(review): '그떄' looks like a misspelling of '그때' (which is also listed);
# it is kept because it may be matching the same typo in the data.
stopwords = {
    '그', '이', '저', '것', '등', '때', '중', '누구', '무엇', '하다', '되다', '있다', '없다', '받다',
    '가다', '오다', '보다', '주다', '말하다', '들다', '계속', '이제', '지금', '예전', '요즘', '거의',
    '좀', '많이', '더', '다시', '별로', '그리고', '그래서', '그런데', '하지만', '그러니까', '때문',
    '이런', '그런', '어떤', '같다', '나', '내', '너', '우리', '당신', '그녀', '그들', '사람', '모두',
    '여기', '저기', '거기', '안', '밖', '위', '아래', '그곳', '방향', '도', '는', '만', '과', '와',
    '보다', '까지', '부터', '으로', '에게', '이랑', '밖에', '조차', '정도', '경우', '내용', '문제',
    '이야기', '모습', '상황', '자신', '음', '응', '오', '어휴', '허', '아이고', '에휴', '웃음',
    '그떄', '그다음', '아마', '...', '…', '“', '”', '‘', '’', '-', '--', '―', '그냥', '진짜', '완전',
    '약간', '어떻게', '뭐', '딱', '막', '또', '또는', '해도', '생각', '중략', '가지', '조금',
    '다문화', '중국', '일본', '한국', '우즈베키스탄', '베트남', '태국', '몽골', '몽고',
    '자기', '그때', '그거', '수도', '그게', '여러', '무슨', '네네', '나이', '어디', '먼저', '대부분',
    '나중', '대해', '그것', '뭔가', '전혀', '저희', '만약', '이주', '나가야', '다른', '항상',
    '얘기',
}

# Maps a variant token (key) to the single canonical token (value) that
# replaces it before stopword filtering.
standardization_dict = {
    "아빠": "남편",
    "여자": "여성",
    "아기": "아이",
    "한국말": "한국어",
    "시엄마": "시어머니",
    "어머니": "시어머니",
    "대한민국": "한국",
    "외국": "외국인",
    "외국사람": "외국인",
    "차별적": "차별",
    "무시당함": "무시",
    "무시함": "무시",
    "편견들": "편견",
    "회사": "직장",
    "업무": "직장",
    "일": "직장",
    "회사생활": "직장",
    "말": "언어",
    "한국사람": "한국인",
}


def clean_text(text):
    """Standardize and stopword-filter a whitespace-tokenized string.

    The text is split on whitespace; each token is replaced by its entry in
    ``standardization_dict`` when one exists, and the (possibly replaced)
    token is then discarded if it is in ``stopwords``.

    Args:
        text: The string to clean.

    Returns:
        The surviving tokens joined by single spaces ("" if none survive).
    """
    canonical_tokens = (
        standardization_dict.get(token, token) for token in text.split()
    )
    return " ".join(
        token for token in canonical_tokens if token not in stopwords
    )

In [None]:
# Cleaned documents: every non-null value of the "원자료" column, cast to str
# and passed through clean_text(), in the column's original order.
docs_cleaned = list(
    map(clean_text, df["원자료"].dropna().astype(str).tolist())
)

In [None]:
def preprocess_text(text, stopwords, standardization_dict):
    """Split ``text`` into standardized, stopword-free tokens.

    Note: a later cell in this notebook defines another ``preprocess_text``
    that takes only ``text`` and returns a joined string; whichever cell ran
    last determines which version the name refers to.

    Args:
        text: The string to tokenize (split on whitespace).
        stopwords: Container of tokens to discard (checked after replacement).
        standardization_dict: Mapping from a token to its canonical form.

    Returns:
        A list of the surviving tokens, in their original order.
    """
    normalized = (
        standardization_dict.get(token, token) for token in text.split()
    )
    return [token for token in normalized if token not in stopwords]

# Token lists for gensim: each already-cleaned document is run through
# preprocess_text() to obtain a list of tokens.
docs_tokenized = []
for doc in docs_cleaned:
    docs_tokenized.append(preprocess_text(doc, stopwords, standardization_dict))

from gensim.corpora import Dictionary
# gensim Dictionary built from the tokenized documents; passed to
# CoherenceModel further down.
id2word = Dictionary(documents=docs_tokenized)

In [None]:
# Candidate numbers of topics to evaluate: 3 through 20 inclusive.
topic_nums = list(range(3, 21))

# One record per candidate topic count; converted to a DataFrame after the loop.
results = []

embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# The document embeddings do not depend on the topic count, so encode the
# corpus once and hand the same matrix to every fit instead of re-encoding
# all documents on each iteration.
doc_embeddings = embedding_model.encode(docs_cleaned)

for n in tqdm(topic_nums):
    # A fresh model is fitted for every candidate count.
    # NOTE(review): no random seed is set for the model's internals, so the
    # metrics may vary between runs — confirm whether that is acceptable.
    topic_model = BERTopic(
        embedding_model=embedding_model,
        language="multilingual",
        calculate_probabilities=False,
        low_memory=True,
        n_gram_range=(1, 3)
    )

    topics, probs = topic_model.fit_transform(docs_cleaned, embeddings=doc_embeddings)

    # Keep using the fitted model object itself rather than whatever
    # reduce_topics() returns, so the code does not depend on the return value.
    topic_model.reduce_topics(docs_cleaned, nr_topics=n)
    reduced_model = topic_model  # name kept for anyone inspecting it afterwards

    # Collect the keyword lists for topic ids 0..n-1; ids for which get_topic()
    # gives back a falsy value are dropped.
    keywords = [reduced_model.get_topic(i) for i in range(n)]
    keywords = [topic for topic in keywords if topic]
    topic_words = [[word for word, _ in topic] for topic in keywords]

    # Topic separation: 1 - mean pairwise cosine similarity of topic vectors.
    # NOTE(review): topic_embeddings_ may also contain a row for the outlier
    # topic (-1), which the keyword lists above exclude — confirm whether that
    # row should be left out of this average as well.
    topic_reprs = reduced_model.topic_embeddings_
    sim_matrix = cosine_similarity(topic_reprs)
    upper_tri = sim_matrix[np.triu_indices_from(sim_matrix, k=1)]
    # With fewer than two topic vectors there are no pairs to average; record
    # NaN explicitly instead of taking the mean of an empty array.
    similarity = np.mean(upper_tri) if upper_tri.size else np.nan
    cosine_distance = 1 - similarity

    # c_v coherence of the topic keywords against the tokenized corpus.
    # Skip the computation when no topic keywords were found for this n.
    if topic_words:
        cm = CoherenceModel(
            topics=topic_words,
            texts=docs_tokenized,
            dictionary=id2word,
            coherence='c_v',
            processes=1)
        coherence = cm.get_coherence()
    else:
        coherence = np.nan

    results.append({
        "토픽수": n,
        "코사인거리(Cosine Distance)": cosine_distance,
        "일관성(Coherence)": coherence
    })

results_df = pd.DataFrame(results)
results_df

In [None]:
from google.colab import files

# Write the per-topic-count metrics table to an Excel file (without the row
# index) and pass that file to Colab's files.download().
output_path = "/content/토픽수_코사인거리_코히런스결과.xlsx"
results_df.to_excel(output_path, index=False)
files.download(output_path)

In [None]:
!pip install -q sentence-transformers scikit-learn openpyxl

from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files
import io

uploaded = files.upload()

# Fail loudly when nothing was uploaded (e.g. the dialog was cancelled).
# Otherwise `df` would stay undefined, or keep the value from an earlier cell,
# and the clustering below would silently run on the wrong data.
if not uploaded:
    raise ValueError("업로드된 파일이 없습니다.")

# Parse the uploaded bytes into `df`, choosing the reader by file extension.
# When several files are uploaded each one overwrites `df`, so the last wins.
for filename in uploaded:
    # Compare case-insensitively so "DATA.XLSX" / "data.CSV" are accepted too.
    lowered_name = filename.lower()
    if lowered_name.endswith(".xlsx"):
        df = pd.read_excel(io.BytesIO(uploaded[filename]))
    elif lowered_name.endswith(".csv"):
        df = pd.read_csv(io.BytesIO(uploaded[filename]))
    else:
        raise ValueError("지원되지 않는 파일 형식입니다.")

# Tokens that preprocess_text() removes. Matching is done on exact
# whitespace-split tokens. Compared with the stopword set defined in the
# earlier cell, this one additionally lists '나라', '부분' and '선생님'.
stopwords = {
    '그', '이', '저', '것', '등', '때', '중', '누구', '무엇', '하다', '되다', '있다', '없다', '받다',
    '가다', '오다', '보다', '주다', '말하다', '들다', '계속', '이제', '지금', '예전', '요즘', '거의',
    '좀', '많이', '더', '다시', '별로', '그리고', '그래서', '그런데', '하지만', '그러니까', '때문',
    '이런', '그런', '어떤', '같다', '나', '내', '너', '우리', '당신', '그녀', '그들', '사람', '모두',
    '여기', '저기', '거기', '안', '밖', '위', '아래', '그곳', '방향', '도', '는', '만', '과', '와',
    '보다', '까지', '부터', '으로', '에게', '이랑', '밖에', '조차', '정도', '경우', '내용', '문제',
    '이야기', '모습', '상황', '자신', '음', '응', '오', '어휴', '허', '아이고', '에휴', '웃음',
    '그떄', '그다음', '아마', '...', '…', '“', '”', '‘', '’', '-', '--', '―', '그냥', '진짜', '완전',
    '약간', '어떻게', '뭐', '딱', '막', '또', '또는', '해도', '생각', '중략', '가지', '조금',
    '다문화', '중국', '일본', '한국', '우즈베키스탄', '베트남', '태국', '몽골', '몽고',
    '자기', '그때', '그거', '수도', '그게', '여러', '무슨', '네네', '나이', '어디', '먼저', '대부분',
    '나중', '대해', '그것', '뭔가', '전혀', '저희', '만약', '이주', '나가야', '다른', '항상',
    '얘기', '나라', '부분', '선생님',
}

# Maps a variant token (key) to the single canonical token (value) that
# replaces it before stopword filtering.
standardization_dict = {
    "아빠": "남편",
    "여자": "여성",
    "아기": "아이",
    "한국말": "한국어",
    "시엄마": "시어머니",
    "어머니": "시어머니",
    "대한민국": "한국",
    "외국": "외국인",
    "외국사람": "외국인",
    "차별적": "차별",
    "무시당함": "무시",
    "무시함": "무시",
    "편견들": "편견",
    "회사": "직장",
    "업무": "직장",
    "일": "직장",
    "회사생활": "직장",
    "말": "언어",
    "한국사람": "한국인",
}


def preprocess_text(text):
    """Standardize and stopword-filter a whitespace-tokenized string.

    Note: this one-argument definition replaces the three-argument
    ``preprocess_text`` defined in an earlier cell, and unlike that version
    it returns a joined string rather than a list of tokens.

    Args:
        text: The string to clean.

    Returns:
        The surviving tokens joined by single spaces ("" if none survive).
    """
    canonical_tokens = (
        standardization_dict.get(token, token) for token in text.split()
    )
    return " ".join(
        token for token in canonical_tokens if token not in stopwords
    )

# Cleaned text of every non-null "원자료" entry (cast to str), in row order.
sentences = list(
    map(preprocess_text, df["원자료"].dropna().astype(str).tolist())
)

# Sentence embeddings for the cleaned texts.
model = SentenceTransformer("distiluse-base-multilingual-cased-v2")
embeddings = model.encode(sentences)

# KMeans on the embeddings; the seed is fixed via random_state.
num_topics = 7
kmeans = KMeans(n_clusters=num_topics, random_state=42)
labels = kmeans.fit_predict(embeddings)

# Keep only the rows that produced a sentence above, so `labels` lines up
# with the frame row-for-row, then attach the cluster id.
df = df[df["원자료"].notna()].copy()
df["토픽번호"] = labels

# 2-D PCA projection of the embeddings, stored as plotting coordinates.
pca = PCA(n_components=2)
reduced = pca.fit_transform(embeddings)
df["x"], df["y"] = reduced[:, 0], reduced[:, 1]

# Scatter plot of the 2-D PCA projection, one series per KMeans cluster,
# drawn in ascending cluster-id order.
plt.figure(figsize=(8, 6))
for label in sorted(set(labels)):
    subset = df[df["토픽번호"] == label]
    plt.scatter(subset["x"], subset["y"], label=f"Topic {label}")
# Take the count from num_topics so the title cannot drift out of sync with
# the clustering setting (identical text to before when num_topics is 7).
plt.title(f"BERT 임베딩 + KMeans 클러스터링 (토픽 수: {num_topics})")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.legend()
plt.grid(True)
plt.show()

# Write the labelled frame (original columns plus 토픽번호, x, y) to Excel
# without the row index and pass the file to Colab's files.download().
# The file name embeds num_topics instead of a hard-coded "7" so it always
# matches the clustering that produced it (same name as before when it is 7).
output_path = f"/content/최종_토픽_분석결과_{num_topics}개.xlsx"
df.to_excel(output_path, index=False)
files.download(output_path)

In [None]:
from google.colab import files
uploaded = files.upload()

import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import matplotlib as mpl

# Register the font file with matplotlib and look up its family name.
# NOTE(review): the path is fixed, so this presumably expects the file
# uploaded above to be named NanumBarunGothic.ttf and to land in /content —
# confirm.
font_path = "/content/NanumBarunGothic.ttf"
fm.fontManager.addfont(font_path)
font_name = fm.FontProperties(fname=font_path).get_name()

# Use that family as the default font and turn off the Unicode minus sign
# for axis tick labels.
plt.rcParams['font.family'] = font_name
mpl.rc('axes', unicode_minus=False)

In [None]:
import matplotlib.pyplot as plt

# Human-readable legend name for each cluster id.
topic_labels = {
    0: "외모/출신 국가에 대한 편견",
    1: "교육장면에서 발생하는 차별",
    2: "언어에 대한 차별",
    3: "무시와 인격적 경시",
    4: "사회적 배제와 소외시킴",
    5: "제도적 낙인과 의심",
    6: "성역할 고정관념"
}

plt.figure(figsize=(10, 8))
# Iterate in ascending cluster-id order (as the unlabelled plot does) so the
# legend order and colour assignment are stable instead of depending on which
# cluster happens to appear first in the data.
for topic_num in sorted(df['토픽번호'].unique()):
    subset = df[df['토픽번호'] == topic_num]
    # Fall back to a generic name for cluster ids that have no custom label,
    # rather than failing with KeyError when the cluster count is changed.
    legend_label = topic_labels.get(topic_num, f"Topic {topic_num}")
    plt.scatter(subset['x'], subset['y'], label=legend_label, alpha=0.7)

plt.title("BERT 임베딩 + KMeans 클러스터링 (사용자 정의 토픽명)", fontsize=14)
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Raw (uncleaned) texts, row-aligned with `labels` and `embeddings` because
# `df` was filtered to the non-null "원자료" rows before labelling.
original_sentences = df["원자료"].tolist()

# Loop over the clusters the fitted model actually has instead of a
# hard-coded 7, so changing the cluster count cannot cause an IndexError on
# cluster_centers_ or silently skip clusters.
for topic_idx in range(kmeans.n_clusters):
    print(f"\n🧩 토픽 {topic_idx} 대표 문장:")

    # Cosine similarity of every sentence embedding to this cluster's centre.
    center_vector = kmeans.cluster_centers_[topic_idx].reshape(1, -1)
    sims = cosine_similarity(center_vector, embeddings)[0]

    # Among the sentences assigned to this cluster, take the (up to) five
    # most similar to the centre, most similar first.
    topic_sentences_idx = np.where(labels == topic_idx)[0]
    top_idxs = topic_sentences_idx[np.argsort(sims[topic_sentences_idx])[::-1][:5]]

    for idx in top_idxs:
        print(f"- {original_sentences[idx]}")

In [None]:
!pip install konlpy
from konlpy.tag import Okt
# Shared KoNLPy Okt tagger instance; extract_keywords() calls okt.nouns() on it.
okt = Okt()

In [None]:
# Tokens excluded from the per-topic keyword counts.
# The original cell assigned this set to the misspelled name `topwords`, so
# `stopwords` (the name extract_keywords() reads) was never defined here and
# the keyword step depended on whichever `stopwords` an earlier cell had left
# in the kernel. It is now bound to `stopwords`.
stopwords = set([
    '그', '이', '저', '것', '등', '때', '중', '누구', '무엇',
    '하다', '되다', '있다', '없다', '받다', '가다', '오다', '보다', '주다', '말하다', '들다',
    '계속', '이제', '지금', '예전', '요즘', '거의', '좀', '많이', '더', '다시', '별로',
    '그리고', '그래서', '그런데', '하지만', '그러니까', '때문', '이런', '그런', '어떤', '같다',
    '나', '내', '너', '우리', '당신', '그녀', '그들', '사람', '모두',
    '여기', '저기', '거기', '안', '밖', '위', '아래', '그곳', '방향',
    '도', '는', '만', '과', '와', '보다', '까지', '부터', '으로', '에게', '이랑', '밖에', '조차',
    '정도', '경우', '내용', '문제', '이야기', '모습', '상황', '자신',
    '음', '응', '오', '어휴', '허', '아이고', '에휴', '웃음', '그떄', '그다음', '아마',
    '...', '…', '“', '”', '‘', '’', '-', '--', '―',
    '그냥', '진짜', '완전', '약간', '어떻게', '뭐', '딱', '막', '또', '또는',
    '해도', '생각', '중략', '가지', '조금', '다문화', '이제',
    '중국', '일본', '한국', '우즈베키스탄', '베트남', '태국', '몽골', '몽고',
    '자기', '그때', '그거', '수도', '그게', '여러', '무슨', '네네', '나이', '어디',
    '먼저', '대부분', '나중', '대해', '그것', '뭔가', '전혀', '저희', '만약', '이주', '나가야',
    '다른', '항상', '얘기', '나라', '부분', '선생님'
])
# Backward-compatible alias for the original misspelled name.
topwords = stopwords

# Maps a variant token (key) to the single canonical token (value) that
# replaces it before keyword extraction.
standardization_dict = {
    "아빠": "남편", "여자": "여성", "아기": "아이",
    "한국말": "한국어", "시엄마": "시어머니", "어머니": "시어머니", "대한민국": "한국",
    "외국": "외국인", "외국사람": "외국인",
    "차별적": "차별",
    "무시당함": "무시", "무시함": "무시",
    "편견들": "편견", "회사": "직장", "업무": "직장", "일": "직장", "회사생활": "직장",
    "말": "언어",
    "한국사람": "한국인"
}

In [None]:
def extract_keywords(text):
    """Return the noun keywords found in ``text``.

    Each whitespace-separated token is first replaced by its
    ``standardization_dict`` entry (if any) and then passed to
    ``okt.nouns()``. A noun is kept when it is longer than one character and
    is not in ``stopwords``.

    NOTE(review): standardization is applied to whole whitespace tokens
    before noun extraction, so a token only gets standardized when it matches
    a dictionary key exactly — confirm this is the intended order.

    Args:
        text: The string to extract keywords from.

    Returns:
        A list of kept nouns in order of appearance (duplicates included).
    """
    keywords = []
    for raw_token in text.split():
        token = standardization_dict.get(raw_token, raw_token)
        keywords.extend(
            noun
            for noun in okt.nouns(token)
            if len(noun) > 1 and noun not in stopwords
        )
    return keywords

In [None]:
from collections import Counter

print("\n📌 각 토픽별 상위 키워드 (형태소 + 불용어 제거 + 표준화):")

# Loop over the clusters the fitted model actually has instead of a
# hard-coded 7, so the report stays complete when the cluster count changes.
for topic_idx in range(kmeans.n_clusters):
    # Keyword lists for the sentences assigned to this cluster. The value is
    # cast to str first (as was done before embedding) so a non-string cell
    # cannot break the .split() inside extract_keywords().
    topic_docs = [
        extract_keywords(str(text))
        for text, label in zip(original_sentences, labels)
        if label == topic_idx
    ]
    all_words = [word for doc in topic_docs for word in doc]

    if not all_words:
        print(f"\n🧩 토픽 {topic_idx}: 키워드 없음")
        continue

    # Ten most frequent keywords in this cluster, with their counts.
    top_words = Counter(all_words).most_common(10)
    print(f"\n🧩 토픽 {topic_idx} 키워드:")
    for word, freq in top_words:
        print(f"- {word} ({freq})")