In [12]:
import pandas as pd
import ast
import re

# Step 1: Load the cleaned CSV file
behaviors_df = pd.read_csv(filepath_or_buffer='./csv/restructured_behaviors.csv')

# Step 2: Load the keyword data for lookup
keyword_df = pd.read_csv(filepath_or_buffer='./keyword_summary/keyword_collect_summary_df.csv')

# Step 3: Map each lower-cased keyword to its parsed summary.
# The `summary` column is decoded with ast.literal_eval, so it is expected to
# contain Python-literal text.
keyword_info = {
    name.lower(): {'Summary': ast.literal_eval(raw_summary)}
    for name, raw_summary in zip(keyword_df['Keyword'], keyword_df['summary'])
}

# Step 4: Helper that pulls every `keyword:XXX` tag out of a behavior string
def extract_keywords(behavior_str):
    """Return the alphanumeric names following each ``keyword:`` tag, in order of appearance."""
    tag_pattern = re.compile(r'keyword:([a-zA-Z0-9]+)')
    return [found.group(1) for found in tag_pattern.finditer(behavior_str)]

# Step 5: Append the summary of each known keyword right after its `keyword:XXX` tag
def add_keyword_info(behavior_str, keyword_lookup=None):
    """Annotate every ``keyword:XXX`` tag with the summary of keyword ``XXX``.

    Each tag whose name is a key of the lookup becomes ``keyword:XXX{summary}``.
    Tags with unknown names are left unchanged.

    The text is rewritten in a single pass over the tags themselves. A plain
    ``str.replace`` on the keyword text would also hit the same letters inside
    unrelated words and inside summaries that were already inserted, and it
    would annotate a keyword twice when it is tagged twice.

    Args:
        behavior_str: Behavior description containing ``keyword:`` tags.
        keyword_lookup: Optional mapping ``name -> {'Summary': ...}``. Defaults
            to the notebook-level ``keyword_info`` dictionary.

    Returns:
        str: The annotated text. It is stripped of surrounding whitespace only
        when at least one tag was annotated (same as before); otherwise the
        input is returned unchanged.
    """
    lookup = keyword_info if keyword_lookup is None else keyword_lookup

    def _annotate(match):
        # Rebuild the tag followed by its summary in curly braces.
        keyword = match.group(1)
        if keyword not in lookup:
            return match.group(0)
        return f'keyword:{keyword}{{{lookup[keyword]["Summary"]}}}'

    annotated = re.sub(r'keyword:([a-zA-Z0-9]+)', _annotate, behavior_str)
    # An annotation always adds characters, so equality means nothing matched.
    if annotated == behavior_str:
        return behavior_str
    return annotated.strip()

# Step 6: Process the behaviors data and add keyword information
def process_haviors(behaviors_df):
    """Return a copy of ``behaviors_df`` with a ``Processed_Behavior`` column.

    ``Processed_Behavior`` is the lower-cased ``Behavior_Description`` passed
    through ``add_keyword_info``. The input frame is not modified.

    Compared with rebuilding the frame row by row via ``iterrows``, this keeps
    the column dtypes, keeps the columns of an empty frame, and does not crash
    on missing descriptions.

    Args:
        behaviors_df: DataFrame with a ``Behavior_Description`` column.

    Returns:
        pandas.DataFrame: Same rows and index as the input plus the
        ``Processed_Behavior`` column. Missing descriptions stay missing.
    """
    def _annotate(description):
        # NaN has no .lower(); leave it missing so later steps can handle it.
        if pd.isna(description):
            return description
        return add_keyword_info(str(description).lower())

    return behaviors_df.assign(
        Processed_Behavior=behaviors_df['Behavior_Description'].map(_annotate)
    )

# Post-processing helper for the annotated behavior text
def clean_effects(text):
    """Drop ``keyword:`` markers and turn ``{...}`` spans into ``(...)``.

    Missing values (as detected by ``pd.isna``) yield an empty string.
    """
    # Guard against NaN
    if pd.isna(text):
        return ""

    brace_span = re.compile(r'\{(.*?)\}')

    # 1. Remove every "keyword:" marker
    untagged = re.compile(r'keyword:').sub('', text)

    # 2. Rewrite each shortest {...} span as (...)
    return brace_span.sub(r'(\1)', untagged)

# Step 7: Annotate the behaviors, clean the annotated text, keep the exported
# columns, and save the result
new_behaviors_df = (
    process_haviors(behaviors_df)
    .assign(Processed_Behavior=lambda frame: frame["Processed_Behavior"].map(clean_effects))
    .loc[:, ['Identity_ID', 'Effect_Type', 'Processed_Behavior']]
)
new_behaviors_df.to_csv(
    path_or_buf='./behavior_summary/updated_behaviors_add_keyword_summary.csv',
    index=False,
)
print(new_behaviors_df.head())  # quick look at the result

   Identity_ID      Effect_Type  \
0        10101          Passive   
1        10101  Support Passive   
2        10102          Passive   
3        10102  Support Passive   
4        10103          Passive   

                                  Processed_Behavior  
0  apply 1 attackdmgup(Deal more damage with skil...  
1  at the end of the turn, heal 10 sp for 1 ally ...  
2       in a clash, the opponent has -2 clash power.  
3  1 ally with the highest max hp gains +1 clash ...  
4  on clash win, gain +1 breath(On hit, gain a Po...  


In [15]:
import pandas as pd
from transformers import pipeline

# Load the BART-large-CNN summarization pipeline
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")

# Read the behaviors CSV that already carries the keyword summaries
df = pd.read_csv(
    filepath_or_buffer="./behavior_summary/updated_behaviors_add_keyword_summary.csv"
)

# Summarization helper
def summarize_text(text):
    """Summarize one behavior description with the ``summarizer`` pipeline of this cell.

    Args:
        text: Behavior description. Missing values (NaN/None) and blank
            strings are accepted.

    Returns:
        str: ``summary_text`` of the first pipeline result, or ``""`` when
        there is nothing to summarize.
    """
    # Empty CSV fields are read back by pandas as NaN, which is not text the
    # pipeline can summarize; skip those and whitespace-only values.
    if pd.isna(text) or not str(text).strip():
        return ""
    summary = summarizer(str(text), max_length=50, min_length=10, do_sample=False)
    return summary[0]['summary_text']

# Summarize every processed behavior and keep only the exported columns
output_csv_path = "./behavior_summary/updated_behaviors_summary_bart_large.csv"
df = df.assign(
    Summary_Effect=df['Processed_Behavior'].map(summarize_text)
).loc[:, ['Identity_ID', 'Effect_Type', 'Summary_Effect']]

# Save the result without the index column
df.to_csv(path_or_buf=output_csv_path, index=False)

print(f"✅ 요약된 데이터가 '{output_csv_path}' 파일로 저장되었습니다.")

Device set to use mps:0
Your max_length is set to 50, but your input_length is only 39. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=19)
  test_elements = torch.tensor(test_elements)
Your max_length is set to 50, but your input_length is only 24. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)
Your max_length is set to 50, but your input_length is only 14. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)
Your max_length is set to 50, but your input_length is only 18. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasin

✅ 요약된 데이터가 './behavior_summary/updated_behaviors_summary_bart_large.csv' 파일로 저장되었습니다.


In [16]:
import pandas as pd
from transformers import pipeline

# Load the FLAN-T5-base model through the summarization pipeline
summarizer = pipeline(task="summarization", model="google/flan-t5-base")

# Read the behaviors CSV that already carries the keyword summaries
df = pd.read_csv(
    filepath_or_buffer="./behavior_summary/updated_behaviors_add_keyword_summary.csv"
)

# Summarization helper
def summarize_text(text):
    """Summarize one behavior description with the ``summarizer`` pipeline of this cell.

    Args:
        text: Behavior description. Missing values (NaN/None) and blank
            strings are accepted.

    Returns:
        str: ``summary_text`` of the first pipeline result, or ``""`` when
        there is nothing to summarize.
    """
    # Empty CSV fields are read back by pandas as NaN, which is not text the
    # pipeline can summarize; skip those and whitespace-only values.
    if pd.isna(text) or not str(text).strip():
        return ""
    summary = summarizer(str(text), max_length=50, min_length=10, do_sample=False)
    return summary[0]['summary_text']

# Summarize every processed behavior and keep only the exported columns
output_csv_path = "./behavior_summary/updated_behaviors_summary_t5_base.csv"
df = df.assign(
    Summary_Effect=df['Processed_Behavior'].map(summarize_text)
).loc[:, ['Identity_ID', 'Effect_Type', 'Summary_Effect']]

# Save the result without the index column
df.to_csv(path_or_buf=output_csv_path, index=False)

print(f"✅ 요약된 데이터가 '{output_csv_path}' 파일로 저장되었습니다.")

Device set to use mps:0
Your max_length is set to 50, but your input_length is only 47. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)
  test_elements = torch.tensor(test_elements)
Your max_length is set to 50, but your input_length is only 33. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=16)
Your max_length is set to 50, but your input_length is only 16. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=8)
Your max_length is set to 50, but your input_length is only 23. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasin

✅ 요약된 데이터가 './behavior_summary/updated_behaviors_summary_t5_base.csv' 파일로 저장되었습니다.


In [17]:
import pandas as pd
from transformers import pipeline

# Load the DistilBART-CNN summarization pipeline
summarizer = pipeline(task="summarization", model="sshleifer/distilbart-cnn-12-6")

# Read the behaviors CSV that already carries the keyword summaries
df = pd.read_csv(
    filepath_or_buffer="./behavior_summary/updated_behaviors_add_keyword_summary.csv"
)

# Summarization helper
def summarize_text(text):
    """Summarize one behavior description with the ``summarizer`` pipeline of this cell.

    Args:
        text: Behavior description. Missing values (NaN/None) and blank
            strings are accepted.

    Returns:
        str: ``summary_text`` of the first pipeline result, or ``""`` when
        there is nothing to summarize.
    """
    # Empty CSV fields are read back by pandas as NaN, which is not text the
    # pipeline can summarize; skip those and whitespace-only values.
    if pd.isna(text) or not str(text).strip():
        return ""
    summary = summarizer(str(text), max_length=50, min_length=10, do_sample=False)
    return summary[0]['summary_text']

# Summarize every processed behavior and keep only the exported columns
output_csv_path = "./behavior_summary/updated_behaviors_summary_distilbart_cnn.csv"
df = df.assign(
    Summary_Effect=df['Processed_Behavior'].map(summarize_text)
).loc[:, ['Identity_ID', 'Effect_Type', 'Summary_Effect']]

# Save the result without the index column
df.to_csv(path_or_buf=output_csv_path, index=False)

print(f"✅ 요약된 데이터가 '{output_csv_path}' 파일로 저장되었습니다.")

Device set to use mps:0
Your max_length is set to 50, but your input_length is only 39. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=19)
  test_elements = torch.tensor(test_elements)
Your max_length is set to 50, but your input_length is only 24. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)
Your max_length is set to 50, but your input_length is only 14. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)
Your max_length is set to 50, but your input_length is only 18. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasin

✅ 요약된 데이터가 './behavior_summary/updated_behaviors_summary_distilbart_cnn.csv' 파일로 저장되었습니다.


In [19]:
import pandas as pd
from transformers import pipeline
from collections import defaultdict

# Load the BART-large-CNN summarization pipeline
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")

# Read the per-effect summaries produced with the BART-large model
df = pd.read_csv(
    filepath_or_buffer="./behavior_summary/updated_behaviors_summary_bart_large.csv"
)

# Combine the effect summaries of each Identity_ID and summarize the combined text
def summarize_by_behaviors(df, summarizer_pipeline=None):
    """Produce one behavior summary per ``Identity_ID``.

    The ``Summary_Effect`` values of each identity are concatenated (each
    followed by a space) and the combined text is summarized.

    Args:
        df: DataFrame with ``Identity_ID`` and ``Summary_Effect`` columns.
        summarizer_pipeline: Optional callable used like the transformers
            pipeline, ``f(text, max_length=..., min_length=..., do_sample=...)``
            returning ``[{'summary_text': ...}]``. Defaults to the
            ``summarizer`` loaded in this cell.

    Returns:
        pandas.DataFrame: Columns ``Identity_ID`` and ``Behavior_Summary``, one
        row per identity in order of first appearance. The columns are present
        even when ``df`` has no rows. Identities whose effects are all missing
        get an empty summary and the model is not called for them.
    """
    run_summarizer = summarizer if summarizer_pipeline is None else summarizer_pipeline

    # Plain dicts keep insertion order, so identities stay in first-seen order.
    effects_by_identity = {}
    for identity_id, effect in zip(df['Identity_ID'], df['Summary_Effect']):
        parts = effects_by_identity.setdefault(identity_id, [])
        # str(NaN) would feed the literal word "nan" to the model, so skip it.
        if pd.notna(effect):
            parts.append(str(effect))

    summarized_identities = []
    for identity_id, parts in effects_by_identity.items():
        combined_behavior = "".join(part + " " for part in parts)
        if combined_behavior.strip():
            summary = run_summarizer(combined_behavior, max_length=50, min_length=10, do_sample=False)
            behavior_summary = summary[0]['summary_text']
        else:
            behavior_summary = ""
        summarized_identities.append({
            'Identity_ID': identity_id,
            'Behavior_Summary': behavior_summary
        })

    return pd.DataFrame(summarized_identities, columns=['Identity_ID', 'Behavior_Summary'])

# Build the per-identity summaries and write them to disk
output_csv_path = "./behavior_summary/summarized_per_identity_behavior_bart_large.csv"
summarized_behavior_df = summarize_by_behaviors(df)
summarized_behavior_df.to_csv(path_or_buf=output_csv_path, index=False)

print(f"✅ 요약된 데이터가 '{output_csv_path}' 파일로 저장되었습니다.")

Device set to use mps:0
  test_elements = torch.tensor(test_elements)
Your max_length is set to 50, but your input_length is only 29. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)
Your max_length is set to 50, but your input_length is only 46. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)
Your max_length is set to 50, but your input_length is only 35. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=17)
Your max_length is set to 50, but your input_length is only 46. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasi

✅ 요약된 데이터가 './behavior_summary/summarized_per_identity_behavior_bart_large.csv' 파일로 저장되었습니다.


In [20]:
import pandas as pd
from transformers import pipeline
from collections import defaultdict

# Load the FLAN-T5-base model through the summarization pipeline
summarizer = pipeline(task="summarization", model="google/flan-t5-base")

# Read the per-effect summaries produced with the T5 model
df = pd.read_csv(
    filepath_or_buffer="./behavior_summary/updated_behaviors_summary_t5_base.csv"
)

# Combine the effect summaries of each Identity_ID and summarize the combined text
def summarize_by_behaviors(df, summarizer_pipeline=None):
    """Produce one behavior summary per ``Identity_ID``.

    The ``Summary_Effect`` values of each identity are concatenated (each
    followed by a space) and the combined text is summarized.

    Args:
        df: DataFrame with ``Identity_ID`` and ``Summary_Effect`` columns.
        summarizer_pipeline: Optional callable used like the transformers
            pipeline, ``f(text, max_length=..., min_length=..., do_sample=...)``
            returning ``[{'summary_text': ...}]``. Defaults to the
            ``summarizer`` loaded in this cell.

    Returns:
        pandas.DataFrame: Columns ``Identity_ID`` and ``Behavior_Summary``, one
        row per identity in order of first appearance. The columns are present
        even when ``df`` has no rows. Identities whose effects are all missing
        get an empty summary and the model is not called for them.
    """
    run_summarizer = summarizer if summarizer_pipeline is None else summarizer_pipeline

    # Plain dicts keep insertion order, so identities stay in first-seen order.
    effects_by_identity = {}
    for identity_id, effect in zip(df['Identity_ID'], df['Summary_Effect']):
        parts = effects_by_identity.setdefault(identity_id, [])
        # str(NaN) would feed the literal word "nan" to the model, so skip it.
        if pd.notna(effect):
            parts.append(str(effect))

    summarized_identities = []
    for identity_id, parts in effects_by_identity.items():
        combined_behavior = "".join(part + " " for part in parts)
        if combined_behavior.strip():
            summary = run_summarizer(combined_behavior, max_length=50, min_length=10, do_sample=False)
            behavior_summary = summary[0]['summary_text']
        else:
            behavior_summary = ""
        summarized_identities.append({
            'Identity_ID': identity_id,
            'Behavior_Summary': behavior_summary
        })

    return pd.DataFrame(summarized_identities, columns=['Identity_ID', 'Behavior_Summary'])

# Build the per-identity summaries and write them to disk
output_csv_path = "./behavior_summary/summarized_per_identity_behavior_t5_base.csv"
summarized_behavior_df = summarize_by_behaviors(df)
summarized_behavior_df.to_csv(path_or_buf=output_csv_path, index=False)

print(f"✅ 요약된 데이터가 '{output_csv_path}' 파일로 저장되었습니다.")

Device set to use mps:0
Your max_length is set to 50, but your input_length is only 41. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=20)
  test_elements = torch.tensor(test_elements)
Your max_length is set to 50, but your input_length is only 38. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=19)
Your max_length is set to 50, but your input_length is only 29. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)
Your max_length is set to 50, but your input_length is only 48. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasi

✅ 요약된 데이터가 './behavior_summary/summarized_per_identity_behavior_t5_base.csv' 파일로 저장되었습니다.


In [21]:
import pandas as pd
from transformers import pipeline
from collections import defaultdict

# Load the DistilBART-CNN summarization pipeline
summarizer = pipeline(task="summarization", model="sshleifer/distilbart-cnn-12-6")

# Read the per-effect summaries produced with the DistilBART model
df = pd.read_csv(
    filepath_or_buffer="./behavior_summary/updated_behaviors_summary_distilbart_cnn.csv"
)

# Combine the effect summaries of each Identity_ID and summarize the combined text
def summarize_by_behaviors(df, summarizer_pipeline=None):
    """Produce one behavior summary per ``Identity_ID``.

    The ``Summary_Effect`` values of each identity are concatenated (each
    followed by a space) and the combined text is summarized.

    Args:
        df: DataFrame with ``Identity_ID`` and ``Summary_Effect`` columns.
        summarizer_pipeline: Optional callable used like the transformers
            pipeline, ``f(text, max_length=..., min_length=..., do_sample=...)``
            returning ``[{'summary_text': ...}]``. Defaults to the
            ``summarizer`` loaded in this cell.

    Returns:
        pandas.DataFrame: Columns ``Identity_ID`` and ``Behavior_Summary``, one
        row per identity in order of first appearance. The columns are present
        even when ``df`` has no rows. Identities whose effects are all missing
        get an empty summary and the model is not called for them.
    """
    run_summarizer = summarizer if summarizer_pipeline is None else summarizer_pipeline

    # Plain dicts keep insertion order, so identities stay in first-seen order.
    effects_by_identity = {}
    for identity_id, effect in zip(df['Identity_ID'], df['Summary_Effect']):
        parts = effects_by_identity.setdefault(identity_id, [])
        # str(NaN) would feed the literal word "nan" to the model, so skip it.
        if pd.notna(effect):
            parts.append(str(effect))

    summarized_identities = []
    for identity_id, parts in effects_by_identity.items():
        combined_behavior = "".join(part + " " for part in parts)
        if combined_behavior.strip():
            summary = run_summarizer(combined_behavior, max_length=50, min_length=10, do_sample=False)
            behavior_summary = summary[0]['summary_text']
        else:
            behavior_summary = ""
        summarized_identities.append({
            'Identity_ID': identity_id,
            'Behavior_Summary': behavior_summary
        })

    return pd.DataFrame(summarized_identities, columns=['Identity_ID', 'Behavior_Summary'])

# Build the per-identity summaries and write them to disk
output_csv_path = "./behavior_summary/summarized_per_identity_behavior_distilbart_cnn.csv"
summarized_behavior_df = summarize_by_behaviors(df)
summarized_behavior_df.to_csv(path_or_buf=output_csv_path, index=False)

print(f"✅ 요약된 데이터가 '{output_csv_path}' 파일로 저장되었습니다.")

Device set to use mps:0
  test_elements = torch.tensor(test_elements)
Your max_length is set to 50, but your input_length is only 43. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=21)
Your max_length is set to 50, but your input_length is only 36. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=18)
Your max_length is set to 50, but your input_length is only 44. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=22)
Your max_length is set to 50, but your input_length is only 44. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasi

✅ 요약된 데이터가 './behavior_summary/summarized_per_identity_behavior_distilbart_cnn.csv' 파일로 저장되었습니다.


In [None]:
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
import umap
import hdbscan
import pandas as pd

# Load the per-identity skill summaries and behavior summaries
identities_skills_summary_df = pd.read_csv(
    filepath_or_buffer="./identities_skills_summary/summarized_per_identity_bart_large_cnn.csv"
)
behaviors_summary_df = pd.read_csv(
    filepath_or_buffer="./behavior_summary/summarized_per_identity_behavior_bart_large.csv"
)

# Inner join: only identities present in both files are kept
merged_df = pd.merge(
    identities_skills_summary_df,
    behaviors_summary_df,
    on="Identity_ID",
    how="inner",
)

# One "<skills summary> <behavior summary>" text per identity
combined_list = (
    merged_df["Skills_Summary"].astype(str)
    .add(" ")
    .add(merged_df["Behavior_Summary"].astype(str))
    .tolist()
)


# Seed for the stochastic UMAP step, so topic assignments are reproducible.
RANDOM_STATE = 42

# ✅ Load the SBERT model that turns each sentence into a numeric vector
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode every combined text into an embedding vector (384 dimensions per sentence)
embeddings = model.encode(combined_list, convert_to_numpy=True)

# 3️⃣ Reduce the high-dimensional embeddings with UMAP before clustering.
# Without a fixed random_state the projection, and therefore the topics,
# can differ from one run to the next.
umap_model = umap.UMAP(
    n_neighbors=5,
    n_components=5,
    metric='cosine',
    random_state=RANDOM_STATE,
)

# 4️⃣ Density-based clustering with HDBSCAN on the reduced vectors
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=2, metric='euclidean', cluster_selection_method='eom')

# 5️⃣ Run topic modeling with BERTopic using the models configured above
topic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model)

# 6️⃣ Fit on the precomputed embeddings and assign a topic to every text
topics, probs = topic_model.fit_transform(combined_list, embeddings)

# 7️⃣ Print the topic assigned to each text and the per-topic overview
print("📌 각 문장의 토픽 번호:", topics)
print("\n🔍 토픽별 정보:")
print(topic_model.get_topic_info())

# 8️⃣ Show the representative keywords of every real topic.
# get_topic() returns (word, score) pairs rather than sentences, so the heading
# now says "representative keywords" instead of "related sentences".
# Topics are visited in ascending order so the output is stable between runs.
for topic_num in sorted(set(topics)):
    if topic_num == -1:  # -1 marks texts that were not assigned to any cluster
        continue
    print(f"\n🔥 토픽 {topic_num} 대표 키워드:")
    print(topic_model.get_topic(topic_num))

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [1]:
import pandas as pd

# Keep only the identifying columns and the T5 topic number, then save them
clustered_df = pd.read_csv(
    filepath_or_buffer="./identities_summary/update_topic_modeling_clustered_identity.csv"
).loc[:, ["Identity_ID", "Name", "T5_Topic_Number"]]

output_csv_path = "./identities_summary/clustered_identities_t5_classifier.csv"
clustered_df.to_csv(path_or_buf=output_csv_path, index=False)
print(f"✅ 요약된 데이터가 '{output_csv_path}' 파일로 저장되었습니다.")

✅ 요약된 데이터가 './identities_summary/clustered_identities_t5_classifier.csv' 파일로 저장되었습니다.
