In [2]:
import requests
import pandas as pd
import json
import re
from collections import Counter
from konlpy.tag import Okt
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
import nltk
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import os

# Font file handed to WordCloud; set so that Korean (Hangul) text can be rendered.
FONT_PATH = "C:/Windows/Fonts/malgun.ttf"  # Windows-specific location

# 1. 리뷰 가져오기
def fetch_steam_reviews(appid, language='all', count=100, timeout=10):
    """Request one page of reviews for a Steam app and return the parsed JSON.

    Args:
        appid: Steam application id; interpolated into the request URL.
        language: Value sent as the ``language`` query parameter.
        count: Value sent as the ``num_per_page`` query parameter.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without it a stalled connection would block the
            caller indefinitely.

    Returns:
        The decoded JSON body when the response status is 200, otherwise
        ``None``.

    Raises:
        requests.RequestException: On connection failures or timeouts
            (propagated unchanged from ``requests.get``).
    """
    url = f'https://store.steampowered.com/appreviews/{appid}?json=1'
    params = {
        'filter': 'all',
        'language': language,
        'review_type': 'all',
        'num_per_page': count,
    }
    response = requests.get(url, params=params, timeout=timeout)
    if response.status_code != 200:
        # Non-200 answers are reported to the caller as "no data".
        return None
    return response.json()

# 2. 리뷰 CSV 저장
def save_reviews_to_csv(appid):
    """Fetch Korean and English reviews for ``appid`` and write them to a CSV.

    Up to 100 reviews are requested per language via ``fetch_steam_reviews``.
    Each row holds the language label, the review text and its ``votes_up``
    count. Languages whose request yields no usable payload are skipped.

    Args:
        appid: Steam application id; also used in the output file name.

    Returns:
        Path of the CSV file that was written.
    """
    languages = {'korean': 100, 'english': 100}
    all_reviews = []

    for lang, count in languages.items():
        data = fetch_steam_reviews(appid, language=lang, count=count)
        if not data or 'reviews' not in data:
            continue
        all_reviews.extend(
            {
                'language': lang,
                'review_text': review['review'],
                'helpful': review['votes_up'],
            }
            for review in data['reviews']
        )

    filename = f"../Output/steam_reviews_{appid}.csv"
    # Create the output directory on first use instead of failing in to_csv.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    # Fix the column set explicitly so that the CSV still carries a header row
    # when no review was fetched; otherwise reading it back fails.
    df = pd.DataFrame(all_reviews, columns=['language', 'review_text', 'helpful'])
    df.to_csv(filename, index=False, encoding='utf-8-sig')
    print(f"Saved {len(df)} reviews to {filename}")
    return filename

# Helper: load_stopwords() - reads a stopword file (used by the keyword extraction step below)
def load_stopwords(filepath):
    """Read a stopword file into a set, one entry per non-blank line.

    Every line is stripped of surrounding whitespace; lines that are empty
    after stripping are ignored. When ``filepath`` does not exist a warning
    is printed and an empty set is returned.
    """
    if os.path.exists(filepath):
        with open(filepath, 'r', encoding='utf-8') as handle:
            stripped_lines = (raw.strip() for raw in handle)
            return {entry for entry in stripped_lines if entry}

    print(f"[경고] 불용어 파일이 존재하지 않습니다: {filepath}")
    return set()


# 3. 키워드 추출 함수 (갯수 조절 가능)
def extract_keywords(text, is_korean=False, custom_stopwords=None, top_n=20, extra_stopwords=None):
    """Return the ``top_n`` most frequent non-stopword tokens in ``text``.

    Korean text is reduced to nouns with ``Okt``. Other text is tokenized
    with ``TreebankWordTokenizer``, restricted to alphanumeric tokens and
    lower-cased.

    Args:
        text: The text to analyse.
        is_korean: Selects the Korean (noun extraction) path when true.
        custom_stopwords: Iterable of words to drop. When ``None`` the
            language's default stopword file is loaded via ``load_stopwords``.
            The passed collection is never modified.
        top_n: Maximum number of ``(word, count)`` pairs to return.
        extra_stopwords: Optional additional words to drop on top of the
            base stopword set.

    Returns:
        A list of ``(word, count)`` tuples as produced by
        ``Counter.most_common(top_n)``.
    """
    if is_korean:
        words = Okt().nouns(text)
        default_stopword_file = "../Data/stopwords-ko.txt"
    else:
        tokens = TreebankWordTokenizer().tokenize(text)
        words = [token.lower() for token in tokens if token.isalnum()]
        default_stopword_file = "../Data/stopwords-en.txt"

    if custom_stopwords is None:
        base_stopwords = load_stopwords(default_stopword_file)
    else:
        base_stopwords = custom_stopwords

    # Work on a private copy: updating the caller's collection in place would
    # leak one call's extra stopwords into later calls that share the same
    # object, and would fail outright for lists or frozensets.
    active_stopwords = set(base_stopwords)
    if extra_stopwords:
        active_stopwords.update(extra_stopwords)

    kept = [word for word in words if word not in active_stopwords]
    return Counter(kept).most_common(top_n)



# 4. 리뷰에서 언어별 키워드 추출
def process_reviews_for_keywords(csv_file, top_n=20,
                                  custom_stopwords=None,
                                  extra_korean_stopwords=None,
                                  extra_english_stopwords=None):
    """Compute keyword frequencies per language from a saved review CSV.

    The CSV is read with pandas; for 'korean' and then 'english', all
    non-null ``review_text`` values of that language are joined with spaces
    and handed to ``extract_keywords`` together with the language-specific
    extra stopwords.

    Returns:
        A list with one dict per language, each holding the keys
        ``'language'`` and ``'keywords'``.
    """
    reviews = pd.read_csv(csv_file)

    # Each language receives its own set of additional stopwords.
    extras_by_language = {
        'korean': extra_korean_stopwords,
        'english': extra_english_stopwords,
    }

    def keywords_for(language):
        rows = reviews[reviews['language'] == language]
        joined_text = ' '.join(rows['review_text'].dropna())
        return extract_keywords(
            joined_text,
            is_korean=(language == 'korean'),
            custom_stopwords=custom_stopwords,
            top_n=top_n,
            extra_stopwords=extras_by_language[language],
        )

    return [
        {'language': language, 'keywords': keywords_for(language)}
        for language in ('korean', 'english')
    ]


# 5. 워드클라우드 생성 및 저장
def generate_wordcloud(keywords, language):
    """Write the keyword frequencies to CSV and render them as a word cloud PNG.

    The CSV is written first, so it exists even when no image can be produced.
    If ``keywords`` is empty, a warning is printed and no image is generated,
    because ``WordCloud.generate_from_frequencies`` rejects an empty mapping.

    Args:
        keywords: Sequence of ``(word, frequency)`` pairs.
        language: Label used in the output file names and printed messages.

    Returns:
        None.
    """
    csv_output_path = f"../Output/keyword_freq_{language}.csv"
    # Create the output directory on first use instead of failing on write.
    os.makedirs(os.path.dirname(csv_output_path), exist_ok=True)

    pd.DataFrame(keywords, columns=['word', 'frequency']).to_csv(
        csv_output_path, index=False, encoding='utf-8-sig'
    )
    print(f"{language} 키워드 빈도 CSV 저장 완료 → {csv_output_path}")

    word_dict = dict(keywords)
    if not word_dict:
        # Skip rendering rather than letting one empty language abort the run.
        print(f"[경고] {language} 키워드가 없어 워드클라우드를 생성하지 않습니다.")
        return

    wc = WordCloud(
        font_path=FONT_PATH,
        background_color=None,  # transparent background (used with RGBA mode)
        colormap='Set2',
        mode='RGBA',
        width=800,
        height=400
    ).generate_from_frequencies(word_dict)

    # NOTE(review): this figure is closed below without being shown or saved,
    # so the title only affects the in-memory figure; the PNG comes from
    # wc.to_file and carries no title.
    plt.figure(figsize=(10, 5))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"{language.capitalize()} 리뷰에 대한 워드 클라우드")

    output_path = f"../Output/wordcloud_{language}.png"
    wc.to_file(output_path)
    print(f"{language} 워드클라우드 저장 완료 → {output_path}")
    plt.close()

# 6. 전체 흐름 실행 함수
def run_analysis(appid, top_n=20, extra_korean_stopwords=None, extra_english_stopwords=None):
    """Run the whole pipeline for one Steam app.

    Downloads the reviews to CSV, extracts per-language keyword frequencies
    from that CSV, and generates a word cloud for every language result.
    """
    keyword_sets = process_reviews_for_keywords(
        csv_file=save_reviews_to_csv(appid),
        top_n=top_n,
        extra_korean_stopwords=extra_korean_stopwords,
        extra_english_stopwords=extra_english_stopwords,
    )

    for entry in keyword_sets:
        generate_wordcloud(entry['keywords'], entry['language'])


In [3]:
# Additional stopwords per language; run_analysis forwards them so they are
# dropped on top of the base stopword list for that language.
my_korean_stopwords = {'게임', '진짜', '완전'}
my_english_stopwords = {'game', 'really', 'lol'}

# Run the full pipeline for app 2456740, keeping up to 1000 keywords per language.
run_analysis(appid=2456740, 
             top_n=1000,
             extra_korean_stopwords=my_korean_stopwords,
             extra_english_stopwords=my_english_stopwords)

Saved 200 reviews to ../Output/steam_reviews_2456740.csv
korean 키워드 빈도 CSV 저장 완료 → ../Output/keyword_freq_korean.csv
korean 워드클라우드 저장 완료 → ../Output/wordcloud_korean.png
english 키워드 빈도 CSV 저장 완료 → ../Output/keyword_freq_english.csv
english 워드클라우드 저장 완료 → ../Output/wordcloud_english.png
