<a href="https://colab.research.google.com/github/icecat14159/PL-Repo./blob/main/HW04_%E7%B6%B2%E9%A0%81%E7%88%AC%E8%9F%B2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [97]:
import requests
from bs4 import BeautifulSoup

# 不含 page 參數的主網址
# 原始網址: https://forum.gamer.com.tw/C.php?page=1&bsn=33651&snA=8402&tnum=27&bPage=2
# 從網址結構來看，page 參數應該是控制文章列表的頁數，而不是單一文章內的樓層。
# 要抓取單一文章的樓層，可能需要分析該文章頁面的結構，通常樓層會有特定的 class 或 id。
# 觀察提供的網址，它已經包含了 bPage=2 這個參數，可能就是控制樓層頁數的。
# 我們可以嘗試移除 page=1 這個參數，並讓 bPage 參數來控制頁數。

base_url = "https://forum.gamer.com.tw/C.php?bsn=33651&snA=8402&tnum=27"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/118.0.5993.90 Safari/537.36",
    "Accept-Language": "zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7"
}

all_posts = []
total_pages = 2  # 可依實際頁數調整，根據 bPage 參數來判斷

for page_num in range(1, total_pages + 1):
    # 這裡使用 bPage 參數來控制樓層頁數
    url = f"{base_url}&bPage={page_num}"
    print(f"正在抓取網址: {url}") # 加入print檢查抓取的網址
    res = requests.get(url, headers=headers)
    res.encoding = "utf-8"
    soup = BeautifulSoup(res.text, "html.parser")

    # 嘗試抓樓層內容（新版巴哈用 class="c-article__content" 或 "c-article__content-inner"）
    # 經過檢查網頁原始碼，文章內容的 class 確實是 c-article__content 或 c-article__content-inner
    # 但可能需要更精確的選擇器來排除其他非文章樓層的內容。
    # 我們可以檢查每個找到的元素是否確實是文章樓層。
    posts = soup.select("div.c-article__content, div.c-article__content-inner")

    # 過濾掉可能是廣告或其他非文章內容的區塊 (如果需要)
    # 例如，可以檢查父元素是否有特定的 class 來判斷是否是文章樓層
    # 這裡暫時不加過濾，先看抓取結果

    all_posts.extend([post for post in posts if post.get_text(strip=True)]) #只加入非空內容
    print(f"已從第 {page_num} 頁抓取到 {len(posts)} 個可能的文章區塊。") # 修改print訊息

print(f"總共抓取到 {len(all_posts)} 篇文章。")

正在抓取網址: https://forum.gamer.com.tw/C.php?bsn=33651&snA=8402&tnum=27&bPage=1
已從第 1 頁抓取到 0 個可能的文章區塊。
正在抓取網址: https://forum.gamer.com.tw/C.php?bsn=33651&snA=8402&tnum=27&bPage=2
已從第 2 頁抓取到 0 個可能的文章區塊。
總共抓取到 0 篇文章。


In [None]:
!pip install gspread
from google.colab import auth
import gspread
from google.auth import default

auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

# 連結你的試算表
sheet_url = "https://docs.google.com/spreadsheets/d/1MMbLQMGk0ZqohkZP-n5TTVvPfH4xwU5COzOUtABdVKY/edit?usp=sharing"
sh = gc.open_by_url(sheet_url)
ws = sh.sheet1



In [None]:
# 确保 posts 变量包含您要写入的数据
# posts = soup.select("div.c-article__content") # 假设 posts 变量已经从之前的步骤中获取

# 准备要写入的数据
data_to_write = []
# 修改这里，使用 all_posts 变量
for i, post in enumerate(all_posts, start=1):
    content = post.get_text(strip=True)
    data_to_write.append([f"第{i}樓文章：", content])

# 将数据写入 Google Sheet
# 从第2行开始写入数据，因为第一行可能是标题
start_row = 2
for row_index, row_data in enumerate(data_to_write):
    # 计算当前行号
    current_row = start_row + row_index
    # 更新整行数据
    ws.update(range_name=f'A{current_row}:B{current_row}', values=[row_data])

print("数据已成功写入 Google Sheet！")

数据已成功写入 Google Sheet！


In [None]:
# ---------------- Gemini 設定 ----------------
from google.colab import userdata
import google.generativeai as genai # 导入 genai 库
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)
gemini_model = genai.GenerativeModel('models/gemini-flash-latest')

In [None]:
# Select the first worksheet
worksheet1 = sh.sheet1

# Read all data from the worksheet
all_data = worksheet1.get_all_values()

# Remove the header row assuming the first row is the header
if all_data:
    all_data = all_data[1:]

print(f"Successfully read {len(all_data)} rows of data from the worksheet.")

Successfully read 25 rows of data from the worksheet.


In [None]:
prompt_prefix = "以下是攻略網站底下的各樓文章內容，請給我5句洞察摘要以及一段120字的總結\n\n"
combined_articles = ""

for row in all_data:
  # Assuming the article content is in the second column (index 1)
  if len(row) > 1:
    combined_articles += row[1] + "\n---\n" # Add a separator

gemini_input = prompt_prefix + combined_articles

print("Combined articles and prompt prefix created.")

Combined articles and prompt prefix created.


In [None]:
try:
    gemini_response = gemini_model.generate_content(gemini_input)
    print("Gemini API call successful.")
except Exception as e:
    print(f"Error calling Gemini API: {e}")
    gemini_response = None # Set response to None in case of error

Gemini API call successful.


In [None]:
if gemini_response:
  response_text = gemini_response.text

  # Assuming Gemini separates the sections with specific markers or headings.
  # This is a placeholder and needs to be adjusted based on actual Gemini output format.
  # For example, if Gemini uses "洞察摘要:" and "总结:"
  insight_marker = "洞察摘要:"
  summary_marker = "总结:"

  insight_summary = ""
  overall_summary = ""

  if insight_marker in response_text:
      insight_start = response_text.find(insight_marker) + len(insight_marker)
      if summary_marker in response_text:
          summary_start = response_text.find(summary_marker)
          insight_summary = response_text[insight_start:summary_start].strip()
          overall_summary = response_text[summary_start + len(summary_marker):].strip()
      else:
          # If no summary marker, assume the rest is insight summary
          insight_summary = response_text[insight_start:].strip()
  elif summary_marker in response_text:
      # If no insight marker, assume the rest is overall summary
      overall_summary = response_text[response_text.find(summary_marker) + len(summary_marker):].strip()
  else:
      # If no markers found, treat the whole response as overall summary
      overall_summary = response_text.strip()

  print("Successfully extracted insight summary and overall summary.")
  print("Insight Summary:", insight_summary)
  print("Overall Summary:", overall_summary)
else:
  print("No Gemini response available to extract summaries.")

Successfully extracted insight summary and overall summary.
Insight Summary: 
Overall Summary: **5句洞察摘要**

1. **極端困難與高挫折感：** 保全派駐初期設計極度硬核，要求無失敗通關長達兩小時才能獲得隱藏獎勵，導致玩家社群普遍認為其是耗費生命且設計不良的內容。
2. **核心策略：牌組純化：** 成功依賴於精簡初始隊伍至單一輸出核心（如澄閃或鈴蘭）和低費輔助/零件，目的是在前期回合內快速疊滿核心幹員的五層戰術裝備。
3. **主流核心角色確立：** 模式中最有效的核心幹員是能夠提供全圖法傷（澄閃）、極致緩速或永控（鈴蘭、靈知、異客）以及自我續航站場（泥岩、水月）的角色。
4. **難度集中於後半段：** 流程的挑戰集中在LT-6、LT-7和LT-8，這些關卡要求嚴苛的火力檢定，並引入 Boss 專屬的牌組干擾或複雜的阻擋機制。
5. **官方雖有改版但評價仍低：** 儘管官方持續透過增加新裝置、調整難度並最終加入掃蕩功能，試圖緩解痛苦，但模式的重複性和對特定高練度角色的依賴性，使多數玩家仍選擇直接抄襲攻略以快速完成任務。

---

**總結 (120字)**

保全派駐因其初始的長時間、無失敗限制以及隱藏章獲取條件，被玩家視為一項高壓且耗時的挑戰。通關策略核心在於純化牌組，利用低費輔助迅速堆疊指定核心幹員（如澄閃、鈴蘭、泥岩）的戰術裝備。流程難點集中於後半段的火力檢定及Boss戰的牌組干擾機制。模式雖經歷多次更新，增加了多樣化裝置並最終提供掃蕩，但其本質上的重複作業感，導致玩家普遍採取抄作業的速通方式，以應付周期性開放的模組獎勵獲取需求。


In [None]:
# Select the second worksheet
worksheet2 = sh.get_worksheet(1) # get_worksheet(1) gets the second worksheet (0-indexed)

# Prepare the data to be written
data_to_write_ws2 = [['洞察摘要', '总结'], [insight_summary, overall_summary]]

# Clear any existing content in the second worksheet
worksheet2.clear()

# Write the prepared data to the second worksheet
worksheet2.update(range_name='A1', values=data_to_write_ws2)

print("数据已成功写入 Google Sheet 的工作表 2！")

数据已成功写入 Google Sheet 的工作表 2！


In [None]:
# 安裝中文斷詞與TF-IDF套件
!pip install jieba scikit-learn gspread oauth2client > /dev/null

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
import gspread
from google.colab import auth
from google.auth import default

# ==================== Google Sheet 驗證 ====================
auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

# 你的試算表網址
sheet_url = "https://docs.google.com/spreadsheets/d/1MMbLQMGk0ZqohkZP-n5TTVvPfH4xwU5COzOUtABdVKY/edit?usp=sharing"
sh = gc.open_by_url(sheet_url)

worksheet1 = sh.worksheet("工作表1")
worksheet3 = sh.worksheet("工作表3")

# ==================== 中文斷詞 + TF-IDF 熱詞分析 ====================
def tfidf_analysis_to_sheet(top_n=20):
    """
    從工作表1的B欄讀取文章內容，進行TF-IDF分析，將結果寫入工作表3
    top_n: 取前N個熱詞
    """
    try:
        # 1. 讀取工作表1所有資料
        all_values = worksheet1.get_all_values()
        if len(all_values) < 1:
            return "工作表1沒有資料可以分析。"

        # 2. 只取B欄（索引1）
        texts = [row[1] for row in all_values if len(row) > 1 and row[1].strip()]
        if not texts:
            return "文章內容為空，無法分析。"

        # 3. 中文斷詞處理
        texts_cut = [" ".join(jieba.cut(text)) for text in texts]

        # 4. 使用 TF-IDF 計算
        vectorizer = TfidfVectorizer(max_features=1000)
        tfidf_matrix = vectorizer.fit_transform(texts_cut)
        feature_names = vectorizer.get_feature_names_out()

        # 5. 計算每個詞的總分
        tfidf_sum = tfidf_matrix.sum(axis=0)
        tfidf_scores = [(word, tfidf_sum[0, idx]) for idx, word in enumerate(feature_names)]
        tfidf_scores.sort(key=lambda x: x[1], reverse=True)

        # 6. 取前 N 熱詞
        top_words = tfidf_scores[:top_n]

        # 7. 寫入 Google Sheet 工作表3
        worksheet3.clear()
        worksheet3.append_row(["詞語", "TF-IDF 分數"])
        for word, score in top_words:
            worksheet3.append_row([word, round(float(score), 6)])

        return f"TF-IDF 熱詞分析完成，前 {top_n} 熱詞已寫入『工作表3』。"

    except Exception as e:
        return f"分析過程中發生錯誤: {e}"

# ==================== 執行分析 ====================
print(tfidf_analysis_to_sheet(top_n=30))

  re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U)
  re_skip_default = re.compile("(\r\n|\s)", re.U)
  re_skip = re.compile("([a-zA-Z0-9]+(?:\.\d+)?%?)")
Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.838 seconds.
DEBUG:jieba:Loading model cost 0.838 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


TF-IDF 熱詞分析完成，前 30 熱詞已寫入『工作表3』。
