In [1]:
import pandas as pd
from pymongo import MongoClient
from IPython.display import display  # 用於在 Notebook 中更美觀地顯示 DataFrame

In [2]:
# MongoDB 連線資訊 (請替換成你的實際資訊)
mongo_uri = 'mongodb://localhost:27017/'  # 預設本機連線
database_name = 'mydatabase'         # 你的資料庫名稱
collection_name = 'reviews'        # 你的 Collection 名稱

try:
    client = MongoClient(mongo_uri)
    db = client[database_name]
    collection = db[collection_name]
    print(f"成功連線到 MongoDB 資料庫 '{database_name}'，Collection: '{collection_name}'")
except Exception as e:
    print(f"連線 MongoDB 失敗: {e}")
    exit()

成功連線到 MongoDB 資料庫 'mydatabase'，Collection: 'reviews'


In [3]:
print("\n--- 資料庫和 Collection 概覽 ---")

# 顯示資料庫名稱列表
database_names = client.list_database_names()
print("資料庫列表:", database_names)

# 顯示 Collection 名稱列表 (在當前資料庫中)
collection_names = db.list_collection_names()
print("Collection 列表:", collection_names)

# 顯示 Collection 文件數量
document_count = collection.count_documents({})
print(f"\nCollection '{collection_name}' 中的文件總數: {document_count}")


--- 資料庫和 Collection 概覽 ---
資料庫列表: ['admin', 'config', 'local', 'mydatabase']
Collection 列表: ['reviews']

Collection 'reviews' 中的文件總數: 568454


In [4]:
print("\n--- 檢視前 5 筆資料 (確認資料結構) ---")

# 從 MongoDB 查詢前 5 筆資料
cursor = collection.find().limit(5)
sample_data = list(cursor)

# 使用 pandas DataFrame 顯示，更易於閱讀
df_sample = pd.DataFrame(sample_data)
display(df_sample)


--- 檢視前 5 筆資料 (確認資料結構) ---


Unnamed: 0,_id,ProductId,ProductInfo,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Id
0,67d8e92ae0757cc683660037,B001E4KFG0,{'URL': 'https://www.amazon.com/dp/B001E4KFG0'},A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,1
1,67d8e92ae0757cc683660038,B00813GRG4,{'URL': 'https://www.amazon.com/dp/B00813GRG4'},A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,2
2,67d8e92ae0757cc683660039,B000LQOCH0,{'URL': 'https://www.amazon.com/dp/B000LQOCH0'},ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,3
3,67d8e92ae0757cc68366003a,B000UA0QIQ,{'URL': 'https://www.amazon.com/dp/B000UA0QIQ'},A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,4
4,67d8e92ae0757cc68366003b,B006K2ZZ7K,{'URL': 'https://www.amazon.com/dp/B006K2ZZ7K'},A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,5


In [5]:
print("\n--- 基本資料分析範例 ---")

# 5.1. 評分 (Score) 分布
print("\n5.1. 評分 (Score) 分布:")
score_counts = df_sample['Score'].value_counts().sort_index()
print(score_counts)

# 5.2. 平均評分
print(f"\n5.2. 平均評分: {df_sample['Score'].mean():.2f}")

# 5.3. 顯示評論最多的前 10 個 ProductId
print("\n5.3. 評論最多的前 10 個 ProductId:")
product_review_counts = df_sample['ProductId'].value_counts().nlargest(10)
print(product_review_counts)

# 5.4. 顯示評論最多的前 10 個 UserId
print("\n5.4. 評論最多的前 10 個 UserId:")
user_review_counts = df_sample['UserId'].value_counts().nlargest(10)
print(user_review_counts)


--- 基本資料分析範例 ---

5.1. 評分 (Score) 分布:
Score
1    1
2    1
4    1
5    2
Name: count, dtype: int64

5.2. 平均評分: 3.40

5.3. 評論最多的前 10 個 ProductId:
ProductId
B001E4KFG0    1
B00813GRG4    1
B000LQOCH0    1
B000UA0QIQ    1
B006K2ZZ7K    1
Name: count, dtype: int64

5.4. 評論最多的前 10 個 UserId:
UserId
A3SGXH7AUHU8GW    1
A1D87F6ZCVE5NK    1
ABXLMWJIXXAIN     1
A395BORC6FGVXV    1
A1UQRSCLF8GW1T    1
Name: count, dtype: int64
