In [None]:
!pip3 install pymongo

In [2]:
import configparser
import datetime
import json
import ssl
from urllib.parse import quote_plus

import pymongo
import requests
from bson.objectid import ObjectId
from lxml import etree

In [None]:
# Look up the public IP address of this Colab runtime so that it can be added
# to MongoDB Atlas under Security -> Network Access.
try:
    # Without a timeout, requests can wait indefinitely on an unresponsive host.
    response = requests.get('https://api.ipify.org', timeout=10)
except requests.RequestException as e:
    # Connection errors, DNS failures and timeouts all end up here.
    print('Failed to get public IP address:', e)
else:
    if response.status_code == 200:
        print('My public IP address is:', response.text.strip())
    else:
        print('Failed to get public IP address')

請在 Notebook 的工作目錄建立 `config.ini`，內容如下（`clustername` 請填入完整的叢集主機位址，例如 `cluster0.xxxxx.mongodb.net`）：

```
[mongodb]
username = <username>
password = <password>
clustername = <clustername>
```




In [None]:
# Load the external configuration file that holds the MongoDB settings.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so fail fast here instead of running into a
# NoSectionError in a later cell.
loadedConfigFiles = config.read('config.ini')
if not loadedConfigFiles:
    raise FileNotFoundError("config.ini was not found or could not be read")
# Last expression of the cell: display which files were loaded.
loadedConfigFiles

In [5]:
# Build the MongoDB connection URI from the [mongodb] section of config.ini.
username = config.get('mongodb', 'username')
password = config.get('mongodb', 'password')
clustername = config.get('mongodb', 'clustername')
# The user name and password must be percent-escaped before they are placed in
# the URI; otherwise reserved characters such as '@', ':', '/' or '%' in the
# credentials corrupt the connection string.
dbUri = (
    f"mongodb+srv://{quote_plus(username)}:{quote_plus(password)}"
    f"@{clustername}/?retryWrites=true&w=majority"
)


In [6]:
# Open the MongoDB client and take handles to the database and to the
# collection that holds the crawler source documents.
connection = pymongo.MongoClient(dbUri)
db = connection['dataScience']
sourceCollection = db['crawlerSource']

In [None]:
# Query every document in the source collection (an empty filter matches all)
# and print each one while walking the returned cursor.
cursor = sourceCollection.find({})
for document in cursor:
    print(document)

In [14]:
# Crawler Source Model
def initSourceModel(source):
  """Build the crawler-source document for ``source``.

  Args:
    source: Mapping with the keys ``name``, ``description``, ``domain``,
      ``schema`` and ``topics``. Each entry of ``topics`` is a mapping with
      the keys ``topic``, ``description`` and ``requestUrl``.

  Returns:
    A new dict with the source fields, one sub-dict per topic, and
    bookkeeping fields. ``createdAt`` (on the source and on every topic) is
    the current time in milliseconds wrapped as ``{"$numberLong": "<ms>"}``;
    the update/sync fields start as ``None`` and the availability flags as
    ``True``.

  Raises:
    KeyError: If ``source`` or one of its topics lacks a required key.
  """
  nowMs = int(datetime.datetime.now().timestamp() * 1000)

  def createdStamp():
    # Return a fresh wrapper each time so no two fields share one dict object.
    return {"$numberLong": str(nowMs)}

  topicEntries = [
      {
          "topic": entry["topic"],
          "description": entry["description"],
          "requestUrl": entry["requestUrl"],
          "createdAt": createdStamp(),
          "updatedAt": None,
          "lastSync": None,
          "available": True,
      }
      for entry in source["topics"]
  ]

  return {
      "name": source["name"],
      "description": source["description"],
      "topics": topicEntries,
      "sourceDomain": source["domain"],
      "crawlerSchema": source["schema"],
      "createdAt": createdStamp(),
      "updatedAt": None,
      "scheduleSync": None,
      "lastSync": None,
      "enabled": True,
  }

In [15]:
# Describe the crawler source and the single topic that is collected from it.
source = {
    "name": "天下雜誌",
    "description": "天下雜誌，是台灣第一本專業的新聞財經雜誌。每日精選財經、國際、管理、教育、經濟學人、評論、時尚；互動圖表、影音等多媒體報導，深入解讀世界脈動，掌握前瞻觀念。",
    "topics": [
        {
            "topic": "美國銀行",
            "description": "因為美國矽谷銀行倒閉事件，欲觀察美國 Nasdaq 指數與該事件之關係，使用 Google 新聞搜尋關鍵字【美國銀行】，並指定網站，取得天下雜誌關於列表。",
            "requestUrl": "https://www.google.com/search?q=%E7%BE%8E%E5%9C%8B%E9%8A%80%E8%A1%8C+site:cw.com.tw&tbm=nws",
        },
    ],
    "domain": "www.cw.com.tw",
    "schema": "",
}

# Map the description onto the field layout used by the source collection.
sourceModel = initSourceModel(source)

# Round-trip through JSON to obtain a copy made only of plain JSON types.
# NOTE(review): the standard json module does not interpret Extended JSON, so
# wrappers such as {"$numberLong": "..."} are passed on as nested dicts;
# bson.json_util.loads would be needed to turn them into BSON values -
# confirm which of the two is intended.
sourceDocument = json.loads(json.dumps(sourceModel))

# Insert the document and remember the generated id on `source` for the cells
# that follow; any failure is reported instead of raised.
try:
    sourceResult = sourceCollection.insert_one(sourceDocument)
    source["oid"] = sourceResult.inserted_id
    print("Document inserted with oid: ", source["oid"])
except Exception as e:
    print("Error inserting document: ", e)

JSON 檔案來源： https://github.com/jungyu/bigdata-scrapy/blob/master/1.BasicWithColab/3.Basic_Fetech_List_Articles.ipynb


In [25]:
# Load the crawled articles from the JSON file.
# The encoding is passed explicitly because open() otherwise falls back to the
# platform's locale encoding, which is not UTF-8 on every system and would
# break on the non-ASCII text in this file.
with open('天下雜誌_美國銀行.json', 'r', encoding='utf-8') as f:
    jsonData = json.load(f)

In [23]:
def jsonToDataModel(source, data):
  """Map one crawled article onto the crawlerData document layout.

  Args:
    source: Mapping that provides ``oid``, ``name`` and ``topic``.
    data: Mapping for one article with the keys ``link``, ``title``,
      ``content`` and ``image``. ``tags`` is optional and may be a list of
      tags or an already joined string. The mapping is not modified.

  Returns:
    A new dict for the crawlerData collection. ``sourceId`` is
    ``{"$oid": str(source["oid"])}``; ``createdAt`` and ``lastSync`` hold the
    current time in milliseconds wrapped as ``{"$numberLong": "<ms>"}``. The
    tags are stored under ``metas`` as one comma-separated string, or
    ``None`` when the article has no tags.

  Raises:
    KeyError: If ``source`` or ``data`` lacks one of the required keys.
  """
  timestamp = int(datetime.datetime.now().timestamp() * 1000)

  # Join the tags into one comma-separated string and keep the result in a
  # local variable, so the caller's dict is left untouched.
  tags = data.get("tags")
  if tags is not None and not isinstance(tags, str):
    tags = ','.join(str(tag) for tag in tags)

  dataModel = {
    "sourceId": {"$oid": str(source["oid"])},
    "sourceName": source["name"],
    "topic": source["topic"],
    "sourceUrl": data["link"],
    "sourceUpdatedAt": None,
    "sourceAvailable": True,
    "title": data["title"],
    "content": data["content"],
    "featuredImage": data["image"],
    "Images": None,
    "metas": [
      {"metaKey": "tags", "metaValue": tags, "available": True}
    ],
    "createdAt": {"$numberLong": str(timestamp)},
    "updatedAt": None,
    "lastSync": {"$numberLong": str(timestamp)},
    "available": True
  }

  return dataModel

In [None]:
# Write the crawled articles into the crawlerData collection.
dataCollection = db['crawlerData']

# jsonToDataModel reads the topic name from the source mapping.
source["topic"] = "美國銀行"

# Build one document per article; the JSON round trip leaves each document as
# a copy made only of plain JSON types.
dataDocuments = [
    json.loads(json.dumps(jsonToDataModel(source, data)))
    for data in jsonData
]

if not dataDocuments:
    # insert_many() rejects an empty list with a TypeError that the
    # BulkWriteError handler below would not catch.
    print('No documents to insert')
else:
    try:
        # Insert all documents with a single call.
        dataResult = dataCollection.insert_many(dataDocuments)
        print('Inserted documents:', dataResult.inserted_ids)
    except pymongo.errors.BulkWriteError as e:
        print('Error:', e.details['writeErrors'])

In [76]:
# Stamp the source document (matched by its _id) with the time of this sync
# by setting its lastSync field through update_one.
timestamp = int(datetime.datetime.now().timestamp() * 1000)
updateResult = sourceCollection.update_one(
    filter={'_id': source["oid"]},
    update={'$set': {'lastSync': {"$numberLong": str(timestamp)}}},
)

In [None]:
# Read back the crawlerData documents that belong to this source, returning
# only the title and content fields.
#
# jsonToDataModel builds sourceId as a nested dict {"$oid": "<id string>"} and
# the insert cell round-trips it through the plain json module, so it is not
# converted to a BSON ObjectId. The filter therefore addresses the nested
# "$oid" field with dot notation and compares it with the string form of the id.
try:
  findResult = dataCollection.find(
      {"sourceId.$oid": str(source["oid"])},
      {"_id": 0, "title": 1, "content": 1},
  )
  # find() only builds a cursor; the query is sent once the cursor is
  # iterated, so the loop has to sit inside the try block for errors to be
  # reported here.
  for r in findResult:
    print(r)
except Exception as e:
  print("Error:", e)

In [None]:
# Debug aid: show the runtime type of the id stored in source["oid"].
print(type(source["oid"]))

In [None]:
# Count the crawlerData documents that belong to this source.
# sourceId is built by jsonToDataModel as a nested dict {"$oid": "<id string>"},
# so the filter compares the nested "$oid" field with the string form of the id
# rather than with an ObjectId value.
# (To count the source document itself, query sourceCollection with
# {"_id": source["oid"]} instead.)
count = dataCollection.count_documents(
    {"sourceId.$oid": str(source["oid"])}
)
print("Total number of documents:", count)

In [None]:
# Remove every crawlerData document whose sourceId refers to this source.
try:
    sourceIdText = str(source["oid"])
    print(sourceIdText)
    deleteResult = dataCollection.delete_many({"sourceId.$oid": sourceIdText})
    print(deleteResult.deleted_count, "Data deleted")
except Exception as e:
    print("Error:", e)