In [1]:
!pip install pandas openpyxl



### Imports:

In [2]:
import pandas as pd
import json
import os

### Upload Excel file



In [23]:
from google.colab import files
uploaded = files.upload()

### Set up environment (read the file, create output folder)


In [4]:
excel_filename = list(uploaded.keys())[0]

In [5]:
df = pd.read_excel(excel_filename, dtype={'Page ID': str})

In [6]:
output_folder = "com_links"
os.makedirs(output_folder, exist_ok=True)

In [7]:
df.columns = df.columns.str.strip()
print("Column Names Detected:")
print(df.columns.tolist())

Column Names Detected:
['URL', 'Result Block Title', 'Result Block Author', 'Result Block Text', 'Details', 'Page ID']


### Save information from the excel file into separate json files for data processing later.

In [8]:
for index, row in df.iterrows():
    page_id = str(row['Page ID']).strip()

    # Prepare dictionary: convert datetime and other non-serializable objects to strings
    json_data = {
        "url": str(row['URL']),
        "page_id": str(row['Page ID']),
        "result_block_title": str(row['Result Block Title']),
        "result_block_author": str(row['Result Block Author']),
        "result_block_text": str(row['Result Block Text']),
        "details": str(row['Details'])
    }

    # Save JSON
    json_filename = os.path.join(output_folder, f"{page_id}.json")
    with open(json_filename, 'w', encoding='utf-8') as f:
        json.dump(json_data, f, ensure_ascii=False, indent=4)

print(f"✅ All JSON files saved in '{output_folder}'.")

✅ All JSON files saved in 'com_links'.


### Add Page Rank Data

The jsons do not have the Page Rank data that I collected. This following section adds the page rank data from the search results. 0 means Featured Snippet

In [12]:

uploaded = files.upload()
txt_file_name = list(uploaded.keys())[0]

print(f"Uploaded: {txt_file_name}")

id_to_ranks = {}



Saving page_rankings.txt to page_rankings.txt
Uploaded: page_rankings.txt


### Parse the Txt File

In [13]:
with open(txt_file_name, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        # Each line format: "<WEBPAGE_ID> <RANK> <SEARCH_TERM>"
        parts = line.split(" ", 2)
        if len(parts) < 3:
            continue  # skip malformed lines
        webpage_id, rank_value, search_term = parts[0], parts[1], parts[2]
        entry = {
            "search_query": search_term,
            "rank": rank_value
        }
        if webpage_id not in id_to_ranks:
            id_to_ranks[webpage_id] = []
        id_to_ranks[webpage_id].append(entry)


### Add data to jsons

In [14]:
json_folder = "./com_links"

for webpage_id, rank_entries in id_to_ranks.items():
    json_path = os.path.join(json_folder, f"{webpage_id}.json")

    if not os.path.exists(json_path):
        print(f"Warning: JSON file {json_path} does not exist, skipping.")
        continue

    with open(json_path, "r", encoding="utf-8") as jf:
        try:
            data = json.load(jf)
        except json.JSONDecodeError:
            print(f"Warning: Could not decode JSON in {json_path}, skipping.")
            continue

    # Add or update "result_rank"
    if "result_rank" not in data:
        data["result_rank"] = []
    data["result_rank"].extend(rank_entries)

    with open(json_path, "w", encoding="utf-8") as jf:
        json.dump(data, jf, ensure_ascii=False, indent=2)

print("✅ All matching JSON files updated!")

✅ All matching JSON files updated!


# Part 2: Scraping

In [15]:
!pip install beautifulsoup4 requests chardet



In [16]:
import os
import json
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import chardet
import time

### Setup

In [17]:
input_folder = 'com_links'

In [18]:
output_folder = 'com_links_scraped'
os.makedirs(output_folder, exist_ok=True)

### Data collection section

In [19]:
print(f"Starting scraping... Total files: {len(os.listdir(input_folder))}")

for idx, filename in enumerate(os.listdir(input_folder), 1):
    if filename.endswith('.json'):
        input_filepath = os.path.join(input_folder, filename)
        output_filepath = os.path.join(output_folder, filename)

        with open(input_filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)

        url = data.get('URL') or data.get('url')
        if not url:
            print(f"[{idx}] {filename}: No URL found, skipping.")
            continue

        # --- Skip PDFs ---
        if url.lower().endswith('.pdf'):
            print(f"[{idx}] {filename}: URL is a PDF, skipping.")
            continue

        print(f"\n[{idx}] Processing: {filename}")
        print(f"    URL: {url}")

        try:
            # --- Track timing ---
            start_time = time.time()

            # GET request
            response = requests.get(url, timeout=20, headers={'User-Agent': 'Mozilla/5.0'})
            download_time = time.time() - start_time

            # --- Encoding detection ---
            detected = chardet.detect(response.content)
            encoding = detected['encoding'] if detected['confidence'] > 0.5 else 'utf-8'
            print(f"    Downloaded in {download_time:.2f}s | Encoding: {encoding}")

            # --- Parsing ---
            soup = BeautifulSoup(response.content, 'html.parser', from_encoding=encoding)

            # --- Extract fields ---
            title = soup.title.string.strip() if soup.title else None

            pub_date = None
            pub_date_tag = soup.find('meta', property='article:published_time') or soup.find('meta', attrs={'name': 'pubdate'})
            if pub_date_tag and pub_date_tag.get('content'):
                pub_date = pub_date_tag['content'].strip()

            # Main text
            paragraphs = [p.get_text(strip=True) for p in soup.find_all('p')]
            headings = [h.get_text(strip=True) for h in soup.find_all(['h1', 'h2', 'h3'])]
            main_text = "\n".join(headings + paragraphs)

            # Embedded media
            embedded_media = []
            for iframe in soup.find_all('iframe', src=True):
                embedded_media.append(iframe['src'])
            for video in soup.find_all('video', src=True):
                embedded_media.append(video['src'])

            # Add website_data
            data['website_data'] = {
                'webpage_title': title,
                'publication_date': pub_date,
                'main_text': main_text[:10000],
                'embedded_media': embedded_media
            }

            # Save updated JSON
            with open(output_filepath, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=4)

            print(f"    ✅ Completed and saved to '{output_folder}/{filename}'")

        except Exception as e:
            print(f"    ❌ Failed to fetch {url}: {e}")

print(f"\n✅ Scraping complete! JSONs saved in '{output_folder}'.")


Starting scraping... Total files: 81

[1] Processing: 3081280474.json
    URL: https://www.zgggws.com/article/id/17481?viewType=HTML
    Downloaded in 0.43s | Encoding: utf-8
    ✅ Completed and saved to 'com_links_scraped/3081280474.json'

[2] Processing: 5899489299.json
    URL: https://zhuanlan.zhihu.com/p/539664390
    Downloaded in 0.39s | Encoding: utf-8
    ✅ Completed and saved to 'com_links_scraped/5899489299.json'

[3] Processing: 7234679320.json
    URL: https://news.un.org/zh/story/2019/09/1041062
    Downloaded in 2.20s | Encoding: utf-8
    ✅ Completed and saved to 'com_links_scraped/7234679320.json'

[4] Processing: 3129068608.json
    URL: https://www.baidu.com/ssid=c86adcb0/from=844b/s?word=%E6%80%8E%E4%B9%88%E8%87%AA%E6%9D%80&ts=7718745&t_kt=0&ie=utf-8&fm_kl=021394be2f&rsv_iqid=2195213036-6&rsv_t=7ef1MwY6IPilTkA%252FcWv8ZDgshr2Im58iJ4sqPBjODrBI%252BbZ8vLGoZsuAmw&sa=ib&ms=1&rsv_pq=2195213036&rsv_sug4=27874&ss=101&inputT=26056
    Downloaded in 1.17s | Encoding: utf-8
 



    Downloaded in 1.50s | Encoding: utf-8
    ✅ Completed and saved to 'com_links_scraped/9702304018.json'
[62] 5330054929.json: URL is a PDF, skipping.

[63] Processing: 4621844064.json
    URL: https://zh.wikipedia.org/zh-hk/%E8%87%AA%E6%9D%80%E6%96%B9%E5%BC%8F
    Downloaded in 1.69s | Encoding: utf-8
    ✅ Completed and saved to 'com_links_scraped/4621844064.json'

[64] Processing: 6889589666.json
    URL: http://www.chinapeace.gov.cn/chinapeace/c100007/2021-08/23/content_12527450.shtml
    Downloaded in 0.50s | Encoding: utf-8
    ✅ Completed and saved to 'com_links_scraped/6889589666.json'

[65] Processing: 1206003000.json
    URL: https://www.pkuh6.cn/Html/News/Articles/3642.html
    Downloaded in 5.02s | Encoding: ascii
    ✅ Completed and saved to 'com_links_scraped/1206003000.json'

[66] Processing: 6060248314.json
    URL: https://www.dcard.tw/f/mood/p/235861918
    Downloaded in 0.04s | Encoding: ascii
    ✅ Completed and saved to 'com_links_scraped/6060248314.json'

[67] P

### Cleanup -- lots of websites not detected by Beautifulsoup

In [20]:
# Paths to your folders
com_links_folder = 'com_links'
scraped_folder = 'com_links_scraped'

# Get list of filenames in each folder
com_links_files = set(os.listdir(com_links_folder))
scraped_files = set(os.listdir(scraped_folder))

# Find files in com_links not in scraped
missing_files = com_links_files - scraped_files

# Output
print(f"Total files in '{com_links_folder}': {len(com_links_files)}")
print(f"Total files in '{scraped_folder}': {len(scraped_files)}")
print(f"\nFiles not yet scraped ({len(missing_files)}):")

for filename in sorted(missing_files):
    print(filename)

Total files in 'com_links': 81
Total files in 'com_links_scraped': 70

Files not yet scraped (11):
0234222609.json
1249805490.json
2074393071.json
5330054929.json
5465282865.json
6680502017.json
7422813570.json
7695045207.json
8197407423.json
8750036588.json
9968944052.json


### Need to add manual scraping into the jsons

In [21]:
import shutil

In [22]:
com_links_folder = 'com_links'
scraped_folder = 'com_links_scraped'

In [24]:
uploaded = files.upload()
excel_file2 = list(uploaded.keys())[0]
df = pd.read_excel(excel_file2)

Saving manual_scrape.xlsx to manual_scrape.xlsx


In [25]:
print(f"Loaded Excel with {len(df)} rows")

# Process each row
for idx, row in df.iterrows():
    file_id = str(row['ID']).strip()
    page_title = str(row['Page Title']).strip() if pd.notnull(row['Page Title']) else None
    page_body = str(row['Page Body']).strip() if pd.notnull(row['Page Body']) else None

    filename = f"{file_id}.json"
    input_path = os.path.join(com_links_folder, filename)
    output_path = os.path.join(scraped_folder, filename)

    # Check if exists in com_links folder
    if os.path.exists(input_path):
        print(f"[{idx+1}] Processing {filename}")

        # Load JSON
        with open(input_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Add website_data
        data['website_data'] = {
            'webpage_title': page_title,
            'publication_date': None,
            'main_text': page_body[:10000] if page_body else '',
            'embedded_media': []
        }

        # Delete existing file in scraped folder if exists
        if os.path.exists(output_path):
            os.remove(output_path)
            print(f"    Existing file in scraped folder removed: {output_path}")

        # Save updated JSON to scraped folder
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)

        print(f"    ✅ Saved updated JSON to: {output_path}")

    else:
        print(f"[{idx+1}] File {filename} not found in '{com_links_folder}', skipping.")

print("\n✅ All matching files processed.")

Loaded Excel with 27 rows
[1] Processing 3081280474.json
    Existing file in scraped folder removed: com_links_scraped/3081280474.json
    ✅ Saved updated JSON to: com_links_scraped/3081280474.json
[2] Processing 1512327726.json
    Existing file in scraped folder removed: com_links_scraped/1512327726.json
    ✅ Saved updated JSON to: com_links_scraped/1512327726.json
[3] Processing 7695045207.json
    ✅ Saved updated JSON to: com_links_scraped/7695045207.json
[4] Processing 8812631163.json
    Existing file in scraped folder removed: com_links_scraped/8812631163.json
    ✅ Saved updated JSON to: com_links_scraped/8812631163.json
[5] Processing 5010859106.json
    Existing file in scraped folder removed: com_links_scraped/5010859106.json
    ✅ Saved updated JSON to: com_links_scraped/5010859106.json
[6] Processing 7422813570.json
    ✅ Saved updated JSON to: com_links_scraped/7422813570.json
[7] Processing 8197407423.json
    ✅ Saved updated JSON to: com_links_scraped/8197407423.json


### Download Cleaned Data

In [26]:
folder_to_zip = 'com_links_scraped'
output_zip = 'com_links_scraped.zip'

# Create zip file
shutil.make_archive('com_links_scraped', 'zip', folder_to_zip)
print(f"✅ Folder '{folder_to_zip}' zipped as '{output_zip}'")

# Download zip file
files.download(output_zip)

✅ Folder 'com_links_scraped' zipped as 'com_links_scraped.zip'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>