<a href="https://colab.research.google.com/github/jakalope/meiji-semantic-shift-analysis/blob/main/notebooks/meiji-download-to-drive.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Install datasets library if not already
!pip install -q datasets

from datasets import load_dataset
from google.colab import drive
import os
from pathlib import Path
import json

# Mount Google Drive at /content/drive so the paths below resolve.
drive.mount('/content/drive')

# Output locations on Drive; the 'meiji' subdirectory is created if missing.
BASE_DIR = Path('/content/drive/MyDrive/meiji-semantic-data')
MEIJI_DIR = BASE_DIR.joinpath('meiji')
MEIJI_DIR.mkdir(parents=True, exist_ok=True)

# Load the 'train' split of the dataset.
print("Loading globis-university/aozorabunko-clean...")
ds = load_dataset('globis-university/aozorabunko-clean', split='train')

# Optional: author names for a Meiji-era approximation (e.g. authors born
# before ~1900 or active in Meiji). Expand as needed.
meiji_authors_keywords = [
    '夏目漱石',
    '森鴎外',
    '二葉亭四迷',
    '尾崎紅葉',
    '樋口一葉',
    '国木田独歩',
    '徳冨蘆花',
    '泉鏡花',
    '島崎藤村',
    '田山花袋',
]

import re
import json

def meiji_strict(example):
    """Return True when a dataset row looks like a Meiji-era work.

    A row is accepted when its first-appearance note (``初出``) contains
    ``明治``, or when the author's death date (``没年月日``) begins with a
    four-digit year that is 1916 or earlier.

    Args:
        example: Mapping with a ``'meta'`` entry that is either a dict or a
            JSON string encoding one.

    Returns:
        bool: True if the row passes the filter, False otherwise.
    """
    raw_meta = example['meta']
    meta = json.loads(raw_meta) if isinstance(raw_meta, str) else raw_meta
    # A null meta, or a field that is present but null, must not crash the
    # filter; treat both the same as "field missing".
    meta = meta or {}
    first_pub = meta.get('初出') or ''
    death = meta.get('没年月日') or ''

    # The year is read from the leading YYYY of the death date. str() lets
    # non-string values (e.g. date objects or plain ints) through as well.
    year_match = re.match(r'\d{4}', str(death))
    if year_match and int(year_match.group(0)) <= 1916:
        return True
    return '明治' in first_pub

# Keep only the rows accepted by meiji_strict.
strict_filtered = ds.filter(meiji_strict)
print(f"Strict Meiji filtered: {len(strict_filtered)} examples")

# filtered_ds is the name used from here on. It previously came from a second
# filter pass over an undefined name (strict_meiji), which raises NameError on
# a fresh kernel; reusing the result above gives the same rows in one pass.
filtered_ds = strict_filtered
print(f"Strict filtered: {len(filtered_ds)} examples")

# Join every text into one string, separated by blank lines, and write it to
# a single file. (Splitting into multiple smaller files is not done here.)
texts = filtered_ds['text']
combined_text = '\n\n'.join(texts)

meiji_file = MEIJI_DIR / 'meiji_aozora_combined.txt'
with open(meiji_file, 'w', encoding='utf-8') as f:
    f.write(combined_text)

print(f"Saved Meiji texts to {meiji_file}")
print(f"Approximate characters: {len(combined_text):,}")
# Run your MeCab token counter on this file next

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading globis-university/aozorabunko-clean...


Filter:   0%|          | 0/16951 [00:00<?, ? examples/s]

Strict Meiji filtered: 1879 examples
Strict filtered: 1879 examples
Saved Meiji texts to /content/drive/MyDrive/meiji-semantic-data/meiji/meiji_aozora_combined.txt
Approximate characters: 32,895,183


In [7]:
import random
import json

# Draw up to 10 distinct row positions from the filtered dataset.
sample_size = min(10, len(filtered_ds))
random_indices = random.sample(range(len(filtered_ds)), sample_size)

print("Sample of 10 filtered entries (meta fields only):")
for idx in random_indices:
    example = filtered_ds[idx]

    # 'meta' may already be a dict; otherwise it is decoded from JSON.
    if isinstance(example['meta'], dict):
        meta = example['meta']
    else:
        meta = json.loads(example['meta'])

    # Build the report for this entry, then emit it in one go. The last line
    # shows the first 150 characters of the text to give a feel for the style.
    report = [
        f"\n--- Example {idx} ---",
        f"Title (作品名): {meta.get('作品名', 'N/A')}",
        f"Author: {meta.get('姓', '')} {meta.get('名', '')}",
        f"Birth/Death: {meta.get('生年月日', 'N/A')} / {meta.get('没年月日', 'N/A')}",
        f"First appearance (初出): {meta.get('初出', 'N/A')}",
        f"Character type (文字遣い種別): {meta.get('文字遣い種別', 'N/A')}",
        f"Work ID: {meta.get('作品ID', 'N/A')}",
        f"Text preview: {example['text'][:150]}...",
    ]
    print('\n'.join(report))

Sample of 10 filtered entries (meta fields only):

--- Example 1634 ---
Title (作品名): 世界怪談名作集
Author: マクドナルド ジョージ
Birth/Death: 1824-04-10 00:00:00 / 1905-09-18 00:00:00
First appearance (初出): 
Character type (文字遣い種別): 新字新仮名
Work ID: 043468
Text preview: 　　　　　　一

　コスモ・フォン・ウェルスタールはプラーグの大学生であった。
　彼は貴族の一門であるにもかかわらず、貧乏であった。そうして、貧乏より生ずるところの独立をみずから誇っていた。誰でも貧乏から逃がれることが出来なければ、むしろそれを誇りとするよりほかはないのである。彼は学生仲間に可愛がら...

--- Example 823 ---
Title (作品名): 手紙
Author: 坂本 竜馬
Birth/Death: 1836-01-03 / 1867-12-10
First appearance (初出): 
Character type (文字遣い種別): 新字旧仮名
Work ID: 051783
Text preview: 追白、溝淵広之丞よりさし出し候品ものハ中島作に相頼申候間、御受取可被遣候。彼広之丞誠に先生の御恩をかんじ実ニありがたがり居申候。

再拝〻。
一筆啓上仕候。益御安泰可被成御座候然ニ先頃ハ罷出段〻御セ話難有次第奉万謝候。其節溝淵広之丞ニ御申聞相願候事件を、同国の重役後藤庄次郎一〻相談候より余程夜の明候...

--- Example 815 ---
Title (作品名): 手紙
Author: 坂本 竜馬
Birth/Death: 1836-01-03 / 1867-12-10
First appearance (初出): 
Character type (文字遣い種別): 新字旧仮名
Work ID: 051442
Text preview: 拝啓候。
然ニ昨日鳥渡申上候彼騎銃色〻手を尽し候所、何分手ニ入かね候。先生の御力ニより候ハずバ外ニ術なく御願の為参上仕候。何卒御願申上候。彼筒の代金ハ三十一両より三十三両斗かと存候。うち