<a href="https://colab.research.google.com/github/kusasyu36/ai-roadmap-2025/blob/main/project_w1/nb_project_w1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# == Replace with your own GitHub username ==
GITHUB_USER = "kusasyu36"

# Fetch the utils module from its raw GitHub URL.
# IPython substitutes {GITHUB_USER} with the Python variable before running the shell command;
# -q silences wget's progress output and -O fixes the local file name.
!wget -q -O utils_day5.py https://raw.githubusercontent.com/{GITHUB_USER}/ai-roadmap-2025/main/week01_python_basics/utils_day5.py
!ls -l utils_day5.py

# Import the module and run a quick sanity check.
# reload() re-executes the module, so re-running this cell picks up a freshly downloaded file.
import importlib, utils_day5 as U
importlib.reload(U)

assert U.is_palindrome("Never odd or even")
assert U.ngram("TOKYO", 2)[:3] == ["TO", "OK", "KY"]  # example of character 2-grams
print("utils imported OK")

-rw-r--r-- 1 root root 3196 Nov  4 08:49 utils_day5.py
utils imported OK


In [2]:
# kaggle.json を所定の場所へ
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Kaggle CLI
!pip -q install kaggle
!kaggle --version

Kaggle API 1.7.4.5


In [3]:
# Download the SMS Spam dataset.
# -d selects the Kaggle dataset slug, -p the local download directory.
!kaggle datasets download -d uciml/sms-spam-collection-dataset -p ./data
# -o overwrites existing files without prompting, so the cell can be re-run non-interactively.
!unzip -o ./data/sms-spam-collection-dataset.zip -d ./data
!ls -l ./data

Dataset URL: https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset
License(s): unknown
Downloading sms-spam-collection-dataset.zip to ./data
  0% 0.00/211k [00:00<?, ?B/s]
100% 211k/211k [00:00<00:00, 362MB/s]
Archive:  ./data/sms-spam-collection-dataset.zip
  inflating: ./data/spam.csv         
total 704
-rw-r--r-- 1 root root 215934 Sep 20  2019 sms-spam-collection-dataset.zip
-rw-r--r-- 1 root root 503663 Sep 20  2019 spam.csv


In [4]:
import csv

# Parallel lists: labels[i] is the label of texts[i].
texts = []
labels = []

# SMS data is often latin-1 (ISO-8859-1) encoded; latin-1 also maps every
# possible byte to a character, so decoding cannot raise.
with open('./data/spam.csv', 'r', encoding='latin-1', newline='') as f:
    reader = csv.reader(f)
    # Default to an empty header so an empty file does not raise StopIteration.
    header = next(reader, [])
    # 'v1' = label, 'v2' = message
    try:
        label_idx = header.index('v1')
        text_idx = header.index('v2')
    except ValueError:
        # If the header is malformed, fall back to assuming v1=0, v2=1.
        # NOTE: the first line is still consumed as a header in that case.
        label_idx, text_idx = 0, 1

    # A row is usable only if it contains BOTH columns; checking text_idx alone
    # would raise IndexError whenever the label column sits to its right.
    min_cols = max(label_idx, text_idx) + 1
    for row in reader:
        if len(row) >= min_cols:
            labels.append(row[label_idx])
            texts.append(row[text_idx])

print("samples:", len(texts))
# Guard the preview: texts[0] would raise IndexError if nothing was loaded.
if texts:
    print("first:", texts[0][:60], "...")

samples: 5572
first: Go until jurong point, crazy.. Available only in bugis n gre ...


In [5]:
from collections import Counter
import time

N = min(2000, len(texts))   # cap the sample size to keep the run light
sample = texts[:N]

# 4-1) Apply word_count_en to every sampled message and build the frequent-word ranking.
# Only this loop is timed; the 2-gram and palindrome passes below are not.
t0 = time.perf_counter()
word_total = Counter()
for s in sample:
    word_total.update(U.word_count_en(s))   # merges the per-message dict[str, int] counts
t1 = time.perf_counter()

top20 = word_total.most_common(20)

# 4-2) Tally character 2-grams.
# Counter.update on a list adds one per element, replacing the manual += 1 loop.
ng_total = Counter()
for s in sample:
    ng_total.update(U.ngram(s, 2))
top10_ng = ng_total.most_common(10)

# 4-3) Share of palindrome-like lines (rough check).
pal_count = sum(1 for s in sample if U.is_palindrome(s))
# Guard against an empty sample, which would otherwise raise ZeroDivisionError.
pal_ratio = pal_count / len(sample) if sample else 0.0

elapsed_ms = (t1 - t0) * 1000

print("Top20 words:", top20[:5], "...")
print("Top10 char-2gram:", top10_ng[:5], "...")
print(f"Palindrome-like lines: {pal_count}/{len(sample)} ({pal_ratio:.2%})")
print(f"Counting time: {elapsed_ms:.1f} ms for {len(sample)} lines")

Top20 words: [('i', 858), ('to', 821), ('you', 821), ('a', 508), ('the', 471)] ...
Top10 char-2gram: [('e ', 3785), ('t ', 2954), (' t', 2768), (' a', 2194), ('s ', 2129)] ...
Palindrome-like lines: 0/2000 (0.00%)
Counting time: 50.0 ms for 2000 lines


In [6]:
# Sanity checks for the helpers in utils_day5. Each result is bound once so a
# failing assertion can report the value that was actually returned.

# word_count_en: checks that upper/lower case is unified, among other things
hello_counts = U.word_count_en("Hello hello!")
assert hello_counts["hello"] == 2, f"expected 'hello' counted twice, got {hello_counts!r}"

# ngram: number of character 2-grams (length 5 -> 4 items)
tokyo_bigrams = U.ngram("TOKYO", 2)
assert len(tokyo_bigrams) == 4, f"expected 4 bigrams for 'TOKYO', got {tokyo_bigrams!r}"

# palindrome: a typical example
assert U.is_palindrome("Never odd or even"), "'Never odd or even' should be a palindrome"

# grade: values near the thresholds
grade_90 = U.grade(90)
assert grade_90 == "A", f"grade(90) should be 'A', got {grade_90!r}"  # example (adjust to your implementation)
grade_59 = U.grade(59)
assert grade_59 != "A", f"grade(59) must not be 'A', got {grade_59!r}"

In [7]:
def measure_time(n):
    """Time word counting over the first ``n`` messages of ``texts``.

    Reads the notebook-level ``texts`` list and the ``U`` utils module; both
    must be defined before this function is called.

    Args:
        n: Number of leading messages to process. Values larger than
            ``len(texts)`` are capped; negative values are treated as 0.

    Returns:
        Elapsed time in milliseconds (float), measured with
        ``time.perf_counter`` around the counting loop only.
    """
    from collections import Counter
    import time

    # Clamp to [0, len(texts)]: a negative n would otherwise slice from the
    # end (texts[:-1] is "all but the last") and time the wrong sample.
    limit = max(0, min(n, len(texts)))
    batch = texts[:limit]

    start = time.perf_counter()
    totals = Counter()
    for message in batch:
        totals.update(U.word_count_en(message))
    stop = time.perf_counter()
    return (stop - start) * 1000

# Sample sizes to benchmark.
sizes = [100, 500, 1000, 2000]
# Pair every size with the value returned by measure_time for it, in order.
results = list(zip(sizes, map(measure_time, sizes)))
results

[(100, 1.533359999939421),
 (500, 7.823503999929926),
 (1000, 15.609421000021939),
 (2000, 26.27602699999443)]