# Address Matching: EDA & Normalization
Bu notebook, **adres veri seti** için keşifsel veri analizi (EDA) ve normalizasyon ön incelemelerini içerir.

**İçerik:**
1) Kurulum & Yol Ayarları
2) Veriyi Yükleme ve İlk Bakış
3) Karakter/Uzunluk İstatistikleri
4) Token Frekansları ve Kısaltmalar
5) İdari Token Kapsaması
6) Label Dağılımı
7) Yakın-Duplicate Keşfi (Jaccard 3-gram)
8) Normalizasyon Önizlemesi

> Not: Veri büyük olduğu için analizler varsayılan olarak bir örneklem üzerinde çalışır. Örneklem boyutunu `SAMPLE_SIZE` değişkeniyle ayarlayabilirsiniz; tüm veriyi kullanmak için `0` yapın.

In [None]:
# 1) Kurulum & Yol Ayarları
import os, re, math
import pandas as pd
import numpy as np

from collections import Counter
from typing import List, Dict

# Project paths
TRAIN_PATH = "/mnt/data/train.csv"   # adjust to your own directory
OUTPUT_DIR = "./artifacts/eda"       # relative to the notebook root
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Sampling
SAMPLE_SIZE = 200000  # for speed; set to 0 to use the full dataset
RANDOM_STATE = 42

# Helper modules
import sys
sys.path.append("./lib")  # lib/ directory next to this notebook
from normalize_address import normalize_address

print(pd.__version__)

In [None]:
# 2) Load the data and take a first look
df = pd.read_csv(TRAIN_PATH)

# Validate the schema with an explicit exception (an `assert` is stripped
# when Python runs with -O) and do it before sampling, so a wrong file fails
# fast instead of after the sampling work.
if not {"address", "label"}.issubset(df.columns):
    raise ValueError("train.csv 'address' ve 'label' sütunları içermeli.")

# SAMPLE_SIZE == 0 (or any value >= len(df)) means "keep every row".
if SAMPLE_SIZE and 0 < SAMPLE_SIZE < len(df):
    df = df.sample(SAMPLE_SIZE, random_state=RANDOM_STATE).reset_index(drop=True)

display(df.head(5))
print("Rows:", len(df))
print("Unique address:", df['address'].nunique())
print("Unique label:", df['label'].nunique())
print("Missing % (address, label):", df['address'].isna().mean()*100, df['label'].isna().mean()*100)

In [None]:
# 3) Karakter/Uzunluk İstatistikleri
def char_stats(series: pd.Series) -> pd.DataFrame:
    """Compute per-row character statistics for a Series of strings.

    Args:
        series: Series accessed through its ``.str`` accessor.

    Returns:
        A DataFrame aligned with ``series`` holding the string length
        (``len``), the counts of digits, letters, whitespace and punctuation
        (``digits``, ``alphas``, ``spaces``, ``puncts``) and each count
        divided by the length (``*_ratio``). Ratios that come out as
        infinite or NaN are reported as 0.
    """
    text = series.str
    # Column name -> (regex counted per row, name of the matching ratio column).
    count_specs = {
        "digits": (r"\d", "digit_ratio"),
        "alphas": (r"[A-Za-zÇĞİÖŞÜçğıöşü]", "alpha_ratio"),
        "spaces": (r"\s", "space_ratio"),
        "puncts": (r"[^\w\sçğıöşüÇĞİÖŞÜ]", "punct_ratio"),
    }

    columns = {"len": text.len()}
    for count_name, (pattern, _) in count_specs.items():
        columns[count_name] = text.count(pattern)

    # Ratios are appended after all raw counts to keep the column order.
    for count_name, (_, ratio_name) in count_specs.items():
        ratio = columns[count_name] / columns["len"]
        columns[ratio_name] = ratio.replace([np.inf, np.nan], 0)

    return pd.DataFrame(columns)

cstats = char_stats(df["address"])

# Compute the summary once so the table shown in the notebook and the CSV on
# disk contain the same percentiles (previously the CSV used the default set).
cstats_summary = cstats.describe(percentiles=[.05, .25, .5, .75, .95])
display(cstats_summary)
cstats_summary.to_csv(os.path.join(OUTPUT_DIR, "01_char_stats_describe.csv"))

# Keep up to 50 examples from each tail of the length distribution.
is_very_short = cstats["len"] <= 15
is_very_long = cstats["len"] >= 150
extreme = pd.concat([
    df.loc[is_very_short, ["address", "label"]].head(50),
    df.loc[is_very_long, ["address", "label"]].head(50),
])
extreme.to_csv(os.path.join(OUTPUT_DIR, "01b_extreme_lengths_samples.csv"), index=False)
print("Saved: 01_char_stats_describe.csv & 01b_extreme_lengths_samples.csv")

In [None]:
# 4) Token Frekansları ve Kısaltmalar
import re
# Runs of whitespace, collapsed to a single space during cleaning.
RE_WS = re.compile(r"\s+")
# Anything that is not a word character, whitespace, or one of "/ . , -"
# is treated as punctuation and blanked out during cleaning.
RE_PUNCT = re.compile(r"[^\w\sçğıöşüÇĞİÖŞÜ/.,-]")
# A token is a run of letters/digits (any script, underscore excluded) plus
# the in-token separators ". / -". Using the Unicode-aware class keeps this
# consistent with RE_PUNCT: letters such as "â", "î", "û" survive cleaning,
# so they must not split a word during tokenization.
RE_TOKEN = re.compile(r"(?:[^\W_]|[./-])+")

# Abbreviation / spelling variant -> canonical token.
ABBREV_MAP = {
    # neighbourhood
    "mah": "mahalle",
    "mh": "mahalle",
    "mah.": "mahalle",
    "mahallesi": "mahalle",
    # avenue
    "cad": "caddesi",
    "cd": "caddesi",
    "cad.": "caddesi",
    "cadd.": "caddesi",
    "caddesi": "caddesi",
    # street
    "sk": "sokak",
    "sk.": "sokak",
    "sok": "sokak",
    "sok.": "sokak",
    # boulevard
    "blv": "bulvarı",
    "blv.": "bulvarı",
    "bulv.": "bulvarı",
    "bulv": "bulvarı",
    "bulvari": "bulvarı",
    "bulvar": "bulvarı",
    # number / floor / flat / title
    "no": "no",
    "no.": "no",
    "kat.": "kat",
    "daire": "daire",
    "dr.": "doktor",
}

# Turkish upper-case I letters whose lower-case form differs from str.lower().
_TR_UPPER_I_TABLE = str.maketrans({"I": "ı", "İ": "i"})


def tr_lower(s: str) -> str:
    """Lower-case ``s`` using Turkish rules for the dotted and dotless I.

    "I" becomes "ı" and "İ" becomes "i" before the generic ``str.lower()``
    handles every other character.
    """
    return s.translate(_TR_UPPER_I_TABLE).lower()

def basic_clean(s: str) -> str:
    """Return a lower-cased, punctuation-free, whitespace-normalized string.

    The input is first converted with ``str()``, then lower-cased via
    ``tr_lower``. Every ``RE_PUNCT`` match is replaced by a space, runs
    matched by ``RE_WS`` are collapsed to one space, and the result is
    stripped at both ends.
    """
    lowered = tr_lower(str(s))
    without_punct = RE_PUNCT.sub(" ", lowered)
    return RE_WS.sub(" ", without_punct).strip()

def expand_abbrev(tokens):
    """Map each token to its canonical form from ``ABBREV_MAP``.

    A token is looked up as-is first; if that fails it is looked up again
    with leading/trailing dots removed. Tokens found in neither form are
    returned unchanged (dots included).

    Returns:
        A new list with one entry per input token, in the same order.
    """
    expanded = []
    for token in tokens:
        if token in ABBREV_MAP:
            expanded.append(ABBREV_MAP[token])
            continue
        undotted = token.strip(".")
        expanded.append(ABBREV_MAP.get(undotted, token))
    return expanded

def tokenize(s: str):
    """Split ``s`` into tokens.

    Returns:
        The list of all non-overlapping ``RE_TOKEN`` matches in ``s``, in
        the order they occur.
    """
    tokens = RE_TOKEN.findall(s)
    return tokens

from collections import Counter
cnt = Counter()
# Skip missing addresses: basic_clean() stringifies its input, so a NaN row
# would otherwise be counted as the token "nan".
for s in df["address"].dropna():
    cnt.update(expand_abbrev(tokenize(basic_clean(s))))

# Keep the 200 most frequent tokens for inspection and export.
top_tokens = pd.DataFrame(cnt.most_common(200), columns=["token", "freq"])
display(top_tokens.head(20))
top_tokens.to_csv(os.path.join(OUTPUT_DIR, "02_top_tokens.csv"), index=False)
print("Saved: 02_top_tokens.csv")

In [None]:
# 5) Administrative token coverage
ADMIN_TOKENS = {"mahalle","caddesi","sokak","bulvarı","no","kat"}

# Iterating a set of strings yields an order that can change between
# interpreter runs (string hash randomisation); sort once so the rows of the
# saved coverage table come out in the same order every time.
admin_order = sorted(ADMIN_TOKENS)

rows = []
for s in df["address"]:
    toks = set(expand_abbrev(tokenize(basic_clean(s))))
    # One boolean flag per administrative token for this address.
    rows.append({f"has_{t}": (t in toks) for t in admin_order})

# The mean of each boolean column is the share of addresses containing it.
cov = pd.DataFrame(rows).mean().rename("coverage").to_frame()
display(cov)
cov.to_csv(os.path.join(OUTPUT_DIR, "03_admin_token_coverage.csv"))
print("Saved: 03_admin_token_coverage.csv")

In [None]:
# 6) Label distribution
vc = df["label"].value_counts()

# Diversity = 1 - sum of squared label shares (Gini-impurity form).
label_shares = vc / vc.sum()
diversity = 1.0 - np.sum(label_shares ** 2)

# The 20 most frequent labels as a two-column table (label, count).
top_labels = vc.head(20).rename_axis("label").reset_index(name="count")

display(top_labels)
print("Label diversity ~", round(diversity,4))

top_labels_path = os.path.join(OUTPUT_DIR, "04_top_labels.csv")
top_labels.to_csv(top_labels_path, index=False)

diversity_path = os.path.join(OUTPUT_DIR, "04_label_diversity.txt")
with open(diversity_path, "w", encoding="utf-8") as f:
    f.write(f"Gini-like diversity (1=dağılım çeşitli): {diversity:.4f}\n")

print("Saved: 04_top_labels.csv / 04_label_diversity.txt")

In [None]:
# 7) Yakın-Duplicate Keşfi (Jaccard 3-gram)
def char_ngrams(s, n=3):
    """Return the set of character n-grams of the cleaned, space-free text.

    ``s`` is passed through ``basic_clean`` and all spaces are removed
    before slicing. Text shorter than ``n`` yields an empty set.
    """
    compact = basic_clean(s).replace(" ", "")
    # Number of valid start offsets; zero or negative gives an empty range.
    n_starts = len(compact) - n + 1
    return {compact[start:start + n] for start in range(n_starts)}

SAMPLE_FOR_DUP = min(20000, len(df))
DUP_WINDOW = 200  # each address is compared with the next DUP_WINDOW - 1 rows
thr = 0.90

# Missing values would be stringified to "nan" and match each other, and
# exact duplicates would flood the output with trivial 1.0 pairs, so both
# are removed before sampling.
dup_candidates = df["address"].dropna().drop_duplicates()
dup_sampled = dup_candidates.sample(
    min(SAMPLE_FOR_DUP, len(dup_candidates)), random_state=RANDOM_STATE
)

# A fixed-size window only finds near-duplicates if similar addresses sit
# next to each other. Sorting by the cleaned text brings addresses that share
# a prefix together; on a randomly ordered sample the window would mostly
# compare unrelated rows.
sample_series = pd.Series(sorted(dup_sampled, key=basic_clean), dtype=object)
grams = [char_ngrams(s, n=3) for s in sample_series]

pairs = []
for i, gi in enumerate(grams):
    if not gi:
        # No n-grams (text shorter than 3 chars): Jaccard is 0 with anything.
        continue
    for j in range(i + 1, min(i + DUP_WINDOW, len(grams))):
        gj = grams[j]
        inter = len(gi & gj)
        if not inter:
            continue
        # |A ∪ B| = |A| + |B| - |A ∩ B|, avoids building the union set.
        jacc = inter / (len(gi) + len(gj) - inter)
        if jacc >= thr:
            pairs.append((sample_series[i], sample_series[j], jacc))

dup_df = pd.DataFrame(pairs, columns=["addr_a","addr_b","jaccard"])
display(dup_df.head(20))
dup_df.head(200).to_csv(os.path.join(OUTPUT_DIR, "05_near_duplicates_samples.csv"), index=False)
print("Saved: 05_near_duplicates_samples.csv")

In [None]:
# 8) Normalization preview
# DataFrame.sample raises when asked for more rows than exist, so cap the
# preview size for small inputs or a small SAMPLE_SIZE.
PREVIEW_ROWS = min(200, len(df))
preview = df.sample(PREVIEW_ROWS, random_state=7).copy()

# Addresses are cast to str before being handed to normalize_address.
preview["address_norm"] = preview["address"].astype(str).apply(normalize_address)
display(preview.head(10)[["address","address_norm","label"]])
preview.to_csv(os.path.join(OUTPUT_DIR, "06_normalization_preview.csv"), index=False)
print("Saved: 06_normalization_preview.csv")