# Address Matching: EDA & Normalization
Bu notebook, **adres veri seti** için keşifsel veri analizi (EDA) ve normalizasyon ön incelemelerini içerir.

**İçerik:**
1) Kurulum & Yol Ayarları
2) Veriyi Yükleme ve İlk Bakış
3) Karakter/Uzunluk İstatistikleri
4) Token Frekansları ve Kısaltmalar
5) İdari Token Kapsaması
6) Label Dağılımı
7) Yakın-Duplicate Keşfi (Jaccard n-gram)
8) Normalizasyon Önizlemesi

> Not: Büyük veri sebebiyle bazı analizler örnekleme ile çalışır. `SAMPLE_SIZE` değişkenini ayarlayabilirsiniz.

In [2]:
# 1) Kurulum & Yol Ayarları
import os, re, math
import pandas as pd
import numpy as np

from collections import Counter
from typing import List, Dict

# Proje yolları
TRAIN_PATH = "C:/Users/BUSRA/source/repos/address-hackathon/data/raw/train.csv"   # kendi dizinine göre değiştir
OUTPUT_DIR = "C:/Users/BUSRA/source/repos/address-hackathon/data/interim/train_eda.csv"       # notebook köküne göre
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Örnekleme
SAMPLE_SIZE = 200000  # hız için; tüm veri için 0 yap
RANDOM_STATE = 42

pd.set_option("display.max_colwidth", 200)
print("Pandas:", pd.__version__)

Pandas: 2.3.1


In [3]:
# 2) Veriyi Yükleme ve İlk Bakış
df = pd.read_csv(TRAIN_PATH)
if SAMPLE_SIZE and SAMPLE_SIZE > 0 and SAMPLE_SIZE < len(df):
    df = df.sample(SAMPLE_SIZE, random_state=RANDOM_STATE).reset_index(drop=True)

assert {"address","label"}.issubset(df.columns), "train.csv 'address' ve 'label' sütunları olmalı."

display(df.head(5))
print("Rows:", len(df))
print("Unique address:", df['address'].nunique())
print("Unique label:", df['label'].nunique())
print("Missing % (address, label):",
      df['address'].isna().mean()*100, df['label'].isna().mean()*100)

# hızlı kayıt
pd.DataFrame([{
    "rows": len(df),
    "unique_address": int(df["address"].nunique()),
    "unique_label": int(df["label"].nunique()),
    "missing_address_%": float(df["address"].isna().mean()*100),
    "missing_label_%": float(df["label"].isna().mean()*100),
}]).to_csv(f"{OUTPUT_DIR}/00_basic_info.csv", index=False)


Unnamed: 0,address,label
0,Mustafa Kemal mah.acı badem yolu no:54 kat 1,7079
1,Çamlıçay mah. 5208 sokak no:17,3565
2,cüneytbey mahallesi kuva i milliye cad .no 1-7c Menderes izmir,2678
3,KASIMPAŞA MH 250. SOKAK NO:45G,2709
4,Yenimahalle 4741 sokak no 9 daire 6 Yunusemre/Manisa,7842


Rows: 200000
Unique address: 199984
Unique label: 10390
Missing % (address, label): 0.0 0.0


In [4]:
# 3) Karakter/Uzunluk İstatistikleri
def char_stats(series: pd.Series) -> pd.DataFrame:
    lengths = series.str.len()
    digits = series.str.count(r"\d")
    alphas = series.str.count(r"[A-Za-zÇĞİÖŞÜçğıöşü]")
    spaces = series.str.count(r"\s")
    puncts = series.str.count(r"[^\w\sçğıöşüÇĞİÖŞÜ]")
    return pd.DataFrame({
        "len": lengths,
        "digits": digits,
        "alphas": alphas,
        "spaces": spaces,
        "puncts": puncts,
        "digit_ratio": (digits / lengths).replace([np.inf, np.nan], 0),
        "alpha_ratio": (alphas / lengths).replace([np.inf, np.nan], 0),
        "space_ratio": (spaces / lengths).replace([np.inf, np.nan], 0),
        "punct_ratio": (puncts / lengths).replace([np.inf, np.nan], 0),
    })

cstats = char_stats(df["address"])
display(cstats.describe(percentiles=[.05,.25,.5,.75,.95]))
cstats.describe().to_csv(f"{OUTPUT_DIR}/01_char_stats_describe.csv")

extreme = pd.concat([
    df.loc[cstats["len"] <= 15, ["address", "label"]].head(50),
    df.loc[cstats["len"] >= 150, ["address", "label"]].head(50),
])
extreme.to_csv(f"{OUTPUT_DIR}/01b_extreme_lengths_samples.csv", index=False)
print("Saved: 01_char_stats_describe.csv & 01b_extreme_lengths_samples.csv")


Unnamed: 0,len,digits,alphas,spaces,puncts,digit_ratio,alpha_ratio,space_ratio,punct_ratio
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0
mean,64.925225,5.31851,47.120435,9.82311,2.65859,0.094176,0.71009,0.152337,0.043316
std,24.714287,2.791414,20.478758,4.182452,2.203289,0.064724,0.093781,0.032638,0.035778
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5%,29.0,1.0,17.0,4.0,0.0,0.016393,0.542056,0.107692,0.0
25%,47.0,3.0,32.0,7.0,1.0,0.048193,0.662162,0.131148,0.015873
50%,62.0,5.0,45.0,9.0,2.0,0.083333,0.723404,0.148936,0.038462
75%,80.0,7.0,60.0,12.0,4.0,0.125,0.77451,0.169811,0.064103
95%,109.0,10.0,83.0,17.0,7.0,0.205882,0.831933,0.206897,0.108696
max,250.0,28.0,208.0,130.0,33.0,1.0,1.0,0.878378,0.8


Saved: 01_char_stats_describe.csv & 01b_extreme_lengths_samples.csv


In [5]:
# 4) Token Frekansları ve Kısaltmalar
RE_WS = re.compile(r"\s+")
RE_PUNCT = re.compile(r"[^\w\sçğıöşüÇĞİÖŞÜ/.,-]")
RE_TOKEN = re.compile(r"[A-Za-zÇĞİÖŞÜçğıöşü0-9./-]+")

ABBREV_MAP = {
    "mah": "mahalle","mh": "mahalle","mah.": "mahalle","mahallesi":"mahalle",
    "cad": "caddesi","cd":"caddesi","cad.":"caddesi","cadd.":"caddesi","caddesi":"caddesi",
    "sk":"sokak","sk.":"sokak","sok":"sokak","sok.":"sokak",
    "blv":"bulvarı","blv.":"bulvarı","bulv.":"bulvarı","bulv":"bulvarı","bulvari":"bulvarı","bulvar":"bulvarı",
    "no":"no","no.":"no","kat.":"kat","daire":"daire","dr.":"doktor"
}

def tr_lower(s: str) -> str:
    return s.replace("I","ı").replace("İ","i").lower()

def basic_clean(s: str) -> str:
    s = tr_lower(str(s))
    s = RE_PUNCT.sub(" ", s)
    s = RE_WS.sub(" ", s).strip()
    return s

def expand_abbrev(tokens):
    out = []
    for t in tokens:
        t0 = t.strip(".")
        out.append(ABBREV_MAP.get(t, ABBREV_MAP.get(t0, t)))
    return out

def tokenize(s: str):
    return RE_TOKEN.findall(s)

cnt = Counter()
for s in df["address"]:
    toks = expand_abbrev(tokenize(basic_clean(s)))
    cnt.update(toks)

top_tokens = pd.DataFrame(cnt.most_common(200), columns=["token","freq"])
display(top_tokens.head(20))
top_tokens.to_csv(f"{OUTPUT_DIR}/02_top_tokens.csv", index=False)
print("Saved: 02_top_tokens.csv")


Unnamed: 0,token,freq
0,mahalle,147093
1,no,147022
2,sokak,114421
3,daire,59365
4,izmir,58409
5,kat,50461
6,caddesi,44952
7,2,30486
8,1,29999
9,3,26387


Saved: 02_top_tokens.csv


In [6]:
# 5) İdari Token Kapsaması
ADMIN_TOKENS = {"mahalle","caddesi","sokak","bulvarı","no","kat"}

rows = []
for s in df["address"]:
    toks = set(expand_abbrev(tokenize(basic_clean(s))))
    rows.append({f"has_{t}": (t in toks) for t in ADMIN_TOKENS})

cov = pd.DataFrame(rows).mean().rename("coverage").to_frame()
display(cov)
cov.to_csv(f"{OUTPUT_DIR}/03_admin_token_coverage.csv")
print("Saved: 03_admin_token_coverage.csv")


Unnamed: 0,coverage
has_no,0.692305
has_caddesi,0.22287
has_mahalle,0.650465
has_kat,0.24506
has_sokak,0.56581
has_bulvarı,0.046365


Saved: 03_admin_token_coverage.csv


In [7]:
# 6) Label Dağılımı
vc = df["label"].value_counts()
top_labels = vc.head(20).rename_axis("label").reset_index(name="count")
diversity = 1.0 - np.sum((vc / vc.sum()) ** 2)  # Gini-benzeri çeşitlilik

display(top_labels)
print("Label diversity ~", round(diversity,4))

top_labels.to_csv(f"{OUTPUT_DIR}/04_top_labels.csv", index=False)
with open(f"{OUTPUT_DIR}/04_label_diversity.txt", "w", encoding="utf-8") as f:
    f.write(f"Gini-like diversity (1=dağılım çeşitli): {diversity:.4f}\n")
print("Saved: 04_top_labels.csv / 04_label_diversity.txt")


Unnamed: 0,label,count
0,5414,103
1,5591,102
2,6272,95
3,3656,92
4,5954,83
5,1636,83
6,1543,81
7,6126,76
8,1893,73
9,5746,73


Label diversity ~ 0.9999
Saved: 04_top_labels.csv / 04_label_diversity.txt


In [8]:
# 7) Yakın-Duplicate Keşfi (Jaccard 3-gram)
def char_ngrams(s, n=3):
    s = basic_clean(s).replace(" ","")
    return {s[i:i+n] for i in range(max(0, len(s)-n+1))}

SAMPLE_FOR_DUP = min(20_000, len(df))
sample_series = df["address"].sample(SAMPLE_FOR_DUP, random_state=RANDOM_STATE).reset_index(drop=True)
grams = [char_ngrams(s, n=3) for s in sample_series]

pairs = []
thr = 0.90        # daha az/güçlü için 0.85/0.95 ile oynayabilirsin
window = 200      # brute-force pencere; hız/derinlik dengesi
for i in range(len(sample_series)):
    gi = grams[i]
    for j in range(i+1, min(i+window, len(sample_series))):
        gj = grams[j]
        inter = len(gi & gj); uni = len(gi | gj)
        jacc = inter/uni if uni else 0.0
        if jacc >= thr:
            pairs.append((sample_series[i], sample_series[j], jacc))

dup_df = pd.DataFrame(pairs, columns=["addr_a","addr_b","jaccard"])
display(dup_df.head(20))
dup_df.head(200).to_csv(f"{OUTPUT_DIR}/05_near_duplicates_samples.csv", index=False)
print("Saved: 05_near_duplicates_samples.csv")


Unnamed: 0,addr_a,addr_b,jaccard


Saved: 05_near_duplicates_samples.csv


In [9]:
# 8) Normalizasyon Fonksiyonu (inline) + Önizleme
import unicodedata

RE_WS2 = re.compile(r"\s+")
RE_PUNCT_SAFE2 = re.compile(r"[^\w\sçğıöşüÇĞİÖŞÜ/.,-]")
RE_MULTI_PUNCT2 = re.compile(r"[.,/-]{2,}")
RE_NUM_GAP2 = re.compile(r"(?<=\d)\s+(?=\d)")
RE_NO_COLON2 = re.compile(r"\bno\s*[:=]\s*", re.IGNORECASE)

ABBREV2 = {
    "mah.": "mahalle","mh.":"mahalle","mah":"mahalle","mahallesi":"mahalle",
    "cad.":"caddesi","cd.":"caddesi","cad":"caddesi","caddesi":"caddesi","cadd.":"caddesi",
    "sk.":"sokak","sok.":"sokak","sk":"sokak","sok":"sokak","sokağı":"sokak","sokaği":"sokak",
    "blv.":"bulvarı","blv":"bulvarı","bulv.":"bulvarı","bulv":"bulvarı","bulvari":"bulvarı","bulvar":"bulvarı",
    "no.":"no","no":"no","kat.":"kat","daire":"daire","dr.":"doktor"
}

def tr_lower2(s: str) -> str:
    return s.replace("I","ı").replace("İ","i").lower()

def normalize_address_inline(s: str) -> str:
    if not isinstance(s, str):
        s = str(s)
    s = unicodedata.normalize("NFKC", s)
    s = tr_lower2(s)
    s = RE_PUNCT_SAFE2.sub(" ", s)
    s = RE_NO_COLON2.sub("no ", s)
    s = RE_MULTI_PUNCT2.sub(lambda m: m.group(0)[0], s)
    s = RE_WS2.sub(" ", s).strip()
    s = RE_NUM_GAP2.sub("", s)
    toks = s.split()
    out = []
    for t in toks:
        key = t.strip(".")
        out.append(ABBREV2.get(t, ABBREV2.get(key, t)))
    return " ".join(out)

preview = df.sample(min(200, len(df)), random_state=7).copy()
preview["address_norm"] = preview["address"].astype(str).apply(normalize_address_inline)
display(preview.head(10)[["address","address_norm","label"]])
preview.to_csv(f"{OUTPUT_DIR}/06_normalization_preview.csv", index=False)
print("Saved: 06_normalization_preview.csv")


Unnamed: 0,address,address_norm,label
190687,Zafer Cd. K:1 d:4 no:62,zafer caddesi k 1 d 4 no 62,3315
188397,MUSTAFA KEMAL MAH İZMİR 35 TR TR35 6794 sk. Ketenci sitesi Gonca apt. No:10K,mustafa kemal mahalle izmir 35 tr tr356794 sokak ketenci sitesi gonca apt. no 10k,7004
172427,6052 SOKAK BEHÇET APARTMANI NO 2 KAT 1 DAİRE 1 DEDEBAŞI,6052 sokak behçet apartmanı no 2 kat 1 daire 1 dedebaşı,6378
13122,Taşyaka mah. 141 sok. 20/2 Fethiye Fethiye,taşyaka mahalle 141 sokak 20/2 fethiye fethiye,8827
11684,Çarşı mah. 1000 sokak no7 kat2 muğlabodrum orçun diamond,çarşı mahalle 1000 sokak no7 kat2 muğlabodrum orçun diamond,9704
174115,İsa Yusuf Alptekin cad 1025 sokak 141 blok no 7 kat 1 daire 7 sevgi apt,isa yusuf alptekin caddesi 1025 sokak 141 blok no 7 kat 1 daire 7 sevgi apt,7069
77770,Atatürk Mah. Karaoğlanoğlu caddesi No.29 Demirhan Apartmanı Kat.5 D.4 Çatı Katı ORTACA / MUĞLA,atatürk mahalle karaoğlanoğlu caddesi no.29 demirhan apartmanı kat.5 d.4 çatı katı ortaca / muğla,9336
156321,"Doğanay Mh. Doğanay, 9004. Sk. atak apartmani no :22 daire:7 kat:2 35160 Karabağlar/İzmir, Türkiye KARABAĞLAR İZMİR","doğanay mahalle doğanay, 9004. sokak atak apartmani no 22 daire 7 kat 235160 karabağlar/izmir, türkiye karabağlar izmir",4354
148107,Bahçelievler mahallesi inlitepe caddesi ‘V-marine sitesi’ daire:323 12.kapı Daire:323,bahçelievler mahalle inlitepe caddesi v-marine sitesi daire 32312.kapı daire 323,9584
171320,855 SOKAK ATATÜRK MAHALLESİ NO 10/2,855 sokak atatürk mahalle no 10/2,6521


Saved: 06_normalization_preview.csv
