## Domain Classification Challenge

**Goal:** Build a classifier to detect valid main domains from a given dataset with minimal false positives.

**Note:** The subsidiaries listed in the last column of the CSV serve as the source of truth. These entries are pseudonymized in the dataset. The objective is to identify any main domains associated with these subsidiaries.

**Evaluation Criteria:**
- Precision (especially minimizing false positives)
- Use of multiple classification techniques
- Model interpretability
- Clean and modular code

**Dataset:**
- Features include domain-level characteristics, text patterns etc.
- Target: `label` (1 = valid, 0 = invalid)

## Evaluation Metrics

- Precision Score
- Confusion Matrix
- False Positive Rate
- ROC-AUC (Optional)


## Summary

Compare all models based on precision and FPR.

Discuss:
- Which model performed best and why?
- What could be done better with more time/data?


### Feature Engineering

In [30]:
import pandas as pd, numpy as np, re, json, time
from datetime import datetime
from urllib.parse import urlparse
import dns.resolver, whois, tldextract, pycountry
from pathlib import Path

# --------------------------------------------------
# Load file
df = pd.read_csv("domains.csv")
# --------------------------------------------------
# 1. URL-level lexical stats
def url_stats(u):
    if pd.isna(u): 
        return pd.Series({"url_length": np.nan,
                          "path_length": np.nan,
                          "path_token_cnt": np.nan,
                          "path_token_len_avg": np.nan,
                          "path_token_len_max": np.nan})
    p = urlparse(u if "://" in u else "http://"+u)   # tolerate scheme-less
    path = p.path or "/"
    tokens = [t for t in path.split("/") if t]
    token_lens = list(map(len, tokens)) or [0]
    return pd.Series({
        "url_length": len(u),
        "path_length": len(path),
        "path_token_cnt": len(tokens),
        "path_token_len_avg": np.mean(token_lens),
        "path_token_len_max": np.max(token_lens)
    })

url_col = "url" if "url" in df.columns else None
if url_col:
    df = pd.concat([df, df[url_col].apply(url_stats)], axis=1)

# --------------------------------------------------
# 2. Domain token stats
def dom_token_stats(dom):
    parts = re.split(r"[.-]", str(dom))
    lengths = list(map(len, parts)) or [0]
    return pd.Series({
        "dom_token_cnt": len(parts),
        "dom_token_len_avg": np.mean(lengths),
        "dom_token_len_max": np.max(lengths)
    })
df = pd.concat([df, df["domain"].apply(dom_token_stats)], axis=1)

# --------------------------------------------------
# 3. Site-age (months) and update-age (months)  
for raw_col, new_col in [
    ("domain_whois.creation_date", "site_age_months"),
    ("domain_whois.updated_date",  "update_age_months")
]:
    if raw_col in df.columns:
        dt = pd.to_datetime(df[raw_col], errors="coerce")
        months = (dt.dt.year - 1970) * 12 + dt.dt.month
        df[new_col] = months

# --------------------------------------------------
# 4. Rogue-index helper
def rogue_index(series, labels):
    tab = pd.crosstab(series, labels)
    M = labels.sum()
    B = len(labels) - M
    ri = (tab[1]/M) / ((tab[1]/M) + (tab[0]/B))
    return ri.fillna(0).to_dict()

# Rogue for name-server (nets_names), registrar, ASN
if "ip_whois.nets_names" in df.columns:
    ns_map = rogue_index(df["ip_whois.nets_names"], df["label"])
    df["ns_rogue_idx"] = df["ip_whois.nets_names"].map(ns_map).fillna(0)

if "domain_whois.registrar" in df.columns:
    reg_map = rogue_index(df["domain_whois.registrar"], df["label"])
    df["registrar_rogue_idx"] = df["domain_whois.registrar"].map(reg_map).fillna(0)

if "ip_whois.asn_descriptions" in df.columns:
    asn_map = rogue_index(df["ip_whois.asn_descriptions"], df["label"])
    df["asn_rogue_idx"] = df["ip_whois.asn_descriptions"].map(asn_map).fillna(0)

# --------------------------------------------------
# 5. 97.5 % clipping + [0,1] normalisation for new continuous cols
cont_cols = [
    "url_length", "path_length", "path_token_cnt",
    "path_token_len_avg", "path_token_len_max",
    "dom_token_cnt", "dom_token_len_avg", "dom_token_len_max",
    "site_age_months", "update_age_months"
]
for c in cont_cols:
    if c in df.columns:
        upper = np.nanpercentile(df[c], 97.5)
        df[c] = np.where(df[c] > upper, upper, df[c])
        rng = df[c].max() - df[c].min()
        if rng > 0:
            df[c] = (df[c] - df[c].min()) / rng

# --------------------------------------------------
# 6. Save / preview
df_enriched = df.copy()
df_enriched.to_csv("domains_enriched.csv", index=False)
print("New features added:", sorted(set(df_enriched.columns) - set(df.columns) ))


✅  New features added: []


In [33]:
# Adding some more host based features as it feels important to have those features on feature engineering

# --------------------------------------------------------------------
# 0.  Load enriched dataframe 
df = pd.read_csv("domains_enriched.csv")

# Cache folders so we never hammer external services twice
CACHE_DIR = Path("lookups_cache")
CACHE_DIR.mkdir(exist_ok=True)

def cached_json(path):
    if path.exists():
        return json.loads(path.read_text())
    return {}

# --------------------------------------------------------------------
# 1. Site-age and last-update age 
for raw, new in [("domain_whois.creation_date", "site_age_months"),
                 ("domain_whois.updated_date",  "update_age_months")]:
    if raw in df.columns:
        dt = pd.to_datetime(df[raw], errors="coerce")
        months = (dt.dt.year - 1970) * 12 + dt.dt.month
        df[new]  = (months - months.min()) / (months.quantile(0.975) - months.min())
    else:
        print(f"'{raw}' column not found – feature skipped")

# --------------------------------------------------------------------
# 2. Path-level lexical stats
url_source = "url" if "url" in df.columns else None   
if url_source:
    def path_stats(u):
        p  = urlparse(u if "://" in u else "http://"+u)
        toks = [t for t in p.path.split("/") if t]
        lens = [len(t) for t in toks] or [0]
        return pd.Series({
            "path_length"        : len(p.path),
            "path_token_cnt"     : len(toks),
            "path_token_len_avg" : np.mean(lens),
            "path_token_len_max" : np.max(lens)
        })
    new_path = df[url_source].fillna("").apply(path_stats)
    # clip + scale to [0,1]
    for c in new_path.columns:
        upper      = new_path[c].quantile(0.975)
        new_path[c] = np.minimum(new_path[c], upper)
        new_path[c] = (new_path[c] - new_path[c].min())/(upper - new_path[c].min())
    df = pd.concat([df, new_path], axis=1)
else:
    print(" No URL/path column — path stats skipped")

# --------------------------------------------------------------------
# 3. DNS counts  (NS & MX)   – uses dnspython with simple caching
def dns_count(domain, rtype):
    cache_file = CACHE_DIR / f"{rtype}_{domain}.json"
    cache      = cached_json(cache_file)
    if cache:
        return cache["count"]
    try:
        answers = dns.resolver.resolve(domain, rtype, lifetime=3.0)
        count   = len(answers)
    except dns.exception.DNSException:
        count = 0
    cache_file.write_text(json.dumps({"count": count}))
    return count

df["ns_count"] = df["domain"].apply(lambda d: dns_count(d, "NS"))
df["mx_count"] = df["domain"].apply(lambda d: dns_count(d, "MX"))

# clip + log-scale
for col in ["ns_count", "mx_count"]:
    df[col] = np.log1p(df[col])
    df[col] = df[col] / df[col].quantile(0.975)

# --------------------------------------------------------------------
# 4. Country of ASN   (ISO-2 → ISO-3 for modelling)
def asn_country(asn_desc):
    if pd.isna(asn_desc):
        return "UNK"
    match = re.match(r"([A-Z]{2})\s", asn_desc.upper())
    iso2  = match.group(1) if match else "ZZ"
    try:
        return pycountry.countries.get(alpha_2=iso2).alpha_3
    except:
        return "UNK"

if "ip_whois.asn_descriptions" in df.columns:
    df["asn_country"] = df["ip_whois.asn_descriptions"].apply(asn_country)
else:
    print("ASN description column missing — country_of_ASN skipped")

# --------------------------------------------------------------------
# 5. Rogue-index for registrar (if not already)
def rogue_index(series, labels):
    tab = pd.crosstab(series, labels)
    M, B = labels.sum(), len(labels) - labels.sum()
    return ((tab[1]/M) / ((tab[1]/M) + (tab[0]/B))).fillna(0).to_dict()

if "domain_whois.registrar" in df.columns and "registrar_rogue_idx" not in df:
    ri = rogue_index(df["domain_whois.registrar"], df["label"])
    df["registrar_rogue_idx"] = df["domain_whois.registrar"].map(ri).fillna(0)

# --------------------------------------------------------------------
# 6. Save
df.to_csv("domains_enriched_full.csv", index=False)
print("domains_enriched_full.csv created with new host-based features")


⚠️  'domain_whois.creation_date' column not found – feature skipped
⚠️  'domain_whois.updated_date' column not found – feature skipped
⚠️  No URL/path column — path stats skipped
✅  domains_enriched_full.csv created with new host-based features


## Preprocessing Features

In [37]:
def preprocess_features(df, label_col="label"):
    """
    Apply 3-stage preprocessing:
      1. 97.5-percentile clipping + min-max scaling on continuous features
      2. keep rogue-index columns as-is (they're already 0-1)
      3. ensure true binary columns are 0/1 int8
    """
    import numpy as np
    import pandas as pd
    import re
    
    df_proc = df.copy()
    
    # ----------------------------------------------------------
    # 1. Identify feature groups
    rogue_pattern   = re.compile(r"_rogue_idx$")
    binary_pattern  = re.compile(r"(?:_flag$|_binary$|dom_country_tld$)")
    
    numeric_cols = df_proc.select_dtypes(exclude="object").columns.drop(label_col)
    rogue_cols   = [c for c in numeric_cols if rogue_pattern.search(c)]
    binary_cols  = [c for c in numeric_cols if binary_pattern.search(c)]
    cont_cols    = [c for c in numeric_cols if c not in rogue_cols + binary_cols]
    
    # ----------------------------------------------------------
    # 2. Clip & scale continuous variables
    for col in cont_cols:
        series = df_proc[col].astype(float)
        upper  = np.nanpercentile(series, 97.5)
        lower  = np.nanpercentile(series,  0.0)   # keep min (usually 0)
        series_clipped = series.clip(lower, upper)
        rng = upper - lower
        if rng == 0:
            df_proc[col] = 0.0   # constant column
        else:
            df_proc[col] = (series_clipped - lower) / rng
    
    # ----------------------------------------------------------
    # 3. Binary columns cast to int8 (0/1)
    for col in binary_cols:
        df_proc[col] = df_proc[col].astype(int).clip(0, 1).astype("int8")
    
    # Rogue-index columns left untouched, but fill NaNs with 0
    df_proc[rogue_cols] = df_proc[rogue_cols].fillna(0.0)
    
    return df_proc


### Handeling Text based featuers

In [43]:
"""
Feature-engineering pipeline
────────────────────────────
1.  Loads `domains_enriched_full.csv`
2.  Normalises / clips every continuous numeric column to [0,1]
3.  Leaves rogue-index columns unchanged (already 0-1)
4.  Casts obvious binary flags to int8
5.  Vectorises every text column:
      • domain  → char-TF-IDF 3-5-grams
      • huge blobs → word Hashing + SafeSVD(50)
      • remaining text → word TF-IDF + SafeSVD(25)
   SafeSVD skips reduction if the TF-IDF block has <2 features.
6.  Concatenates everything into a single numeric matrix
7.  Writes `domains_feature_ready.csv` (features + label) — ready for
    feature selection or model training.
"""

# ───────────────────────────────────────────────────────── Load data
import pandas as pd, numpy as np, re
from pathlib import Path

df = pd.read_csv("domains_enriched_full.csv")
label_col = "label"

# ───────────────────────────────────────────────────────── Numeric preprocessing
rogue_pattern  = re.compile(r"_rogue_idx$")
binary_pattern = re.compile(r"(?:_flag$|_binary$|dom_country_tld$)")

num_cols_all   = df.select_dtypes(exclude="object").columns.drop(label_col)
rogue_cols     = [c for c in num_cols_all if rogue_pattern.search(c)]
binary_cols    = [c for c in num_cols_all if binary_pattern.search(c)]
cont_cols      = [c for c in num_cols_all if c not in rogue_cols + binary_cols]

def clip_scale(col):
    upper = col.quantile(0.975)
    lower = col.min()
    col   = col.clip(lower, upper)
    rng   = upper - lower
    return (col - lower)/rng if rng else 0.0

df[cont_cols]  = df[cont_cols].apply(clip_scale, axis=0)
df[binary_cols]= df[binary_cols].astype(int).clip(0,1).astype("int8")
df[rogue_cols] = df[rogue_cols].fillna(0.0)

# ───────────────────────────────────────────────────────── Text preprocessing
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from scipy import sparse

class SafeSVD(BaseEstimator, TransformerMixin):
    def __init__(self, n_components=25, min_feats=2):
        self.n_components = n_components; self.min_feats = min_feats; self.svd_ = None
    def fit(self, X, y=None):
        n_feats = X.shape[1]
        if n_feats >= self.min_feats:
            self.svd_ = TruncatedSVD(
                n_components=min(self.n_components, n_feats-1),
                random_state=42).fit(X)
        return self
    def transform(self, X):
        if self.svd_ is None:      # keep original
            return X.toarray() if sparse.isspmatrix(X) else X
        return self.svd_.transform(X)

def block_tfidf(analyzer, ngram, max_feats, svd_comp):
    return Pipeline([
        ("tfidf", TfidfVectorizer(analyzer=analyzer, ngram_range=ngram,
                                  max_features=max_feats, min_df=2,
                                  sublinear_tf=True)),
        ("svd", SafeSVD(svd_comp))
    ])

text_cols = [c for c in df.columns if df[c].dtype == object and c != label_col]
df[text_cols] = df[text_cols].fillna("")

big_blobs = ["contents_info", "ip_whois.nets_description"]
transformers = [
    ("char_dom", block_tfidf("char", (3,5), 20000, 40), "domain")
]

for col in big_blobs:
    if col in text_cols:
        transformers.append((
            f"hash_{col}",
            Pipeline([
                ("hash", HashingVectorizer(analyzer="word",
                                           n_features=2**18,
                                           alternate_sign=False)),
                ("svd", SafeSVD(50))
            ]),
            col
        ))
        text_cols.remove(col)

for col in [c for c in text_cols if c != "domain"]:
    transformers.append((f"tfidf_{col}",
                         block_tfidf("word", (1,2), 8000, 25),
                         col))

transformers.append(("num", StandardScaler(with_mean=False),
                     cont_cols + rogue_cols + binary_cols))

ct = ColumnTransformer(transformers, sparse_threshold=0.3)

# ───────────────────────────────────────────────────────── Fit-transform
X_numeric = ct.fit_transform(df)
y         = df[label_col].values

# ───────────────────────────────────────────────────────── Column names
colnames = []

for name, trans, cols in ct.transformers_:
    if name == "num":
        # numeric passthrough: just keep original column names
        colnames.extend(cols)
        continue

    if isinstance(trans, str) and trans == "drop":
        # nothing came out of this transformer
        continue

    # SafeSVD pipeline
    fitted = trans.named_steps["svd"].svd_
    if fitted is None:          # SVD skipped → single feature
        n_comp = 1
    else:
        n_comp = fitted.n_components
    colnames.extend([f"{name}_{i:02d}" for i in range(n_comp)])
# ───────────────────────────────────────────────────────── Save
df_featready = pd.DataFrame(X_numeric, columns=colnames)
df_featready[label_col] = y
df_featready.to_csv("domains_feature_ready.csv", index=False)
print("domains_feature_ready.csv written — shape:", df_featready.shape)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


✅  domains_feature_ready.csv written — shape: (1090, 525)


## Feature Selection

In [45]:
import pandas as pd
df = pd.read_csv("domains_feature_ready.csv")
X = df.drop(columns="label").values
y = df["label"].values


### LASSO Screening

In [52]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.feature_selection import SelectFromModel
import numpy as np

# ── 1. choose best C on imputed data
imp = SimpleImputer(strategy="constant", fill_value=0.0).fit(X)
X_imp = imp.transform(X)

best_c, best_auc = None, -1
for C in [0.1, 0.2, 0.3, 0.5, 1]:
    auc = cross_val_score(
        LogisticRegression(
            penalty="l1", solver="saga", max_iter=20000, tol=1e-3,
            class_weight="balanced", C=C
        ),
        X_imp, y, cv=5, scoring="roc_auc"
    ).mean()
    if auc > best_auc:
        best_auc, best_c = auc, C

# ── 2. fit final LASSO on imputed X
lasso = LogisticRegression(
    penalty="l1", solver="saga", max_iter=20000, tol=1e-3,
    class_weight="balanced", C=best_c
).fit(X_imp, y)

# ── 3. select features (threshold = mean(|β|))
sfm   = SelectFromModel(lasso, prefit=True, threshold="mean")
mask  = sfm.get_support()
X_lasso = sfm.transform(X_imp)

print(f"Best C: {best_c} | Features kept: {mask.sum()}")


Best C: 0.1 | Features kept: 4


In [53]:
# ...................... JUST A TRY WITH THOSE 4 SELECTED FEATURES ON RANDOM FOREST .................

In [58]:
import pandas as pd, numpy as np
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

df = pd.read_csv("domains_feature_ready.csv")
X, y = df.drop(columns="label"), df["label"]

# Impute NaNs → 0
imp  = SimpleImputer(strategy="constant", fill_value=0.0).fit(X)
X_imp = imp.transform(X)

# LASSO (C = 0.1) keep non-zero β
lasso = LogisticRegression(penalty="l1", solver="saga", C=0.1,
                           max_iter=20000, tol=1e-3,
                           class_weight="balanced").fit(X_imp, y)
mask  = SelectFromModel(lasso, prefit=True, threshold=None).get_support()
X_sel = X_imp[:, mask]

print("Selected features:", X.columns[mask].tolist())

# RF on the 4-feature set
X_tr, X_te, y_tr, y_te = train_test_split(X_sel, y, test_size=0.2,
                                          stratify=y, random_state=42)

rf = RandomForestClassifier(n_estimators=500, max_depth=None,
                            min_samples_leaf=2, class_weight="balanced",
                            random_state=42, n_jobs=-1).fit(X_tr, y_tr)
y_pr  = rf.predict(X_te); y_pb = rf.predict_proba(X_te)[:,1]

tn, fp, fn, tp = confusion_matrix(y_te, y_pr).ravel()
metrics = {
    "Accuracy": accuracy_score(y_te, y_pr),
    "Precision": precision_score(y_te, y_pr, zero_division=0),
    "Recall": recall_score(y_te, y_pr),
    "F1": f1_score(y_te, y_pr),
    "ROC-AUC": roc_auc_score(y_te, y_pb),
    "TN": tn, "FP": fp, "FN": fn, "TP": tp
}
print(metrics)


Selected features: ['lookup_count', 'dom_token_cnt', 'ns_rogue_idx', 'asn_rogue_idx']
{'Accuracy': 0.9724770642201835, 'Precision': 0.9272727272727272, 'Recall': 0.9622641509433962, 'F1': 0.9444444444444444, 'ROC-AUC': 0.9953116066323614, 'TN': 161, 'FP': 4, 'FN': 2, 'TP': 51}


## tHE ABOVE CODE WORKS NICE. EXPLORING GA TO MAKE MODEL FURTHER BETTER

In [60]:
import pandas as pd, numpy as np
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

df = pd.read_csv("domains_feature_ready.csv")
X_all, y = df.drop(columns="label"), df["label"]

# Re-use the imputer so GA sees no NaNs
imp   = SimpleImputer(strategy="constant", fill_value=0.0).fit(X_all)
X_imp = imp.transform(X_all)

# LASSO again, but keep *all* non-zero weights (C=0.3 mild shrinkage)
lasso = LogisticRegression(penalty="l1", solver="saga", C=0.3,
                           max_iter=20000, class_weight="balanced", tol=1e-3
).fit(X_imp, y)
mask_all = (lasso.coef_[0] != 0)

mandatory = ["lookup_count", "dom_token_cnt",
             "ns_rogue_idx", "asn_rogue_idx"]
mand_mask = X_all.columns.isin(mandatory)

# Pool that GA can toggle  (exclude mandatory)
opt_mask   = mask_all & ~mand_mask
X_mand     = X_imp[:, mand_mask]          # will be concatenated back later
X_opt      = X_imp[:, opt_mask]           # GA toggles these columns
opt_names  = X_all.columns[opt_mask]


In [3]:
# ── 0. make sure GeneticSelectionCV exists ─────────────────────────────
import subprocess, sys, importlib
pkg = "sklearn-genetic-opt"
try:
    from sklearn_genetic import GAFeatureSelectionCV
except ImportError:
    print("🔄  upgrading sklearn-genetic-opt …")
    subprocess.run([sys.executable, "-m", "pip", "install", "-U", pkg], check=True)
    importlib.invalidate_caches()
    from sklearn_genetic import GAFeatureSelectionCV
    

# ── 1. load prepared numeric matrix  ───────────────────────────────────
import pandas as pd, numpy as np
from sklearn.impute import SimpleImputer
df = pd.read_csv("domains_feature_ready.csv")
X, y  = df.drop(columns="label"), df["label"]

imp   = SimpleImputer(strategy="constant", fill_value=0.0).fit(X)
X_imp = imp.transform(X)                      # NaNs → 0
cols  = X.columns

# mandatory features from LASSO
mand_cols = ["lookup_count", "dom_token_cnt", "ns_rogue_idx", "asn_rogue_idx"]
mand_mask = cols.isin(mand_cols)
X_mand    = X_imp[:, mand_mask]              # always ON
X_opt     = X_imp[:, ~mand_mask]             # GA toggles these
opt_cols  = cols[~mand_mask]

# ── 2. GA selector on optional pool  ───────────────────────────────────
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

base_est = Pipeline([
    ("scaler", StandardScaler()),
    ("svm"   , SVC(kernel="rbf", probability=True,
                   class_weight="balanced"))
])

ga = GAFeatureSelectionCV(
    estimator              = base_est,
    cv                     = 3,
    scoring                = "roc_auc",
    population_size        = 60,     
    generations            = 40,     
    crossover_probability  = 0.97,    
    mutation_probability   = 0.03,   
    tournament_size        = 3,
    elitism                = True,
    verbose                = True,
    n_jobs                 = -1,
    max_features           = 30
)

ga.fit(X_opt, y)
opt_mask = ga.support_
print(f"GA kept {opt_mask.sum()} optional columns")

# ── 3. concatenate mandatory + GA-chosen optional  ─────────────────────
X_final      = np.hstack([X_mand, X_opt[:, opt_mask]])
final_cols   = list(mand_cols) + opt_cols[opt_mask].tolist()
np.savez("GA_selected_features.npz", X=X_final, y=y)   # quick save
pd.Series(final_cols).to_csv("GA_selected_columns.txt", index=False)

print("Total final features:", len(final_cols))
print("Saved X / y in GA_selected_features.npz and column list in GA_selected_columns.txt")


gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	60    	-53333 	49889.1    	0.760948   	-100000    
1  	120   	-14999.4	35707.4    	0.760948   	-100000    
2  	120   	0.735694	0.0239183  	0.78434    	0.663272   
3  	120   	0.75307 	0.0141759  	0.78924    	0.719488   
4  	120   	0.759047	0.0141954  	0.78924    	0.723211   
5  	120   	0.770273	0.0116112  	0.78924    	0.750493   
6  	120   	0.778921	0.00801652 	0.790542   	0.755177   
7  	120   	0.784034	0.00477284 	0.795291   	0.769801   
8  	120   	0.787741	0.00324655 	0.795291   	0.780264   
9  	120   	0.789722	0.00268794 	0.795881   	0.784172   
10 	120   	0.792394	0.00323867 	0.800449   	0.787936   
11 	120   	0.796135	0.00401657 	0.808701   	0.788255   
12 	120   	0.799399	0.00439804 	0.808701   	0.788152   
13 	120   	0.803301	0.00373139 	0.808701   	0.795291   
14 	120   	0.806579	0.00275679 	0.812175   	0.799705   
15 	120   	0.808344	0.00228389 	0.812175   	0.805025   
16 	120   	0.810507	0.00239526 	0.816801   	0.8064

## Testing new dataframe on ML models

In [5]:
import pandas as pd, numpy as np
from sklearn.impute import SimpleImputer

# full numeric feature matrix
df = pd.read_csv("domains_feature_ready.csv")
X_all, y = df.drop(columns="label"), df["label"]

# list of columns GA chose (mandatory + optional)
final_cols = pd.read_csv("GA_selected_columns.txt", header=None)[0].tolist()

# subset + impute NaNs → 0
imp = SimpleImputer(strategy="constant", fill_value=0.0).fit(X_all[final_cols])
X_sel = imp.transform(X_all[final_cols])


# Random Forest

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix)

# Load GA-selected feature matrix (created earlier)
data = np.load('GA_selected_features.npz')
X = data['X']
y = data['y']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Random Forest model
rf = RandomForestClassifier(
    n_estimators=600,
    max_depth=None,
    min_samples_leaf=2,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)

# Evaluate
y_pred = rf.predict(X_test)
y_proba = rf.predict_proba(X_test)[:, 1]
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

metrics = pd.DataFrame([{
    'Accuracy': accuracy_score(y_test, y_pred),
    'Precision': precision_score(y_test, y_pred, zero_division=0),
    'Recall': recall_score(y_test, y_pred),
    'F1': f1_score(y_test, y_pred),
    'ROC-AUC': roc_auc_score(y_test, y_proba),
    'TN': tn, 'FP': fp, 'FN': fn, 'TP': tp,
    'Total features': X.shape[1]
}])

print("Random Forest metrics (GA + mandatory features)", metrics)


Random Forest metrics (GA + mandatory features)    Accuracy  Precision    Recall        F1   ROC-AUC   TN  FP  FN  TP  \
0   0.96789   0.925926  0.943396  0.934579  0.990623  161   4   3  50   

   Total features  
0              33  


In [4]:
import numpy as np, matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, confusion_matrix

# rf  = fitted Random-Forest
# X_te, y_te = hold-out test from earlier

proba = rf.predict_proba(X_test)[:, 1]
prec, rec, thr = precision_recall_curve(y_test, proba)

# choose the smallest threshold with Precision ≥ 0.96
target = 0.96
best_idx = np.argmax((prec >= target) * rec)        # maximise recall under constraint
best_thr = thr[best_idx]
print(f"Threshold {best_thr:.3f}  →  Precision {prec[best_idx]:.3f}  Recall {rec[best_idx]:.3f}")

# evaluate
y_hat = (proba >= best_thr).astype(int)
tn, fp, fn, tp = confusion_matrix(y_test, y_hat).ravel()
print("FP:", fp, " ⇦ used to be 4")


Threshold 0.844  →  Precision 0.976  Recall 0.774
FP: 1  ⇦ used to be 4


In [14]:
proba = rf.predict_proba(X_test)[:, 1]
prec, rec, thr = precision_recall_curve(y_test, proba)

# put everything in a tbale
table = pd.DataFrame({"thr": thr, "precision": prec[:-1], "recall": rec[:-1]})
table["F0.5"] = 1.25 * prec[:-1] * rec[:-1] / (0.25 * prec[:-1] + rec[:-1])
print(table.head(15))        


         thr  precision  recall      F0.5
0   0.001667   0.243119     1.0  0.286486
1   0.001942   0.244240     1.0  0.287731
2   0.002878   0.245370     1.0  0.288986
3   0.003546   0.246512     1.0  0.290252
4   0.003859   0.247664     1.0  0.291529
5   0.004267   0.248826     1.0  0.292818
6   0.005856   0.250000     1.0  0.294118
7   0.005939   0.251185     1.0  0.295429
8   0.006405   0.252381     1.0  0.296753
9   0.006436   0.253589     1.0  0.298088
10  0.006640   0.254808     1.0  0.299435
11  0.007596   0.256039     1.0  0.300795
12  0.007775   0.257282     1.0  0.302166
13  0.007835   0.258537     1.0  0.303551
14  0.008125   0.259804     1.0  0.304948


In [13]:
hi_prec  = table.query("precision >= 0.93")
print(hi_prec.head(10))      


          thr  precision    recall      F0.5
166  0.713346   0.942308  0.924528  0.938697
167  0.724686   0.941176  0.905660  0.933852
168  0.767744   0.940000  0.886792  0.928854
169  0.796650   0.938776  0.867925  0.923695
170  0.797967   0.958333  0.867925  0.938776
171  0.805829   0.957447  0.849057  0.933610
172  0.810068   0.956522  0.830189  0.928270
173  0.816920   0.955556  0.811321  0.922747
174  0.833576   0.954545  0.792453  0.917031
175  0.835978   0.953488  0.773585  0.911111


In [22]:
# RF on GA selected 33 features with a 0.71 threshold
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix
)

# Load the ready dataset and the 33 GA‐selected column names
df      = pd.read_csv("domains_feature_ready.csv")
ga_cols = pd.read_csv("GA_selected_columns.txt", header=None)[0].tolist()
X       = df[ga_cols].values
y       = df["label"].values

# Impute any missing → 0, then train/test split
imp   = SimpleImputer(strategy="constant", fill_value=0.0)
X_imp = imp.fit_transform(X)

X_tr, X_te, y_tr, y_te = train_test_split(
    X_imp, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# Fit the baseline RandomForest
rf = RandomForestClassifier(
    n_estimators=600,
    class_weight="balanced",
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
).fit(X_tr, y_tr)

# Predict probabilities and apply threshold = 0.71
y_proba     = rf.predict_proba(X_te)[:, 1]
threshold   = 0.713346
y_pred_thr  = (y_proba >= threshold).astype(int)

# Compute metrics & confusion
tn, fp, fn, tp = confusion_matrix(y_te, y_pred_thr).ravel()
metrics = {
    "Accuracy":  accuracy_score(y_te, y_pred_thr),
    "Precision": precision_score(y_te, y_pred_thr),
    "Recall":    recall_score(y_te, y_pred_thr),
    "F1":        f1_score(y_te, y_pred_thr),
    "ROC-AUC":   roc_auc_score(y_te, y_proba),
    "TN": tn, "FP": fp, "FN": fn, "TP": tp
}

print(f"RF w/ threshold={threshold}")
for k, v in metrics.items():
    print(f"{k:<10}: {v:.4f}" if isinstance(v, float) else f"{k:<10}: {v}")

print("\nConfusion matrix:")
print(pd.DataFrame(
    [[tn, fp], [fn, tp]],
    index=["Actual 0", "Actual 1"],
    columns=["Pred 0", "Pred 1"]
))


RF w/ threshold=0.713346
Accuracy  : 0.9633
Precision : 0.9412
Recall    : 0.9057
F1        : 0.9231
ROC-AUC   : 0.9906
TN        : 162
FP        : 3
FN        : 5
TP        : 48

Confusion matrix:
          Pred 0  Pred 1
Actual 0     162       3
Actual 1       5      48


# SVM

In [12]:
import pandas as pd, numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix)

# ── 1. Load numeric feature matrix & GA column list ────────────────────
df          = pd.read_csv("domains_feature_ready.csv")
ga_cols     = pd.read_csv("GA_selected_columns.txt", header=None)[0].tolist()
X_ga, y     = df[ga_cols], df["label"]

# ── 2. Preprocess & SVM classifier  ────────────────────────────────────
svm_pipe = Pipeline([
    ("imp",   SimpleImputer(strategy="constant", fill_value=0.0)),
    ("scale", StandardScaler()),
    ("svm",   SVC(kernel="rbf", probability=True, class_weight="balanced"))
])

param_grid = {
    "svm__C":     [1, 5, 10],
    "svm__gamma": ["scale", 0.1, 0.01]
}

svm = GridSearchCV(svm_pipe, param_grid, cv=3, scoring="roc_auc", n_jobs=-1)

# ── 3. Train/test split & fit ──────────────────────────────────────────
X_tr, X_te, y_tr, y_te = train_test_split(
    X_ga, y, test_size=0.2, stratify=y, random_state=42
)

svm.fit(X_tr, y_tr)

# ── 4. Metrics ─────────────────────────────────────────────────────────
y_pr  = svm.predict(X_te)
y_pb  = svm.predict_proba(X_te)[:, 1]
tn, fp, fn, tp = confusion_matrix(y_te, y_pr).ravel()

metrics = {
    "Features":   len(ga_cols),
    "Accuracy":   accuracy_score(y_te, y_pr),
    "Precision":  precision_score(y_te, y_pr, zero_division=0),
    "Recall":     recall_score(y_te, y_pr),
    "F1":         f1_score(y_te, y_pr),
    "ROC-AUC":    roc_auc_score(y_te, y_pb),
    "FP / TP":    (fp, tp),
    "Best C":     svm.best_params_["svm__C"],
    "Best γ":     svm.best_params_["svm__gamma"],
}
print(metrics)


{'Features': 33, 'Accuracy': 0.944954128440367, 'Precision': 0.8360655737704918, 'Recall': 0.9622641509433962, 'F1': 0.8947368421052632, 'ROC-AUC': 0.9918810748999428, 'FP / TP': (10, 51), 'Best C': 1, 'Best γ': 0.01}


In [28]:

# ── 1. Load data ──────────────────────────────────────────────────────────
df      = pd.read_csv("domains_feature_ready.csv")
ga_cols = pd.read_csv("GA_selected_columns.txt", header=None)[0].tolist()

X = df[ga_cols]
y = df["label"]

# ── 2. Split ─────────────────────────────────────────────────────────────
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# ── 3. Build pipeline ────────────────────────────────────────────────────
pipe = Pipeline([
    ("imp",   SimpleImputer(strategy="constant", fill_value=0.0)),
    ("scale", StandardScaler()),
    ("svm",   SVC(kernel="rbf", probability=True))
])

# ── 4. Grid over C, γ **and** class-weight ───────────────────────────────
param_grid = {
    "svm__C":            [1, 5, 10],
    "svm__gamma":        ["scale", 0.1, 0.01],
    # penalize false positives by giving class 1 (positives) higher weight
    "svm__class_weight": [
        {0:1,   1:1},    # baseline
        {0:1,   1:2},
        {0:1,   1:5},
        "balanced"
    ]
}

gs = GridSearchCV(
    pipe, param_grid, cv=3,
    scoring="roc_auc", n_jobs=-1, verbose=1
)
gs.fit(X_tr, y_tr)

print("Best params:", gs.best_params_)

# ── 5. Calibrate probabilities ───────────────────────────────────────────
cal = CalibratedClassifierCV(
    gs.best_estimator_, method="isotonic", cv=3
).fit(X_tr, y_tr)

proba = cal.predict_proba(X_te)[:, 1]

# ── 6. Pick threshold for prec ≥ 0.90 but max recall ────────────────────
prec, rec, thr = precision_recall_curve(y_te, proba)
# drop last point where thr is NAN
prec, rec = prec[:-1], rec[:-1]
thr        = thr

mask = prec >= 0.90
best_i = np.argmax(rec[mask])
best_thr = thr[mask][best_i]
print(f"Chosen threshold = {best_thr:.3f}  "
      f"(prec={prec[mask][best_i]:.3f}, rec={rec[mask][best_i]:.3f})")

# ── 7. Final prediction & metrics ───────────────────────────────────────
y_pred = (proba >= best_thr).astype(int)

tn, fp, fn, tp = confusion_matrix(y_te, y_pred).ravel()

print("\n=== Final metrics ===")
print(f"Accuracy : {accuracy_score(y_te, y_pred):.3f}")
print(f"Precision: {precision_score(y_te, y_pred):.3f}")
print(f"Recall   : {recall_score(y_te, y_pred):.3f}")
print(f"F1       : {f1_score(y_te, y_pred):.3f}")
print(f"ROC-AUC  : {roc_auc_score(y_te, proba):.3f}")
print(f"TN / FP / FN / TP : {tn} / {fp} / {fn} / {tp}")

print("\nConfusion matrix:")
display(pd.DataFrame(
    [[tn, fp], [fn, tp]],
    index=["Actual 0","Actual 1"],
    columns=["Pred 0","Pred 1"]
))


Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best params: {'svm__C': 1, 'svm__class_weight': {0: 1, 1: 1}, 'svm__gamma': 0.01}
Chosen threshold = 0.548  (prec=0.907, rec=0.925)

=== Final metrics ===
Accuracy : 0.959
Precision: 0.907
Recall   : 0.925
F1       : 0.916
ROC-AUC  : 0.990
TN / FP / FN / TP : 160 / 5 / 4 / 49

Confusion matrix:


Unnamed: 0,Pred 0,Pred 1
Actual 0,160,5
Actual 1,4,49


# LR

In [13]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix)

# ── 1. Load GA feature matrix ──────────────────────────────────────────
df       = pd.read_csv("domains_feature_ready.csv")
ga_cols  = pd.read_csv("GA_selected_columns.txt", header=None)[0].tolist()
X_ga, y  = df[ga_cols], df["label"]

# ── 2. Build LR pipeline (impute → scale → LR) ─────────────────────────
lr_pipe = Pipeline([
    ("imp",   SimpleImputer(strategy="constant", fill_value=0.0)),
    ("scale", StandardScaler()),
    ("lr",    LogisticRegression(penalty="l2", solver="saga",
                                 class_weight="balanced",
                                 max_iter=5000))
])

param_grid = {
    "lr__C": [0.1, 0.3, 1, 3, 10]     # strength of regularisation
}

lr = GridSearchCV(lr_pipe, param_grid, cv=3,
                  scoring="roc_auc", n_jobs=-1, verbose=0)

# ── 3. Train/test split & fit ──────────────────────────────────────────
X_tr, X_te, y_tr, y_te = train_test_split(
    X_ga, y, test_size=0.2, stratify=y, random_state=42
)

lr.fit(X_tr, y_tr)

# ── 4. Metrics ─────────────────────────────────────────────────────────
y_pr  = lr.predict(X_te)
y_pb  = lr.predict_proba(X_te)[:, 1]
tn, fp, fn, tp = confusion_matrix(y_te, y_pr).ravel()

metrics = {
    "Features":  len(ga_cols),
    "Accuracy":  accuracy_score(y_te, y_pr),
    "Precision": precision_score(y_te, y_pr, zero_division=0),
    "Recall":    recall_score(y_te, y_pr),
    "F1":        f1_score(y_te, y_pr),
    "ROC-AUC":   roc_auc_score(y_te, y_pb),
    "FP / TP":   (fp, tp),
    "Best C":    lr.best_params_["lr__C"]
}
print(metrics)


{'Features': 33, 'Accuracy': 0.963302752293578, 'Precision': 0.8813559322033898, 'Recall': 0.9811320754716981, 'F1': 0.9285714285714285, 'ROC-AUC': 0.9901658090337335, 'FP / TP': (7, 52), 'Best C': 0.1}


# KNN

In [23]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix)

# ── 1.  Load GA feature matrix ─────────────────────────────────────────
df       = pd.read_csv("domains_feature_ready.csv")
ga_cols  = pd.read_csv("GA_selected_columns.txt", header=None)[0].tolist()
X_ga, y  = df[ga_cols], df["label"]

# ── 2.  KNN pipeline: impute → scale → KNN ─────────────────────────────
knn_pipe = Pipeline([
    ("imp",   SimpleImputer(strategy="constant", fill_value=0.0)),
    ("scale", StandardScaler()),
    ("knn",   KNeighborsClassifier(weights="uniform"))
])

param_grid = {
    "knn__n_neighbors": [3, 5, 10, 15, 20, 25, 30],
    "knn__metric":      ["minkowski", "euclidean", "manhattan"]
}

knn = GridSearchCV(knn_pipe, param_grid,
                   cv=3, scoring="roc_auc", n_jobs=-1, verbose=0)

# ── 3.  Train / test split & fit ───────────────────────────────────────
X_tr, X_te, y_tr, y_te = train_test_split(
    X_ga, y, test_size=0.2, stratify=y, random_state=42)

knn.fit(X_tr, y_tr)

# ── 4.  Metrics ────────────────────────────────────────────────────────
y_pr = knn.predict(X_te)
y_pb = knn.predict_proba(X_te)[:, 1]
tn, fp, fn, tp = confusion_matrix(y_te, y_pr).ravel()

metrics = {
    "Features":   len(ga_cols),
    "Accuracy":   accuracy_score(y_te, y_pr),
    "Precision":  precision_score(y_te, y_pr, zero_division=0),
    "Recall":     recall_score(y_te, y_pr),
    "F1":         f1_score(y_te, y_pr),
    "ROC-AUC":    roc_auc_score(y_te, y_pb),
    "FP / TP":    (fp, tp),
    "Best k":     knn.best_params_["knn__n_neighbors"],
    "Metric":     knn.best_params_["knn__metric"]
}
print(metrics)


{'Features': 33, 'Accuracy': 0.944954128440367, 'Precision': 0.9361702127659575, 'Recall': 0.8301886792452831, 'F1': 0.88, 'ROC-AUC': 0.9872498570611778, 'FP / TP': (3, 44), 'Best k': 25, 'Metric': 'manhattan'}


# NB

In [24]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB 
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix)

# ── 1. Load GA feature matrix ──────────────────────────────────────────
df       = pd.read_csv("domains_feature_ready.csv")
ga_cols  = pd.read_csv("GA_selected_columns.txt", header=None)[0].tolist()
X_ga, y  = df[ga_cols], df["label"]

# ── 2. Build pipeline: impute → GaussianNB  ────────────────────────────
nb_pipe = Pipeline([
    ("imp", SimpleImputer(strategy="constant", fill_value=0.0)),
    ("gnb", GaussianNB())
])

# ── 3. Train/test split & fit ──────────────────────────────────────────
X_tr, X_te, y_tr, y_te = train_test_split(
    X_ga, y, test_size=0.2, stratify=y, random_state=42
)

nb_pipe.fit(X_tr, y_tr)

# ── 4. Metrics ─────────────────────────────────────────────────────────
y_pr = nb_pipe.predict(X_te)
y_pb = nb_pipe.predict_proba(X_te)[:, 1]
tn, fp, fn, tp = confusion_matrix(y_te, y_pr).ravel()

metrics = {
    "Features":  len(ga_cols),
    "Accuracy":  accuracy_score(y_te, y_pr),
    "Precision": precision_score(y_te, y_pr, zero_division=0),
    "Recall":    recall_score(y_te, y_pr),
    "F1":        f1_score(y_te, y_pr),
    "ROC-AUC":   roc_auc_score(y_te, y_pb),
    "FP / TP":   (fp, tp)
}
print(metrics)


{'Features': 33, 'Accuracy': 0.9311926605504587, 'Precision': 0.796875, 'Recall': 0.9622641509433962, 'F1': 0.8717948717948717, 'ROC-AUC': 0.9618067467124072, 'FP / TP': (13, 51)}
