In [1]:
import os
import sys
import glob
import pathlib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import LinearSVC, NuSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report

In [2]:
# Make the parent of the current working directory importable (added to sys.path once).
ROOT = os.path.abspath("../")
if sys.path.count(ROOT) == 0:
    sys.path.append(ROOT)

# 1. Data

## Load data

In [3]:
# Directory holding the feature CSVs; list its entries, one per line.
DATA_DIR = os.path.join(ROOT, "data/wm811k/baselines/wmfpr/")
print("\n".join(os.listdir(DATA_DIR)))

test.csv
train.csv
valid.csv


In [4]:
%%time
# Read the three splits in train / valid / test order; the first CSV column is the index.
df_train, df_valid, df_test = (
    pd.read_csv(os.path.join(DATA_DIR, filename), index_col=0)
    for filename in ("train.csv", "valid.csv", "test.csv")
)

CPU times: user 2.5 s, sys: 92.2 ms, total: 2.59 s
Wall time: 2.59 s


## Preprocess

In [5]:
def split_dataframe(df: pd.DataFrame, label_col: str = 'label'):
    """Split a table into a cleaned feature frame and a label series.

    Parameters
    ----------
    df : pd.DataFrame
        Table containing the feature columns plus the label column. It is not modified.
    label_col : str, default 'label'
        Name of the column holding the target labels.

    Returns
    -------
    X : pd.DataFrame
        Every column except `label_col`, with +/-inf and NaN replaced by 0.
    y : pd.Series
        An independent copy of the label column.

    Raises
    ------
    KeyError
        If `label_col` is not a column of `df`.
    """
    # Copy only the label column; `drop` already returns a new frame, so no full-frame copy is needed.
    y = df[label_col].copy()
    X = df.drop(label_col, axis=1)
    # Map infinities to NaN first so a single fillna zeroes out every non-finite value.
    X = X.replace([-np.inf, np.inf], np.nan).fillna(0)
    return X, y

In [6]:
# Separate features and labels for each split (train, valid, test — in that order).
(X_train, y_train), (X_valid, y_valid), (X_test, y_test) = map(
    split_dataframe, (df_train, df_valid, df_test)
)

In [7]:
# Map class names to integer ids; the encoder is fitted on the training labels only.
le = LabelEncoder().fit(y_train)
print(*le.classes_, sep=', ')

# Apply the same fitted mapping to every split.
y_train, y_valid, y_test = (
    le.transform(labels) for labels in (y_train, y_valid, y_test)
)

center, donut, edge-loc, edge-ring, loc, near-full, none, random, scratch


In [8]:
def normalize_dataframe(df: pd.DataFrame, scaler):
    """Apply an already-fitted scaler to `df`, keeping its row index and column labels.

    `scaler` only needs a `transform` method; it receives the raw `df.values` array
    and its result is wrapped in a new DataFrame. `df` itself is not reassigned.
    """
    scaled_values = scaler.transform(df.values)
    return pd.DataFrame(scaled_values, index=df.index, columns=df.columns)

In [9]:
# Standardize features to zero mean / unit variance using training-set statistics only.
# The scaler is fitted on the raw ndarray because `normalize_dataframe` calls
# `scaler.transform(df.values)`: fitting on the DataFrame would record column names and
# make recent scikit-learn versions warn about missing feature names on every transform.
scl = StandardScaler()
scl.fit(X_train.values);

X_train = normalize_dataframe(X_train, scaler=scl)
X_valid = normalize_dataframe(X_valid, scaler=scl)
X_test = normalize_dataframe(X_test, scaler=scl)

# 2. Model

In [10]:
def get_index_subset(y, p: float, random_state: int = 2015010720):
    """Return positional indices of a stratified random subset of `y`.

    Parameters
    ----------
    y : array-like
        Class labels; the subset preserves their class proportions.
    p : float
        Fraction of samples to keep, in (0, 1]. Any other value is forwarded
        unchanged to `train_test_split` as `train_size`.
    random_state : int, default 2015010720
        Seed for the shuffle, so the same subset is drawn on every call.

    Returns
    -------
    np.ndarray
        Integer positions into `y`. For `p == 1.0` this is every position in order.
    """
    all_indices = np.arange(len(y))
    # train_test_split rejects a float train_size of 1.0, so the "keep everything"
    # case is answered directly instead of forcing every caller to special-case it.
    if isinstance(p, float) and p == 1.0:
        return all_indices
    indices, _ = train_test_split(
        all_indices, train_size=p, stratify=y, random_state=random_state
    )
    return indices

In [11]:
def macro_f1(model, X, y):
    """Return the macro-averaged F1 score of `model`'s predictions on `X` against `y`.

    `model` must expose `predict(X)`. `f1_score` comes from the notebook's import
    cell, so no function-local import is needed.
    """
    y_pred = model.predict(X)
    return f1_score(y, y_pred, average='macro')

In [12]:
# Training-set fractions to evaluate; each value is passed to `get_index_subset` as `p`
# (1.00 means the full training set).
P = [
    0.01,
    0.05,
    0.10,
    0.25,
    0.50,
    1.00,
]

## Random Forest

In [None]:
%%time

# Settings shared by every forest; only `max_depth` is tuned by the grid search.
# A fixed `random_state` (the same seed `get_index_subset` uses) pins the bootstrap
# samples and feature sub-sampling, so the printed scores are reproducible.
rf_kwargs = dict(
    n_estimators=100,
    criterion='gini',
    max_features='sqrt',
    random_state=2015010720,
)
param_grid = dict(max_depth=[3, 5, 7, None])

for p in P:
    # 5-fold CV on the training subset selects `max_depth` by macro F1; with refit=True
    # the best setting is retrained on the whole subset, so `rf` can predict directly.
    rf = GridSearchCV(
        estimator=RandomForestClassifier(**rf_kwargs),
        param_grid=param_grid,
        scoring='f1_macro',
        cv=5,
        n_jobs=8,
        refit=True,
        verbose=0,
    )
    if p < 1.00:
        # Stratified subset containing a fraction `p` of the training rows.
        indices = get_index_subset(y_train, p=p)
        rf.fit(X_train.iloc[indices], y_train[indices])
    else:
        # p == 1.00: use the full training set.
        rf.fit(X_train, y_train)
    print(f"LP={p:.2f}, Test F-1:", f"{macro_f1(rf, X_test, y_test):.4f}")

## Gradient Boosting (sklearn)

In [None]:
%%time

# A fixed `random_state` (the same seed `get_index_subset` uses) makes the boosted trees,
# and therefore the printed scores, reproducible across runs.
# NOTE(review): learning_rate=1.0 with max_depth=None means no shrinkage and unbounded
# tree depth — confirm this aggressive setting is intentional.
gb_kwargs = dict(
    n_estimators=100,
    criterion='friedman_mse',
    learning_rate=1.0,
    max_depth=None,
    random_state=2015010720,
)
for p in P:
    gb = GradientBoostingClassifier(**gb_kwargs)
    if p < 1.00:
        # Stratified subset containing a fraction `p` of the training rows.
        indices = get_index_subset(y_train, p=p)
        gb.fit(X_train.iloc[indices], y_train[indices])
    else:
        # p == 1.00: use the full training set.
        gb.fit(X_train, y_train)
    print(f"LP={p:.2f}, Test F-1:", f"{macro_f1(gb, X_test, y_test):.4f}")

## SVM (Linear)

In [None]:
# Linear SVM with balanced class weights, solved in the primal (dual=False).
lsvm_kwargs = dict(C=1.0, dual=False, class_weight='balanced', verbose=0)
for p in P:
    lsvm = LinearSVC(**lsvm_kwargs)
    if p >= 1.00:
        # Full training set.
        lsvm.fit(X_train, y_train)
    else:
        # Stratified subset containing a fraction `p` of the training rows.
        indices = get_index_subset(y_train, p=p)
        lsvm.fit(X_train.iloc[indices], y_train[indices])
    print(
        f"LP={p:.2f}, Test F-1:",
        f"{macro_f1(lsvm, X_test, y_test):.4f}",
    )

## Soft Voting

In [16]:
%%time
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
# Base-learner settings. The two tree ensembles get a fixed `random_state` (the same seed
# `get_index_subset` uses) so the ensemble's scores are reproducible across runs.
rf_kwargs = dict(
    n_estimators=100,
    criterion='gini',
    max_features='sqrt',
    max_depth=None,
    random_state=2015010720,
)
gb_kwargs = dict(
    n_estimators=100,
    criterion='friedman_mse',
    learning_rate=1.0,
    max_depth=None,
    random_state=2015010720,
)
lg_kwargs = dict(penalty='l2', dual=False, C=1.0, solver='lbfgs')
for p in P:
    vt = VotingClassifier(
        estimators=[
            ('rf', RandomForestClassifier(**rf_kwargs)),
            ('gb', GradientBoostingClassifier(**gb_kwargs)),
            ('lg', LogisticRegression(**lg_kwargs)),
        ],
        voting='soft',
        # Per-model weights in (rf, gb, lg) order.
        # NOTE(review): presumably each model's stand-alone macro F1 — confirm the source.
        weights=[0.767, 0.829, 0.727],
        n_jobs=8,
    )
    if p < 1.00:
        # Stratified subset containing a fraction `p` of the training rows.
        indices = get_index_subset(y_train, p=p)
        vt.fit(X_train.iloc[indices], y_train[indices])
    else:
        # p == 1.00: use the full training set.
        vt.fit(X_train, y_train)
    print(f"LP={p:.2f}, Test F-1:", f"{macro_f1(vt, X_test, y_test):.4f}")

LP=0.01, Test F-1: 0.5722
LP=0.05, Test F-1: 0.6961
LP=0.10, Test F-1: 0.7166
LP=0.25, Test F-1: 0.7573
LP=0.50, Test F-1: 0.7850
LP=1.00, Test F-1: 0.8000
CPU times: user 11 s, sys: 11.5 s, total: 22.5 s
Wall time: 33min 19s


## Logistic Regression (CV)

In [None]:
%%time
# Multinomial logistic regression; the regularization strength C is chosen by 5-fold CV
# on macro F1, then the model is refitted on the full training set (refit=True).
lg_kwargs = dict(
    Cs=10,  # 10 values in log scale, 1e-4 ~ 1e4.
    cv=5,
    dual=False,
    penalty='l2',
    scoring='f1_macro',
    solver='sag',
    max_iter=100,
    class_weight='balanced',
    n_jobs=8,
    refit=True,
    # NOTE(review): `multi_class` is deprecated in recent scikit-learn releases — check
    # against the installed version before upgrading.
    multi_class='multinomial',
    verbose=1,
    # The 'sag' solver shuffles the data; pinning the seed (the same one
    # `get_index_subset` uses) makes the selected C and the scores reproducible.
    random_state=2015010720,
)
lg = LogisticRegressionCV(**lg_kwargs)
lg.fit(X_train, y_train)

In [None]:
# Report the regularization strength chosen by cross-validation and the test macro F1.
print("Best C:", lg.C_)
print("Test F-1:", format(macro_f1(lg, X_test, y_test), ".4f"))

In [None]:
# `gb` is whatever model the gradient-boosting loop fitted last (the final entry of P).
print("Test F-1:", format(macro_f1(gb, X_test, y_test), ".4f"))

In [None]:
# Training-set macro F1 of the last LinearSVC fitted above, shown as the cell output.
macro_f1(model=lsvm, X=X_train, y=y_train)