## 🐪 Kaust Academy: Classifying Math Probems
Goal: to use my NLP background to produce a top solution by f1 for kaggle.

In [86]:
## hyperparameters
max_n_gram = 3
is_augment = False

### Environment Setup

In [1]:
%%capture
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import spacy
from spacy import displacy
from tqdm import tqdm
import re
from collections import Counter as count

from sklearn.metrics import f1_score as f1, accuracy_score as accuracy
import pickle as pkl
from sklearn.linear_model import RidgeClassifier
from lightgbm import LGBMRegressor, LGBMClassifier

import warnings
warnings.filterwarnings("ignore")

### Tools

In [65]:
import subprocess

def run(command, shell = False):
  ''' run the specified command via subprocess.run '''
  if not shell: command = command.split(" ")
  process = subprocess.Popen(command, shell = shell) 
  process.wait()

### Data Load

In [5]:
df_tr = pd.read_csv("train-data/train.csv")
df_te = pd.read_csv("train-data/test.csv")

In [63]:
nlp = spacy.load("en_core_web_lg")

In [227]:
# save data as a sparse matrix

save_data = True
if save_data:   
    from scipy.sparse import csr_matrix
    X_sparse  = csr_matrix(X.values)
    with open(f"processed-data/X{max_n_gram}_no_stop.pkl", "wb") as f:
        pkl.dump(X_sparse, f)
    with open(f"processed-data/cols{max_n_gram}_no_stop.pkl", "wb") as f:
        pkl.dump(X.columns, f)

load_data = False
if load_data:
    with open(f"processed-data/X{max_n_gram}_1.pkl", "rb") as f:
        X_sparse = pkl.load(f)
    with open(f"processed-data/cols{max_n_gram}_1.pkl", "rb") as f:
        columns = pkl.load(f)

    X = pd.DataFrame.sparse.from_spmatrix(X_sparse)
    X.columns = columns

load_semi_supervised = False
if load_semi_supervised:
    with open(f"processed-data/X{max_n_gram}_v.pkl", "rb") as f:
        X_sparse = pkl.load(f)
    with open(f"processed-data/cols{max_n_gram}_v.pkl", "rb") as f:
        columns = pkl.load(f)

    X_v = pd.DataFrame.sparse.from_spmatrix(X_sparse)
    X_v.columns = columns

df_ig = pd.read_csv("Feature_Importance_Aug.csv")

### Definitions

In [14]:
def select_top_n(df_ig, by_class = True, n = 500):
    ''' 
    returns a column set of:
    - the top "n" columns by IG_ovr if by_class == False
    - the top "n" columns by IG{n} for n on [0,8) if by_class == True
    '''

    columns = set()

    if by_class:
        for label in range(8):

            df_ig.sort_values(f"IG{label}", ascending = False, inplace = True)
            columns.update(set(df_ig.col[:n].tolist()))

    else:
        df_ig.sort_values(f"IG_ovr", ascending = False, inplace = True)
        columns.update(set(df_ig.col[:n].tolist()))

    return list(columns)

### Pipeline

In [9]:
## Hyperparameters
max_n_gram = 3

In [221]:

bad_pos = set(
    ["PUNCT", "SPACE"]
)

replacements = {
    "$" : "",
    "+" : " + ",
    "-" : " - ",
    "*" : " * ",
    "=" : " = ",
    "^" : " ^ ",
    "\cos" : " \cos ",
    "\sin" : " \sin ",
    r"\tan" : r" \tan ",
    "\sqrt" : " \sqrt ",
    "{" : " { ",
    "}" : " } ",
    "\\" : " ",
    ")" : " ",
    "(" : " ",
    "'" : " ",
    "]" : " ",
    "[" : " "
}

regex_replacements = {
    "[\d.,]+" : " NUMX "
}

def preprocess(df_tr, chunk_size = 50):
    ''' encodes the sentences in df_tr as bag-of-words tf-idf features '''

    X = pd.DataFrame()
    _len = len(df_tr)

    for i,(txt,label) in tqdm(enumerate(zip(df_tr.Question, df_tr.label))):

        for k,v in replacements.items():
            txt = txt.replace(k,v)
        for k,v in regex_replacements.items():
            txt = re.sub(k,v,txt)

        doc = nlp(txt)

        toks = []
        for tok in doc:

            is_stop = tok.is_stop
            is_bad_pos = tok.pos_ in bad_pos

            if is_bad_pos or is_stop: continue

            toks.append(tok.lemma_)

        def get_n_grams(toks, n, sep = " "):
            ''' for a list of one-gram tokens, returns all n-grams '''

            out = []
            _len = len(toks)
            for i in range(_len - n - 1):
                out.append(sep.join(toks[i:i+n]))

            return out

        n_grams = toks[:]
        for n in range(2, max_n_gram+1):
            n_grams += get_n_grams(toks, n=n)

        term_freq = count(n_grams)

        # append to the dataframe
        row = pd.DataFrame()
        row["label"] = [label]
        for k in term_freq:
            row[k] = [term_freq[k]]

        X = pd.concat((X,row))
        
        # output chunks to improve runtime efficiency
        i += 1
        if i % chunk_size == 0:
            X.to_csv(f"temp-data/{i}.csv", index = False)
            X = pd.DataFrame()

    # read and merge chunks
    for i in tqdm(range(chunk_size,_len+1,chunk_size)):
        df_sub = pd.read_csv(f"temp-data/{i}.csv")
        X = pd.concat((X, df_sub))

    # erase all chunks
    run("rm -r temp-data")
    run("mkdir temp-data")
    
    X.fillna(0, inplace = True)

    y, X = X.label, X.drop("label", axis = 1)

    return X, y

process_data = True
if process_data:
    chunk_size = 1000
    X, y = pd.DataFrame(), pd.DataFrame()
    for start_idx in range(0, len(df_tr), chunk_size):
        end_idx = start_idx + chunk_size
        print(f"idx: {start_idx}-{end_idx}")
        X_sub, y_sub = preprocess(df_tr.iloc[start_idx: end_idx])
        X, y = pd.concat((X, X_sub)), pd.concat((y, y_sub))

    X.fillna(0, inplace = True)

idx: 0-1000


1000it [00:50, 19.70it/s]
100%|██████████| 20/20 [00:09<00:00,  2.20it/s]


idx: 1000-2000


1000it [00:51, 19.47it/s]
100%|██████████| 20/20 [00:09<00:00,  2.00it/s]


idx: 2000-3000


1000it [00:50, 19.67it/s]
100%|██████████| 20/20 [00:09<00:00,  2.13it/s]


idx: 3000-4000


1000it [00:55, 18.18it/s]
100%|██████████| 20/20 [00:10<00:00,  1.85it/s]


idx: 4000-5000


1000it [00:53, 18.62it/s]
100%|██████████| 20/20 [00:10<00:00,  1.86it/s]


idx: 5000-6000


1000it [00:53, 18.86it/s]
100%|██████████| 20/20 [00:11<00:00,  1.80it/s]


idx: 6000-7000


1000it [00:53, 18.54it/s]
100%|██████████| 20/20 [00:09<00:00,  2.01it/s]


idx: 7000-8000


1000it [00:54, 18.35it/s]
100%|██████████| 20/20 [00:10<00:00,  1.95it/s]


idx: 8000-9000


1000it [00:53, 18.57it/s]
100%|██████████| 20/20 [00:10<00:00,  1.94it/s]


idx: 9000-10000


1000it [00:52, 19.16it/s]
100%|██████████| 20/20 [00:10<00:00,  1.99it/s]


idx: 10000-11000


189it [00:10, 17.81it/s]
100%|██████████| 3/3 [00:00<00:00,  7.98it/s]


In [66]:
## augmentation: add data
df_aug = pd.read_csv("data-augment/out/linalg1.csv")
X_aug, y_aug = preprocess(df_aug)

258it [00:20, 12.84it/s]
100%|██████████| 5/5 [00:01<00:00,  4.81it/s]


In [None]:
## extracting the set of features that's present in the test set
test_cols = preprocess(df_tr.iloc[:partition_size])[0].columns.tolist()
test_cols = set(test_cols)

2037it [02:24, 14.06it/s]
100%|██████████| 40/40 [00:51<00:00,  1.29s/it]


In [120]:
with open("processed-data/validation_cols.pkl", "wb") as f:
    pkl.dump(test_cols, f)
test_cols = set(test_cols)

In [84]:
def standardize_len(row):
    _sum = np.sum(row.to_numpy())
    return _sum

cols = X.columns.tolist()
n_tokens = X.to_numpy().sum(axis = 1)
X = X.to_numpy().T / n_tokens
X = X.T
X = pd.DataFrame(X, columns = cols)
X["n_tokens"] = n_tokens

In [125]:
len(columns)

43020

In [251]:
columns = select_top_n(df_ig, by_class = True, n = 40000)
new_cols = []
for column in columns:
    if column in test_cols:
        new_cols.append(column)
columns = new_cols
columns += "n_tokens"

X_sub, y_sub = X, df_tr.label

print("consolidating data...")

curr_cols = set(X_sub.columns.tolist())
exp_cols = set(columns)
new_cols = list(exp_cols - curr_cols)
X_sub[new_cols] = 0

# remove label & select features
if "label" in X_sub.columns:
    X_sub.drop("label", axis = 1, inplace = True)

X_sub = X_sub[columns]

# define model class
model = LGBMClassifier(
    reg_alpha = 0.5,
    verbose = 4,
    max_depth = -1
)

_len = len(X_sub)
n_folds = 5
partition_size = _len // n_folds

ovr_metrics = {}
for start_idx in range(0, _len, partition_size):

    print("partition start...")

    # partition data into train, validation sets
    end_idx = start_idx + partition_size
    end_idx = min((end_idx,_len))
    X_v, y_v = X_sub.iloc[start_idx:end_idx].to_numpy(), y_sub.iloc[start_idx:end_idx]
    X_t = pd.concat((X_sub.iloc[:start_idx], X_sub.iloc[end_idx:]))
    y_t = pd.concat((y_sub.iloc[:start_idx], y_sub.iloc[end_idx:]))

    print("model fitting...")

    # introduce augmentation data to the train set
    if is_augment:
        X_t = pd.concat((X_t, X_aug))[columns]
        y_t = pd.concat((y_t, y_aug))

    # train the model and render predictions
    model.fit(X_t.to_numpy(), y_t)
    y_pred_t = model.predict(X_t)
    y_pred = model.predict(X_v)

    print("model scoring...")

    # score the model
    score = f1(y_v, y_pred, average = "micro")
    score_t = f1(y_t, y_pred_t, average = "micro")

    ovr_metrics["test_f1"] = score
    ovr_metrics["train_f1"] = score_t

    # get model results by class
    for cls in range(8):
        is_cls = y_v == cls
        X_cls, y_cls = X_v[is_cls], y_v[is_cls]
        y_pred_cls = model.predict(X_cls)
        ovr_metrics[f"f1_{cls}"] = f1(y_cls, y_pred_cls, average = "micro")
    break

# save the model with metrics
model.fit(X_sub.to_numpy(),y_sub)
score = round(ovr_metrics["test_f1"],3)

# identify model architecture via class membership
arch = "other"
architectures = {
    "lr" : LogisticRegression,
    "ridge" : RidgeClassifier,
    "lgbm" : LGBMClassifier
}
for k,v in architectures.items():
    if isinstance(model, v):
        arch = k

with open(f"models/{arch}-{score}.pkl", "wb") as f:
    pkl.dump(model, f)

print(ovr_metrics)

consolidating data...
partition start...
model fitting...
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.987134
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.986426
[LightGBM] [Debug] init for col-wise cost 0.136316 seconds, init for row-wise cost 0.135425 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.136631 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 12228
[LightGBM] [Info] Number of data points in the train set: 8152, number of used features: 2871
[LightGBM] [Info] Start training from score -1.340734
[LightGBM] [Info] Start training from score -1.451160
[LightGBM] [Info] Start training from score -2.270239
[LightGBM] [Info] Start training from score -3.353529
[LightGBM] [Info] Start training from score -1.792250
[LightGBM] [In

In [252]:
X_v.shape

(2037, 26483)

In [None]:
## save model predictions for analysis
df_preds = df_tr.iloc[:partition_size]
df_preds["prediction"] = y_pred
df_preds.to_csv("Test.csv")

In [131]:
feats = pd.DataFrame()
feats["feature"] = columns
feats["coef"] = np.abs(model.feature_importances_)
feats.sort_values("coef", ascending = False, inplace = True)
feats.to_csv("Feature_Importance.csv", index = False)

In [155]:
## SHAP
import matplotlib
import numpy
import shap
# shap.initjs()
explainer = shap.Explainer(model)
df_val = pd.DataFrame(X_v, columns = columns)
shap_values = explainer.shap_values(df_val)

In [256]:
ex = 97

print(df_tr.Question[ex])

def to_format(cls):

    cls_ref = {
    0 : "Algebra",
    1 : "Geometry & Trigonometry",
    2 : "Calculus & Analysis",
    3 : "Probability & Statistics",
    4 : "Number Theory",
    5 : "Combinatorics & Discrete Math",
    6 : "Linear Algebra",
    7 : "Abstract Algebra & Topology"
    }

    cls_txt = cls_ref[cls]

    return f"({cls}) {cls_txt}"

true_class = df_tr.label[ex]
pred_class = y_pred[ex]

print(f"\n[TRUE] {to_format(true_class)}")
print(f"[PRED] {to_format(pred_class)}")

feats = df_val.iloc[ex,:]
vals = shap_values[ex][:,0]
df = pd.DataFrame()
df["feat"] = feats
df[f"val{true_class} (TRUE)"] = shap_values[ex][:,true_class]
df[f"val{pred_class} (PRED)"] = shap_values[ex][:,pred_class]
df = df[df.feat != 0]
df.sort_values(f"val{pred_class} (PRED)", ascending = False, inplace = True)

df.head(15)

What is the maximum number of rational points that can lie on a circle in $\mathbb{R}^2$ whose center is not a rational point? (A \emph{rational point} is a point both of whose coordinates are rational numbers.)

[TRUE] (1) Geometry & Trigonometry
[PRED] (5) Combinatorics & Discrete Math


Unnamed: 0,feat,val1 (TRUE),val5 (PRED)
number of,0.008621,-0.000854,1.008
can,0.008621,-0.065917,0.251894
both,0.008621,-0.155074,0.220526
number,0.017241,-0.501205,0.120304
on,0.008621,0.0,0.113819
on a,0.008621,0.0,0.107064
point,0.034483,0.694302,0.100604
not,0.008621,0.006485,0.077287
maximum number of,0.008621,0.0,0.04804
a,0.034483,0.180234,0.045799


In [45]:
# load & test a pretrained model
import pickle as pkl
from lightgbm import LGBMRegressor, LGBMClassifier

max_n_gram = 3

with open("models/lgbm-0.823.pkl", "rb") as f:
    model = pkl.load(f)

with open(f"processed-data/X{max_n_gram}_1.pkl", "rb") as f:
    X_sparse = pkl.load(f)
with open(f"processed-data/cols{max_n_gram}_1.pkl", "rb") as f:
    columns = pkl.load(f)

X = pd.DataFrame.sparse.from_spmatrix(X_sparse)
X.columns = columns

df_tr = pd.read_csv("train-data/train.csv")
df_te = pd.read_csv("train-data/test.csv")

### Feature Selection via Info Gain

In [None]:
df_ig = pd.read_csv("Feature_Importance_Aug.csv")
columns = select_top_n(df_ig, by_class = True, n = 15000)
len(columns)

In [234]:
from math import log

def entropy(x):
  out = 0
  for label in [0,1]:
    p = (x == label).mean()
    if p == 0: continue
    p = p * log(p) / log(2)
    out -= p
  return out

In [242]:
df_ig = pd.DataFrame()
y = df_tr.label

# consolidating data sources
print("Consolidating data sources...")
is_aug = False
if is_aug:
    X_ig = pd.concat((X,X_aug)).fillna(0)
    y_ig = pd.concat((y,y_aug))
else:
    X_ig, y_ig = X,y

for label in range(0,8):
    print(f"label = {label}")

    # reduce to a two-class problem
    y_ = (y_ig == label).astype(int)
    entropy0 = entropy(y_)

    df = pd.DataFrame()
    scores = np.zeros(len(X_ig.columns))

    for i,col in tqdm(enumerate(X_ig.columns)):

        predictor = X_ig[col]

        y1, y2 = y_[predictor != 0], y_[predictor == 0]
        l0, l1, l2 = len(y_), len(y1), len(y2)

        IG = entropy0 - (l1/l0 * entropy(y1)) - (l2 / l0 * entropy(y2))

        scores[i] = IG

    if len(df_ig) == 0:
        df_ig["col"] = X_ig.columns

    df_ig[f"IG{label}"] = scores

Consolidating data sources...
label = 0


226504it [00:40, 5617.78it/s]


label = 1


226504it [00:37, 6089.88it/s]


label = 2


226504it [00:38, 5866.12it/s]


label = 3


226504it [00:38, 5955.77it/s]


label = 4


226504it [00:38, 5867.18it/s]


label = 5


226504it [00:40, 5642.87it/s]


label = 6


226504it [00:40, 5572.82it/s]


label = 7


226504it [00:38, 5825.36it/s]


In [243]:
def row_avg(row):
    n_classes = 8

    out = 0
    for label in range(n_classes):
        out += row[f"IG{label}"] / n_classes

    return out

df_ig["IG_ovr"] = df_ig.apply(row_avg, axis = 1)
df_ig.sort_values("IG_ovr", ascending = False)

Unnamed: 0,col,IG0,IG1,IG2,IG3,IG4,IG5,IG6,IG7,IG_ovr
745,triangle,0.034141,0.148075,0.012226,0.001353,0.021163,0.008377,0.001388,0.001193,0.028490
1690,angle,0.031275,0.137920,0.007445,0.002173,0.018466,0.016177,0.000374,0.000887,0.026840
1291,Solve,0.094478,0.021774,0.007131,0.003035,0.012243,0.014311,0.001192,0.000699,0.019358
814,area,0.018765,0.099838,0.001991,0.002609,0.015097,0.013477,0.000973,0.000836,0.019198
26,probability,0.009176,0.007896,0.003374,0.104847,0.005193,0.001481,0.000089,0.000315,0.016546
...,...,...,...,...,...,...,...,...,...,...
25595,March NUMX NUMX,0.000002,0.000006,0.000008,0.000036,0.000002,0.000005,0.000010,0.000008,0.000010
87776,c f,0.000002,0.000006,0.000008,0.000036,0.000002,0.000005,0.000010,0.000008,0.000010
50747,NUMX D b,0.000002,0.000006,0.000008,0.000036,0.000002,0.000005,0.000010,0.000008,0.000010
67245,leqslant b,0.000002,0.000006,0.000008,0.000036,0.000002,0.000005,0.000010,0.000008,0.000010


In [244]:
df_ig.to_csv("Feature_Importance_no_stop.csv", index = False)