## 🐪 Kaust Academy: Classifying Math Probems
Goal: to use my NLP background to produce a top solution by f1 for kaggle.

In [13]:
## hyperparameters
max_n_gram = 3

### Environment Setup

In [3]:
%%capture
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import spacy
from spacy import displacy
from tqdm import tqdm
import re
from collections import Counter as count

from sklearn.metrics import f1_score as f1, accuracy_score as accuracy
import pickle as pkl
from sklearn.linear_model import RidgeClassifier
from lightgbm import LGBMRegressor, LGBMClassifier

import warnings
warnings.filterwarnings("ignore")

### Tools

In [4]:
import subprocess

def run(command, shell = False):
  ''' run the specified command via subprocess.run '''
  if not shell: command = command.split(" ")
  process = subprocess.Popen(command, shell = shell) 
  process.wait()

### Data Load

In [5]:
df_tr = pd.read_csv("train-data/train.csv")
df_te = pd.read_csv("train-data/test.csv")

In [6]:
nlp = spacy.load("en_core_web_lg")

In [16]:
# save data as a sparse matrix

save_data = False
if save_data:   
    from scipy.sparse import csr_matrix
    X_sparse  = csr_matrix(X_v.values)
    with open(f"processed-data/X{max_n_gram}.pkl", "wb") as f:
        pkl.dump(X_sparse, f)
    with open(f"processed-data/cols{max_n_gram}.pkl", "wb") as f:
        pkl.dump(X.columns, f)

load_data = True
if load_data:
    with open(f"processed-data/X{max_n_gram}_1.pkl", "rb") as f:
        X_sparse = pkl.load(f)
    with open(f"processed-data/cols{max_n_gram}_1.pkl", "rb") as f:
        columns = pkl.load(f)

    X = pd.DataFrame.sparse.from_spmatrix(X_sparse)
    X.columns = columns

load_semi_supervised = False
if load_semi_supervised:
    with open(f"processed-data/X{max_n_gram}_v.pkl", "rb") as f:
        X_sparse = pkl.load(f)
    with open(f"processed-data/cols{max_n_gram}_v.pkl", "rb") as f:
        columns = pkl.load(f)

    X_v = pd.DataFrame.sparse.from_spmatrix(X_sparse)
    X_v.columns = columns

df_ig = pd.read_csv("Feature_Importance_Aug.csv")

### Definitions

In [8]:
def select_top_n(df_ig, by_class = True, n = 500):
    ''' 
    returns a column set of:
    - the top "n" columns by IG_ovr if by_class == False
    - the top "n" columns by IG{n} for n on [0,8) if by_class == True
    '''

    columns = set()

    if by_class:
        for label in range(8):

            df_ig.sort_values(f"IG{label}", ascending = False, inplace = True)
            columns.update(set(df_ig.col[:n].tolist()))

    else:
        df_ig.sort_values(f"IG_ovr", ascending = False, inplace = True)
        columns.update(set(df_ig.col[:n].tolist()))

    return list(columns)

### Pipeline

In [9]:
## Hyperparameters
max_n_gram = 3

In [10]:

bad_pos = set(
    ["PUNCT", "SPACE"]
)

replacements = {
    "$" : "",
    "+" : " + ",
    "-" : " - ",
    "*" : " * ",
    "=" : " = ",
    "^" : " ^ ",
    "\cos" : " \cos ",
    "\sin" : " \sin ",
    r"\tan" : r" \tan ",
    "\sqrt" : " \sqrt ",
    "{" : " { ",
    "}" : " } ",
    "\\" : " ",
    ")" : " ",
    "(" : " ",
    "'" : " "
}

regex_replacements = {
    "[\d.,]+" : " NUMX "
}

def preprocess(df_tr, chunk_size = 50):
    ''' encodes the sentences in df_tr as bag-of-words tf-idf features '''

    X = pd.DataFrame()
    _len = len(df_tr)

    for i,(txt,label) in tqdm(enumerate(zip(df_tr.Question, df_tr.label))):

        for k,v in replacements.items():
            txt = txt.replace(k,v)
        for k,v in regex_replacements.items():
            txt = re.sub(k,v,txt)

        doc = nlp(txt)

        toks = []
        for tok in doc:

            is_stop = tok.is_stop
            is_bad_pos = tok.pos_ in bad_pos

            if is_bad_pos: continue

            toks.append(tok.lemma_)

        def get_n_grams(toks, n, sep = " "):
            ''' for a list of one-gram tokens, returns all n-grams '''

            out = []
            _len = len(toks)
            for i in range(_len - n - 1):
                out.append(sep.join(toks[i:i+n]))

            return out

        n_grams = toks[:]
        for n in range(2, max_n_gram+1):
            n_grams += get_n_grams(toks, n=n)

        term_freq = count(n_grams)

        # append to the dataframe
        row = pd.DataFrame()
        row["label"] = [label]
        for k in term_freq:
            row[k] = [term_freq[k]]

        X = pd.concat((X,row))
        
        # output chunks to improve runtime efficiency
        i += 1
        if i % chunk_size == 0:
            X.to_csv(f"temp-data/{i}.csv", index = False)
            X = pd.DataFrame()

    # read and merge chunks
    for i in tqdm(range(chunk_size,_len+1,chunk_size)):
        df_sub = pd.read_csv(f"temp-data/{i}.csv")
        X = pd.concat((X, df_sub))

    # erase all chunks
    run("rm -r temp-data")
    run("mkdir temp-data")
    
    X.fillna(0, inplace = True)

    y, X = X.label, X.drop("label", axis = 1)

    return X, y

process_data = False
if process_data:
    chunk_size = 1000
    X, y = pd.DataFrame(), pd.DataFrame()
    for start_idx in range(0, 10000, chunk_size):
        end_idx = start_idx + chunk_size
        print(f"idx: {start_idx}-{end_idx}")
        X_sub, y_sub = preprocess(df_tr.iloc[start_idx: end_idx])
        X, y = pd.concat((X, X_sub)), pd.concat((y, y_sub))

    X.fillna(0, inplace = True)

In [11]:
## augmentation: add data
df_aug = pd.read_csv("data-augment/out/linalg1.csv")
X_aug, y_aug = preprocess(df_aug)

258it [00:22, 11.37it/s]
100%|██████████| 5/5 [00:01<00:00,  2.70it/s]


In [37]:
columns = select_top_n(df_ig, by_class = True, n = 15000)

X_sub, y_sub = X, df_tr.label

curr_cols = set(X_sub.columns.tolist())
exp_cols = set(columns)
new_cols = list(exp_cols - curr_cols)
X_sub[new_cols] = 0

# remove label & select features
if "label" in X_sub.columns:
    X_sub.drop("label", axis = 1, inplace = True)

X_sub = X_sub[columns]

# define model class
model = LGBMClassifier(
    reg_alpha = 0.5,
    verbose = 0
)

_len = len(X_sub)
n_folds = 5
partition_size = _len // n_folds

ovr_metrics = {}
for start_idx in range(0, _len, partition_size):

    # partition data into train, validation sets
    end_idx = start_idx + partition_size
    end_idx = min((end_idx,_len))
    X_v, y_v = X_sub.iloc[start_idx:end_idx].to_numpy(), y_sub.iloc[start_idx:end_idx]
    X_t = pd.concat((X_sub.iloc[:start_idx], X_sub.iloc[end_idx:]))
    y_t = pd.concat((y_sub.iloc[:start_idx], y_sub.iloc[end_idx:]))

    # introduce augmentation data to the train set
    X_t = pd.concat((X_t, X_aug))[columns].to_numpy()
    y_t = pd.concat((y_t, y_aug))

    # train the model and render predictions
    model.fit(X_t, y_t)
    y_pred_t = model.predict(X_t)
    y_pred = model.predict(X_v)

    # score the model
    score = f1(y_v, y_pred, average = "micro")
    score_t = f1(y_t, y_pred_t, average = "micro")

    ovr_metrics["test_f1"] = score
    ovr_metrics["train_f1"] = score_t

    # get model results by class
    for cls in range(8):
        is_cls = y_v == cls
        X_cls, y_cls = X_v[is_cls], y_v[is_cls]
        y_pred_cls = model.predict(X_cls)
        ovr_metrics[f"f1_{cls}"] = f1(y_cls, y_pred_cls, average = "micro")
    break

# save the model with metrics
model.fit(X_sub.to_numpy(),y_sub)
score = round(ovr_metrics["test_f1"],3)

# identify model architecture via class membership
arch = "other"
architectures = {
    "lr" : LogisticRegression,
    "ridge" : RidgeClassifier,
    "lgbm" : LGBMClassifier
}
for k,v in architectures.items():
    if isinstance(model, v):
        arch = k

with open(f"models/{arch}-{score}.pkl", "wb") as f:
    pkl.dump(model, f)

print(ovr_metrics)

Resizing X_sub...
{'test_f1': 0.8227785959744722, 'train_f1': 0.9877526753864447, 'f1_0': 0.865979381443299, 'f1_1': 0.9206049149338374, 'f1_2': 0.6751269035532995, 'f1_3': 0.7710843373493976, 'f1_4': 0.7768361581920904, 'f1_5': 0.7790368271954674, 'f1_6': 0.42857142857142855, 'f1_7': 0.7272727272727273}


In [None]:
feats = pd.DataFrame()
feats["feature"] = columns
feats["coef"] = np.abs(model.coef_).mean(axis = 0)
feats.sort_values("coef", ascending = False, inplace = True)
feats.to_csv("Feature_Importance.csv", index = False)

### Feature Selection via Info Gain

In [81]:
from math import log

def entropy(x):
  out = 0
  for label in [0,1]:
    p = (x == label).mean()
    if p == 0: continue
    p = p * log(p) / log(2)
    out -= p
  return out

In [138]:
df_ig = pd.DataFrame()
y = df_tr.label

# consolidating data sources
print("Consolidating data sources...")
X_ig = pd.concat((X,X_aug)).fillna(0)
y_ig = pd.concat((y,y_aug))

for label in range(0,8):
    print(f"label = {label}")

    # reduce to a two-class problem
    y_ = (y_ig == label).astype(int)
    entropy0 = entropy(y_)

    df = pd.DataFrame()
    scores = np.zeros(len(X_ig.columns))

    for i,col in tqdm(enumerate(X_ig.columns)):

        predictor = X_ig[col]

        y1, y2 = y_[predictor != 0], y_[predictor == 0]
        l0, l1, l2 = len(y_), len(y1), len(y2)

        IG = entropy0 - (l1/l0 * entropy(y1)) - (l2 / l0 * entropy(y2))

        scores[i] = IG

    if len(df_ig) == 0:
        df_ig["col"] = X_ig.columns

    df_ig[f"IG{label}"] = scores

Consolidating data sources...
label = 0


295964it [01:29, 3310.30it/s]


label = 1


295964it [01:26, 3433.85it/s]


label = 2


295964it [01:25, 3465.05it/s]


label = 3


295964it [01:25, 3448.60it/s]


label = 4


295964it [01:25, 3471.00it/s]


label = 5


295964it [01:26, 3438.49it/s]


label = 6


295964it [01:25, 3461.57it/s]


label = 7


295964it [01:25, 3477.17it/s]


In [140]:
def row_avg(row):
    n_classes = 8

    out = 0
    for label in range(n_classes):
        out += row[f"IG{label}"] / n_classes

    return out

df_ig["IG_ovr"] = df_ig.apply(row_avg, axis = 1)
df_ig.sort_values("IG_ovr", ascending = False)

Unnamed: 0,col,IG0,IG1,IG2,IG3,IG4,IG5,IG6,IG7,IG_ovr
1229,triangle,0.032121,0.147341,0.011540,0.001238,0.019954,0.007669,0.004771,0.001133,0.028221
2543,angle,0.029678,0.136704,0.007025,0.002046,0.017537,0.015294,0.002233,0.000846,0.026420
1341,area,0.017595,0.099129,0.001803,0.002465,0.014274,0.012685,0.002762,0.000795,0.018939
1960,',0.086399,0.016801,0.006045,0.001305,0.012325,0.011401,0.000022,0.000702,0.016875
1235,side,0.022323,0.084777,0.009144,0.000001,0.012285,0.001720,0.003323,0.000934,0.016814
...,...,...,...,...,...,...,...,...,...,...
143466,n end,0.000005,0.000002,0.000031,0.000025,0.000003,0.000001,0.000024,0.000006,0.000012
107325,c satisfy the,0.000005,0.000002,0.000031,0.000025,0.000003,0.000001,0.000024,0.000006,0.000012
44467,q NUMX a,0.000005,0.000002,0.000031,0.000025,0.000003,0.000001,0.000024,0.000006,0.000012
133637,a closed,0.000005,0.000002,0.000031,0.000025,0.000003,0.000001,0.000024,0.000006,0.000012


In [141]:
df_ig.to_csv("Feature_Importance_Aug.csv", index = False)