## 🐪 KAUST Academy: Classifying Math Problems
Goal: use my NLP background to build a top-scoring solution (by F1) for the Kaggle competition.

### Environment Setup

In [59]:
%%capture
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import spacy
from spacy import displacy
!python3 -m spacy download en_core_web_lg
from tqdm import tqdm
import re
from collections import Counter as count

from sklearn.metrics import f1_score as f1, accuracy_score as accuracy
import pickle as pkl
from sklearn.linear_model import RidgeClassifier
!pip install lightgbm
from lightgbm import LGBMRegressor, LGBMClassifier

import warnings
warnings.filterwarnings("ignore")

### Tools

In [60]:
import subprocess

def run(command, shell = False):
    '''
    run the specified command as a child process and wait for it to exit

    command: the full command line as a single string.
    shell: when False (default) the string is split on whitespace into an
        argument list; when True it is handed to the shell unsplit.

    returns the exit code of the child process.
    '''
    # split() without a separator collapses runs of whitespace, so a doubled
    # space no longer injects an empty-string argument into the argv list
    if not shell: command = command.split()
    return subprocess.run(command, shell = shell).returncode

### Data Load

In [62]:
# Read the two competition CSVs from train-data/. df_tr supplies the
# `Question` text and `label` columns used by preprocess and by training.
df_tr = pd.read_csv("train-data/train.csv")
df_te = pd.read_csv("train-data/test.csv")

In [63]:
# spaCy pipeline downloaded in the environment-setup cell; preprocess uses it
# to tokenise each question and to read each token's pos_ and lemma_.
nlp = spacy.load("en_core_web_lg")

### Pipeline

In [64]:
## Hyperparameters
# Largest n-gram order generated by preprocess (orders 1..max_n_gram); the
# value is also embedded in the processed-data pickle file names.
max_n_gram = 3

In [None]:

# coarse part-of-speech tags (spaCy `pos_`) whose tokens preprocess discards
bad_pos = {"PUNCT", "SPACE"}

# Literal substitutions that preprocess applies with str.replace, in insertion
# order. The order matters: the LaTeX commands are padded with spaces first,
# and only afterwards does the bare-backslash rule turn every remaining "\"
# into a space.
# Every backslash-bearing literal is a raw string (or an explicit "\\") so no
# entry depends on an unrecognised escape such as "\c" or "\s" being passed
# through unchanged; the runtime values are the same as before.
replacements = {
    "$" : "",
    "+" : " + ",
    "-" : " - ",
    "*" : " * ",
    "=" : " = ",
    "^" : " ^ ",
    r"\cos" : r" \cos ",
    r"\sin" : r" \sin ",
    r"\tan" : r" \tan ",
    r"\sqrt" : r" \sqrt ",
    "{" : " { ",
    "}" : " } ",
    "\\" : " ",
    ")" : " ",
    "(" : " ",
    "'" : " ",
}

# Regex substitutions applied with re.sub after the literal ones.
# The character class contains "." and "," as well as digits, so a run made
# only of those characters (e.g. a sentence-final ".") is replaced by the
# NUMX placeholder too.
regex_replacements = {
    r"[\d.,]+" : " NUMX ",
}

def preprocess(df_tr, chunk_size = 50):
    '''
    encodes the sentences in df_tr as bag-of-n-grams term-count features

    df_tr: frame with a `Question` column (raw text) and a `label` column.
    chunk_size: number of rows turned into a dense frame at a time before the
        pieces are concatenated; must be at least 1.

    returns (X, y):
    - X has one row per question and one column per n-gram of order
      1..max_n_gram; cells hold raw occurrence counts (no idf weighting),
      with 0 where the n-gram does not occur.
    - y is the label Series in the same row order.
    Both carry a fresh 0..n-1 index.

    raises ValueError if chunk_size is smaller than 1.
    '''

    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    def get_n_grams(toks, n, sep = " "):
        ''' for a list of one-gram tokens, returns all n-grams '''
        # k tokens contain k - n + 1 contiguous n-grams; the range is empty
        # when the text is shorter than n tokens
        return [sep.join(toks[i:i+n]) for i in range(len(toks) - n + 1)]

    term_freqs, labels = [], []

    for txt, label in tqdm(zip(df_tr.Question, df_tr.label), total = len(df_tr)):

        # normalise the raw text: literal substitutions first, then regexes
        for k, v in replacements.items():
            txt = txt.replace(k, v)
        for k, v in regex_replacements.items():
            txt = re.sub(k, v, txt)

        # keep the lemma of every token whose part of speech is not excluded
        toks = [tok.lemma_ for tok in nlp(txt) if tok.pos_ not in bad_pos]

        n_grams = toks[:]
        for n in range(2, max_n_gram + 1):
            n_grams += get_n_grams(toks, n = n)

        # labels are stored apart from the features so that an n-gram that is
        # literally "label" cannot overwrite the target value
        term_freqs.append(dict(count(n_grams)))
        labels.append(label)

    # build the table chunk by chunk and concatenate once, instead of growing
    # a DataFrame row by row or round-tripping chunks through temporary files
    frames = [
        pd.DataFrame(term_freqs[start:start + chunk_size])
        for start in range(0, len(term_freqs), chunk_size)
    ]
    X = pd.concat(frames, ignore_index = True) if frames else pd.DataFrame()
    X.fillna(0, inplace = True)

    y = pd.Series(labels, name = "label")

    return X, y

# Set to True to rebuild the feature table from df_tr (slow: runs spaCy on
# every question).
process_data = False
if process_data:
    # rows handed to preprocess per call
    chunk_size = 1000
    X_parts, y_parts = [], []
    # cover every row of df_tr, so X stays row-aligned with df_tr.label
    for start_idx in range(0, len(df_tr), chunk_size):
        end_idx = min(start_idx + chunk_size, len(df_tr))
        print(f"idx: {start_idx}-{end_idx}")
        X_sub, y_sub = preprocess(df_tr.iloc[start_idx: end_idx])
        X_parts.append(X_sub)
        y_parts.append(y_sub)

    # concatenate once; ignore_index gives X and y a single 0..n-1 index
    X = pd.concat(X_parts, ignore_index = True)
    y = pd.concat(y_parts, ignore_index = True)

    # n-grams that are absent from a whole slice come back as NaN after concat
    X.fillna(0, inplace = True)

In [80]:
# save data as a sparse matrix

# Each flag below enables one persistence step; all are off by default.
# NOTE(review): the save step writes un-suffixed file names while the loaders
# read "_1" / "_v" suffixed ones -- presumably the files were renamed by hand
# between runs; confirm before relying on a save/load round trip.

save_data = False
if save_data:
    from scipy.sparse import csr_matrix
    # the matrix and the column names must come from the same table: X
    X_sparse  = csr_matrix(X.values)
    with open(f"processed-data/X{max_n_gram}.pkl", "wb") as f:
        pkl.dump(X_sparse, f)
    with open(f"processed-data/cols{max_n_gram}.pkl", "wb") as f:
        pkl.dump(X.columns, f)

# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# produced by this notebook.
load_data = False
if load_data:
    with open(f"processed-data/X{max_n_gram}_1.pkl", "rb") as f:
        X_sparse = pkl.load(f)
    with open(f"processed-data/cols{max_n_gram}_1.pkl", "rb") as f:
        columns = pkl.load(f)

    # rebuild a sparse-backed DataFrame and restore the n-gram column names
    X = pd.DataFrame.sparse.from_spmatrix(X_sparse)
    X.columns = columns

load_semi_supervised = False
if load_semi_supervised:
    with open(f"processed-data/X{max_n_gram}_v.pkl", "rb") as f:
        X_sparse = pkl.load(f)
    with open(f"processed-data/cols{max_n_gram}_v.pkl", "rb") as f:
        columns = pkl.load(f)

    # same reconstruction as above, stored separately as X_v
    X_v = pd.DataFrame.sparse.from_spmatrix(X_sparse)
    X_v.columns = columns

In [None]:
# Train and evaluate a classifier on the selected n-gram features, then refit
# it on all rows and pickle it under models/.
# NOTE: X_sub is the same object as X here, so the in-place drop below also
# removes the "label" column from X itself.
X_sub, y_sub = X, df_tr.label

# remove label & select features
if "label" in X_sub.columns:
    X_sub.drop("label", axis = 1, inplace = True)

# select_top_n and df_ig are defined in the information-gain cells further
# down the notebook; those cells must have been executed before this one.
columns = select_top_n(df_ig, by_class = True, n = 15000)
X_sub = X_sub[columns]

# define model class
model = LogisticRegression(
    C = 0.6
)

# contiguous partitions of partition_size rows each
_len = len(X_sub)
n_folds = 5
partition_size = _len // n_folds

ovr_metrics = {}
for start_idx in range(0, _len, partition_size):

    # partition data into train, validation sets
    # rows [start_idx, end_idx) are held out, every other row is trained on
    end_idx = start_idx + partition_size
    end_idx = min((end_idx,_len))
    X_v, y_v = X_sub.iloc[start_idx:end_idx].to_numpy(), y_sub.iloc[start_idx:end_idx]
    X_t = pd.concat((X_sub.iloc[:start_idx], X_sub.iloc[end_idx:])).to_numpy()
    y_t = pd.concat((y_sub.iloc[:start_idx], y_sub.iloc[end_idx:]))

    # train the model and render predictions
    model.fit(X_t, y_t)
    y_pred_t = model.predict(X_t)
    y_pred = model.predict(X_v)

    # score the model
    # micro-averaged F1 on the held-out rows and on the training rows
    score = f1(y_v, y_pred, average = "micro")
    score_t = f1(y_t, y_pred_t, average = "micro")

    ovr_metrics["test_f1"] = score
    ovr_metrics["train_f1"] = score_t

    # NOTE: unconditional break -- only the first partition (start_idx = 0) is
    # ever evaluated, so the metrics describe a single hold-out split rather
    # than an average over n_folds folds.
    break

# save the model with metrics
# refit on every row before pickling; the rounded held-out score of the single
# evaluated partition becomes part of the file name
model.fit(X_sub.to_numpy(),y_sub)
score = round(ovr_metrics["test_f1"],3)

# identify model architecture via class membership
# arch stays "other" when the model is an instance of none of these classes
arch = "other"
architectures = {
    "lr" : LogisticRegression,
    "ridge" : RidgeClassifier,
    "lgbm" : LGBMClassifier
}
for k,v in architectures.items():
    if isinstance(model, v):
        arch = k

# the models/ directory is not created here and has to exist already
with open(f"models/{arch}-{score}.pkl", "wb") as f:
    pkl.dump(model, f)

print(ovr_metrics)

{'test_f1': 0.7893961708394698, 'train_f1': 0.9919038272816487}


In [None]:
# Rank the selected features by the magnitude of their fitted coefficients:
# absolute values of model.coef_ averaged over axis 0 (presumably one row per
# class -- confirm), sorted from largest to smallest and written to CSV.
feats = pd.DataFrame({
    "feature" : columns,
    "coef" : np.abs(model.coef_).mean(axis = 0),
})
feats = feats.sort_values("coef", ascending = False)
feats.to_csv("Feature_Importance.csv", index = False)

### Feature Selection via Info Gain

In [81]:
from math import log

def entropy(x):
    '''
    Shannon entropy, in bits, of a binary label vector

    x: array-like of 0/1 labels (pandas Series, numpy array or plain list).

    returns -sum(p * log2(p)) over the labels 0 and 1, where p is the
    fraction of entries equal to that label; labels that never occur
    contribute nothing. An empty input has no uncertainty and returns 0.0.
    '''
    values = np.asarray(x)

    # the mean of an empty array is NaN, which would otherwise propagate into
    # every information-gain score computed from an empty split
    if values.size == 0:
        return 0.0

    out = 0
    for label in [0, 1]:
        p = (values == label).mean()
        if p == 0: continue
        out -= p * log(p) / log(2)
    return out

In [None]:
# Build df_ig: one row per feature column of X and one IG{label} column per
# class, holding the information gain of the split "feature is non-zero" vs
# "feature is zero" with respect to the one-vs-rest target for that class.
df_ig = pd.DataFrame()
y = df_tr.label

# assumes the class ids are the integers 0..7 -- confirm against the data
for label in range(0,8):
    print(f"label = {label}")

    # reduce to a two-class problem
    y_ = (y == label).astype(int)
    # entropy of the one-vs-rest target before any split
    entropy0 = entropy(y_)

    # NOTE: df is assigned here but not read anywhere in this cell
    df = pd.DataFrame()
    scores = np.zeros(len(X.columns))

    for i,col in tqdm(enumerate(X.columns)):

        predictor = X[col]

        # y1: targets of rows where the feature occurs, y2: where it does not
        y1, y2 = y_[predictor != 0], y_[predictor == 0]
        l0, l1, l2 = len(y_), len(y1), len(y2)

        # prior entropy minus the size-weighted entropies of the two subsets
        IG = entropy0 - (l1/l0 * entropy(y1)) - (l2 / l0 * entropy(y2))

        scores[i] = IG

    # the feature-name column is added once, on the first class
    if len(df_ig) == 0:
        df_ig["col"] = X.columns

    df_ig[f"IG{label}"] = scores

label = 0


281614it [01:23, 3371.27it/s]


         col       IG0
0          a  0.020800
1  solitaire  0.000042
2       game  0.002517
3         be  0.027708
4       play  0.001907
label = 1


281614it [01:24, 3337.22it/s]


         col       IG0       IG1
0          a  0.020800  0.025816
1  solitaire  0.000042  0.000039
2       game  0.002517  0.003644
3         be  0.027708  0.019386
4       play  0.001907  0.003087
label = 2


281614it [01:24, 3342.58it/s]


         col       IG0       IG1       IG2
0          a  0.020800  0.025816  0.008881
1  solitaire  0.000042  0.000039  0.000015
2       game  0.002517  0.003644  0.001185
3         be  0.027708  0.019386  0.020336
4       play  0.001907  0.003087  0.001639
label = 3


281614it [01:25, 3276.76it/s]


         col       IG0       IG1       IG2       IG3
0          a  0.020800  0.025816  0.008881  0.005547
1  solitaire  0.000042  0.000039  0.000015  0.000470
2       game  0.002517  0.003644  0.001185  0.003645
3         be  0.027708  0.019386  0.020336  0.007355
4       play  0.001907  0.003087  0.001639  0.002104
label = 4


281614it [01:23, 3362.33it/s]


         col       IG0       IG1       IG2       IG3       IG4
0          a  0.020800  0.025816  0.008881  0.005547  0.004205
1  solitaire  0.000042  0.000039  0.000015  0.000470  0.000026
2       game  0.002517  0.003644  0.001185  0.003645  0.001608
3         be  0.027708  0.019386  0.020336  0.007355  0.000287
4       play  0.001907  0.003087  0.001639  0.002104  0.001068
label = 5


281614it [01:23, 3357.41it/s]


         col       IG0       IG1       IG2       IG3       IG4       IG5
0          a  0.020800  0.025816  0.008881  0.005547  0.004205  0.011488
1  solitaire  0.000042  0.000039  0.000015  0.000470  0.000026  0.000028
2       game  0.002517  0.003644  0.001185  0.003645  0.001608  0.008066
3         be  0.027708  0.019386  0.020336  0.007355  0.000287  0.014090
4       play  0.001907  0.003087  0.001639  0.002104  0.001068  0.009715
label = 6


281614it [01:22, 3411.77it/s]


         col       IG0       IG1       IG2       IG3       IG4       IG5  \
0          a  0.020800  0.025816  0.008881  0.005547  0.004205  0.011488   
1  solitaire  0.000042  0.000039  0.000015  0.000470  0.000026  0.000028   
2       game  0.002517  0.003644  0.001185  0.003645  0.001608  0.008066   
3         be  0.027708  0.019386  0.020336  0.007355  0.000287  0.014090   
4       play  0.001907  0.003087  0.001639  0.002104  0.001068  0.009715   

        IG6  
0  0.000252  
1  0.000001  
2  0.000154  
3  0.000968  
4  0.000150  
label = 7


281614it [01:21, 3437.80it/s]

         col       IG0       IG1       IG2       IG3       IG4       IG5  \
0          a  0.020800  0.025816  0.008881  0.005547  0.004205  0.011488   
1  solitaire  0.000042  0.000039  0.000015  0.000470  0.000026  0.000028   
2       game  0.002517  0.003644  0.001185  0.003645  0.001608  0.008066   
3         be  0.027708  0.019386  0.020336  0.007355  0.000287  0.014090   
4       play  0.001907  0.003087  0.001639  0.002104  0.001068  0.009715   

        IG6           IG7  
0  0.000252  1.665554e-05  
1  0.000001  1.200247e-06  
2  0.000154  9.061636e-04  
3  0.000968  9.412353e-04  
4  0.000150  7.247519e-07  





In [109]:
def select_top_n(df_ig, by_class = True, n = 500, n_classes = 8):
    '''
    returns a list of column names made of:
    - the top "n" columns by IG_ovr if by_class == False
    - the union of the top "n" columns by IG{k} for k on [0, n_classes)
      if by_class == True

    df_ig: frame with a `col` column of feature names plus the score columns
        named above; it is read only and left in its original row order.
    n: number of features taken per ranking.
    n_classes: number of per-class score columns to consult (default 8).

    The result holds no duplicates and its order is deterministic (order of
    first selection), so a feature matrix built from it has the same column
    layout on every run.
    '''

    if by_class:
        score_cols = [f"IG{label}" for label in range(n_classes)]
    else:
        score_cols = ["IG_ovr"]

    # a dict is used as an insertion-ordered set
    selected = {}
    for score_col in score_cols:
        # sort a copy instead of reordering the caller's frame; the stable
        # sort breaks ties by the original row order
        ranked = df_ig.sort_values(score_col, ascending = False, kind = "stable")
        selected.update(dict.fromkeys(ranked["col"].iloc[:n].tolist()))

    return list(selected)

# NOTE: by_class = False ranks by the IG_ovr column, which is only added to
# df_ig by the row_avg cell further down; that cell has to be run first.
columns = select_top_n(df_ig, by_class = False, n = 100)

In [92]:
def row_avg(row):
    ''' unweighted mean of the eight per-class scores IG0 .. IG7 held in row '''
    n_classes = 8
    # each score is divided before summing, in class order 0..7
    return sum(row[f"IG{label}"] / n_classes for label in range(n_classes))

# overall score per feature: row_avg applied to every row of df_ig
df_ig["IG_ovr"] = df_ig.apply(row_avg, axis = 1)
# not in place and not assigned: this only displays the ranking as cell output
df_ig.sort_values("IG_ovr", ascending = False)

Unnamed: 0,col,IG0,IG1,IG2,IG3,IG4,IG5,IG6,IG7,IG_ovr
1229,triangle,0.034141,0.148075,0.012226,0.001353,0.021163,0.008377,0.001388,0.001193,0.028490
2543,angle,0.031321,0.138155,0.007460,0.002178,0.018494,0.016205,0.000375,0.000889,0.026885
1341,area,0.018765,0.099838,0.001991,0.002609,0.015097,0.013477,0.000973,0.000836,0.019198
1960,',0.086671,0.017862,0.006425,0.001401,0.013064,0.012129,0.001276,0.000738,0.017446
1235,side,0.023776,0.085014,0.009678,0.000005,0.013093,0.001987,0.001143,0.000982,0.016960
...,...,...,...,...,...,...,...,...,...,...
65139,final solution,0.000006,0.000003,0.000030,0.000026,0.000002,0.000001,0.000007,0.000006,0.000010
35564,NUMX March,0.000002,0.000006,0.000008,0.000036,0.000002,0.000005,0.000010,0.000008,0.000010
3491,of solution to,0.000002,0.000006,0.000008,0.000036,0.000002,0.000005,0.000010,0.000008,0.000010
35598,March NUMX NUMX,0.000002,0.000006,0.000008,0.000036,0.000002,0.000005,0.000010,0.000008,0.000010


In [94]:
# NOTE(review): the model-coefficient cell earlier in the notebook writes
# `feats` to this same path, so whichever of the two cells runs last
# overwrites the other's file.
df_ig.to_csv("Feature_Importance.csv", index = False)