## 🐪 KAUST Academy: Classifying Math Problems
Goal: use my NLP background to produce a top-scoring solution (by F1) for the Kaggle competition.

### Environment Setup

In [2]:
%%capture

# install-time dependencies (shell commands, run before the imports that need them)
!pip install lightgbm
!python3 -m spacy download en_core_web_lg

# standard library
import pickle as pkl
import re
import warnings
from collections import Counter as count

# third-party
import numpy as np
import pandas as pd
import spacy
from lightgbm import LGBMClassifier
from lightgbm import LGBMRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import f1_score as f1, accuracy_score as accuracy
from spacy import displacy
from tqdm import tqdm

warnings.filterwarnings("ignore")

### Tools

In [3]:
import subprocess

def run(command, shell = False):
  ''' run the specified command via subprocess.run and return its exit code

  command : str  -- the command line to execute
  shell   : bool -- when True the string is handed to the shell unchanged;
                    when False it is tokenised with shlex.split, so quoted
                    arguments and repeated spaces are handled correctly

  returns : int  -- the exit status of the finished process
  '''
  import shlex  # local import: this helper is the only user

  if not shell: command = shlex.split(command)
  # subprocess.run blocks until the child exits, replacing the manual Popen + wait
  return subprocess.run(command, shell = shell).returncode

### Data Load

In [4]:
# Read the train / test CSV files into DataFrames. preprocess() later reads the
# `Question` and `label` columns of the training frame.
# NOTE(review): test.csv is also read from train-data/ -- confirm that is the intended directory.
df_tr = pd.read_csv("train-data/train.csv")
df_te = pd.read_csv("train-data/test.csv")

In [5]:
# Load the en_core_web_lg spaCy pipeline (downloaded in the setup cell);
# preprocess() calls it on every question to obtain POS tags and lemmas.
nlp = spacy.load("en_core_web_lg")

### Pipeline

In [30]:
## Hyperparameters
# Largest n-gram order emitted by preprocess(), which reads this name as a global.
# NOTE(review): the save/load cell further down reassigns max_n_gram = 3, so the
# value in effect depends on the order in which cells are executed.
max_n_gram = 1

In [7]:

# spaCy coarse POS tags whose tokens are dropped during preprocessing
bad_pos = set(
    ["PUNCT", "SPACE"]
)

# Literal substring substitutions, applied by preprocess() in insertion order.
# Every LaTeX command is written as a raw string so the backslash is never read
# as a Python escape ("\tan" would start with a TAB; "\c", "\s" are invalid
# escapes that newer Python versions warn about).
# The catch-all backslash rule must stay AFTER the named commands, otherwise
# they would never match.
replacements = {
    "$" : "",
    "+" : " + ",
    "-" : " - ",
    "*" : " * ",
    "=" : " = ",
    "^" : " ^ ",
    r"\cos" : r" \cos ",
    r"\sin" : r" \sin ",
    r"\tan" : r" \tan ",
    r"\sqrt" : r" \sqrt ",
    "{" : " { ",
    "}" : " } ",
    "\\" : " ",
    ")" : " ",
    "(" : " "
}

# Regex substitutions, applied after the literal ones.
# A number is an optional leading decimal point, then digits, then any further
# digit groups joined by "." or ",". A digit is required, so a lone full stop or
# comma is no longer rewritten to NUMX (the previous "[\d.,]+" pattern did that).
regex_replacements = {
    r"\.?\d+(?:[.,]\d+)*" : " NUMX "
}

def _get_n_grams(toks, n, sep = " "):
    ''' for a list of one-gram tokens, returns every contiguous n-gram joined by `sep` '''
    # a list of length L holds L - n + 1 windows of width n (none when L < n)
    return [sep.join(toks[i:i+n]) for i in range(len(toks) - n + 1)]


def preprocess(df_tr, chunk_size = 20):
    ''' encodes the sentences in df_tr as bag-of-n-grams term counts

    Each question is normalised with the module-level `replacements` and
    `regex_replacements`, parsed with `nlp`, stripped of tokens whose POS tag
    is in `bad_pos`, lemmatised, and expanded into all n-grams up to the
    module-level `max_n_gram`. Values are raw counts (no idf weighting).

    df_tr      : DataFrame with a `Question` (text) and a `label` column
    chunk_size : kept for backward compatibility only; rows are now gathered
                 in memory, so no temporary chunk files are written

    returns    : (X, y) -- X has one row per question, in input order, and one
                 column per n-gram, with absent n-grams filled with 0;
                 y is the matching Series of labels, named "label"
    '''

    records, labels = [], []

    for txt, label in tqdm(zip(df_tr.Question, df_tr.label), total = len(df_tr)):

        for k,v in replacements.items():
            txt = txt.replace(k,v)
        for k,v in regex_replacements.items():
            txt = re.sub(k,v,txt)

        # keep the lemma of every token that is not punctuation / whitespace
        toks = [tok.lemma_ for tok in nlp(txt) if tok.pos_ not in bad_pos]

        n_grams = toks[:]
        for n in range(2, max_n_gram+1):
            n_grams += _get_n_grams(toks, n=n)

        # labels are stored apart from the counts so that a token spelled
        # "label" can never overwrite the target
        records.append(dict(count(n_grams)))
        labels.append(label)

    # one construction from a list of dicts keeps the input row order and
    # avoids growing a DataFrame row by row
    X = pd.DataFrame(records).fillna(0)
    y = pd.Series(labels, name = "label")

    return X, y

# Rebuild the feature matrix from the raw questions (slow; off by default
# because the save/load cell restores a cached copy).
process_data = False
if process_data:
    chunk_size = 500
    # at most the first 10000 questions are encoded
    n_rows = min(10000, len(df_tr))

    X_parts, y_parts = [], []
    for start_idx in range(0, n_rows, chunk_size):
        end_idx = min(start_idx + chunk_size, n_rows)
        print(f"idx: {start_idx}-{end_idx}")
        X_sub, y_sub = preprocess(df_tr.iloc[start_idx: end_idx])
        X_parts.append(X_sub)
        y_parts.append(y_sub)

    # concatenate once at the end; chunks have different vocabularies, so
    # columns missing from a chunk come back as NaN and are filled with 0
    if X_parts:
        X = pd.concat(X_parts, ignore_index = True).fillna(0)
        y = pd.concat(y_parts, ignore_index = True)
    else:
        X, y = pd.DataFrame(), pd.Series(name = "label", dtype = "object")

In [12]:
# save / restore the feature matrix as a scipy sparse matrix plus its column index
max_n_gram = 3

# both file names embed the n-gram order selected above
matrix_path = f"processed-data/X{max_n_gram}.pkl"
columns_path = f"processed-data/cols{max_n_gram}.pkl"

save_data = False
if save_data:
    from scipy.sparse import csr_matrix
    X_sparse = csr_matrix(X.values)
    with open(matrix_path, "wb") as f:
        pkl.dump(X_sparse, f)
    with open(columns_path, "wb") as f:
        pkl.dump(X.columns, f)

load_data = True
if load_data:
    # NOTE: unpickling can execute arbitrary code -- only load files this notebook wrote
    with open(matrix_path, "rb") as f:
        X_sparse = pkl.load(f)
    with open(columns_path, "rb") as f:
        columns = pkl.load(f)

    # rebuild a sparse-backed DataFrame carrying the original n-gram column names
    X = pd.DataFrame.sparse.from_spmatrix(X_sparse, columns = columns)

In [15]:
X.shape

(10000, 277113)

In [None]:

# Quick single hold-out evaluation: validate on the first 1/n_folds of the
# rows, train on the rest, then refit on everything and pickle the model.
minibatch = 10000
X_sub, y_sub = X.iloc[:minibatch], df_tr.label.iloc[:minibatch]

model = LogisticRegression()

# identify model architecture via class membership
# (resolved before any fitting, so a problem in the lookup cannot throw away a
# finished training run)
arch = "other"
architectures = {
    "lr" : LogisticRegression,
    "ridge" : RidgeClassifier,
    "lgb" : LGBMClassifier
}
for k,v in architectures.items():
    if isinstance(model, v):
        arch = k

_len = len(X_sub)
n_folds = 5
partition_size = _len // n_folds
if partition_size == 0:
    raise ValueError(f"need at least {n_folds} rows to hold out a validation fold, got {_len}")

metrics = ["acc", "f1"]
ovr_metrics = {metric : 0 for metric in metrics}

# partition data into train, validation sets (first fold only)
start_idx, end_idx = 0, min((partition_size, _len))
X_v, y_v = X_sub.iloc[start_idx:end_idx], y_sub.iloc[start_idx:end_idx]
# nothing precedes the first fold, so the training set is simply the remainder
X_t, y_t = X_sub.iloc[end_idx:], y_sub.iloc[end_idx:]

# train the model and render predictions
model.fit(X_t, y_t)
y_pred_t = model.predict(X_t)
y_pred = model.predict(X_v)

# score the model on the validation rows, plus train F1 to expose overfitting
score = f1(y_v, y_pred, average = "micro")
acc = accuracy(y_v, y_pred)
score_t = f1(y_t, y_pred_t, average = "micro")

ovr_metrics["f1"] = score
ovr_metrics["acc"] = acc
ovr_metrics["train_f1"] = score_t

# save the model with metrics
model.fit(X_sub,y_sub)
acc, score = round(ovr_metrics["acc"],3), round(ovr_metrics["f1"],3)

with open(f"models/{arch}-{score}.pkl", "wb") as f:
    pkl.dump(model, f)

print(ovr_metrics)

In [13]:
## Training
# n_folds-fold cross-validation over contiguous row blocks, then a refit on all
# rows; the pickled model's file name carries the averaged metrics.

# NOTE(review): `minibatch` is not read anywhere in this cell
minibatch = 100
# take exactly as many labels as there are feature rows
X_sub, y_sub = X, df_tr.label.iloc[:len(X)]

model = LogisticRegression()

n_folds = 5
_len = len(X_sub)
partition_size = _len // n_folds
if partition_size == 0:
    raise ValueError(f"need at least {n_folds} rows for {n_folds}-fold validation, got {_len}")

ovr_metrics = {}
metrics = ["acc", "f1"]
for metric in metrics:
    ovr_metrics[metric] = 0

for fold in range(n_folds):

    # partition data into train, validation sets; the last fold absorbs the
    # remainder so there are exactly n_folds folds and the averages below
    # stay correct when _len is not a multiple of n_folds
    start_idx = fold * partition_size
    end_idx = _len if fold == n_folds - 1 else start_idx + partition_size
    X_v, y_v = X_sub.iloc[start_idx:end_idx], y_sub.iloc[start_idx:end_idx]
    X_t = pd.concat((X_sub.iloc[:start_idx], X_sub.iloc[end_idx:]))
    y_t = pd.concat((y_sub.iloc[:start_idx], y_sub.iloc[end_idx:]))

    # train the model and render predictions
    model.fit(X_t, y_t)
    y_pred = model.predict(X_v)

    # score the model
    # NOTE(review): for single-label multiclass targets micro-F1 equals
    # accuracy -- confirm which averaging the competition scores with
    score = f1(y_v, y_pred, average = "micro")
    acc = accuracy(y_v, y_pred)

    print(score)

    ovr_metrics["f1"] += score / n_folds
    ovr_metrics["acc"] += acc / n_folds

# save the model with metrics
model.fit(X_sub,y_sub)
acc, score = round(ovr_metrics["acc"],3), round(ovr_metrics["f1"],3)

# identify model architecture via class membership
arch = "other"
architectures = {
    "lr" : LogisticRegression,
    "ridge" : RidgeClassifier,
    "lgb" : LGBMClassifier
}
for k,v in architectures.items():
    if isinstance(model, v):
        arch = k

with open(f"models/{arch}-f1={score}+acc={acc}", "wb") as f:
    pkl.dump(model, f)

print(ovr_metrics)

KeyboardInterrupt: 