## 🐪 Kaust Academy: Classifying Math Probems
Goal: to use my NLP background to produce a top solution by f1 for kaggle.

In [1]:
## hyperparameters
max_n_gram = 3
is_augment = False

### Environment Setup

In [41]:
%%capture
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import spacy
from spacy import displacy
from tqdm import tqdm
import re
from collections import Counter as count

from sklearn.metrics import f1_score as f1, accuracy_score as accuracy
import pickle as pkl
from sklearn.linear_model import RidgeClassifier
from lightgbm import LGBMRegressor, LGBMClassifier

import warnings
warnings.filterwarnings("ignore")

### Tools

In [3]:
import subprocess

def run(command, shell = False):
  ''' run the specified command via subprocess.run '''
  if not shell: command = command.split(" ")
  process = subprocess.Popen(command, shell = shell) 
  process.wait()

### Data Load

In [8]:
df_tr = pd.read_csv("train-data/train.csv")
df_te = pd.read_csv("train-data/test.csv")

In [5]:
nlp = spacy.load("en_core_web_lg")

In [None]:
# save data as a sparse matrix

save_data = False
if save_data:   
    from scipy.sparse import csr_matrix
    X_sparse  = csr_matrix(X.values)
    with open(f"processed-data/X{max_n_gram}_no_stop.pkl", "wb") as f:
        pkl.dump(X_sparse, f)
    with open(f"processed-data/cols{max_n_gram}_no_stop.pkl", "wb") as f:
        pkl.dump(X.columns, f)

load_data = False
if load_data:
    with open(f"processed-data/X{max_n_gram}_1.pkl", "rb") as f:
        X_sparse = pkl.load(f)
    with open(f"processed-data/cols{max_n_gram}_1.pkl", "rb") as f:
        columns = pkl.load(f)

    X = pd.DataFrame.sparse.from_spmatrix(X_sparse)
    X.columns = columns

load_semi_supervised = False
if load_semi_supervised:
    with open(f"processed-data/X{max_n_gram}_v.pkl", "rb") as f:
        X_sparse = pkl.load(f)
    with open(f"processed-data/cols{max_n_gram}_v.pkl", "rb") as f:
        columns = pkl.load(f)

    X_v = pd.DataFrame.sparse.from_spmatrix(X_sparse)
    X_v.columns = columns

df_ig = pd.read_csv("Feature_Importance_Aug.csv")

NameError: name 'X' is not defined

### Definitions

In [7]:
def select_top_n(df_ig, by_class = True, n = 500):
    ''' 
    returns a column set of:
    - the top "n" columns by IG_ovr if by_class == False
    - the top "n" columns by IG{n} for n on [0,8) if by_class == True
    '''

    columns = set()

    if by_class:
        for label in range(8):

            df_ig.sort_values(f"IG{label}", ascending = False, inplace = True)
            columns.update(set(df_ig.col[:n].tolist()))

    else:
        df_ig.sort_values(f"IG_ovr", ascending = False, inplace = True)
        columns.update(set(df_ig.col[:n].tolist()))

    return list(columns)

### Preprocessing

In [None]:
df_te["label"] = -1
df_combined = pd.concat((df_tr, df_te))
df_combined.drop("id", axis = 1, inplace = True)

In [18]:

bad_pos = set(
    ["PUNCT", "SPACE"]
)

replacements = {
    "$" : "",
    "+" : " + ",
    "-" : " - ",
    "*" : " * ",
    "=" : " = ",
    "^" : " ^ ",
    "\cos" : " \cos ",
    "\sin" : " \sin ",
    r"\tan" : r" \tan ",
    "\sqrt" : " \sqrt ",
    "{" : " { ",
    "}" : " } ",
    "\\" : " ",
    ")" : " ",
    "(" : " ",
    "'" : " ",
    "]" : " ",
    "[" : " "
}

regex_replacements = {
    "[\d.,]+" : " NUMX "
}

def preprocess(df_tr, chunk_size = 50):
    ''' encodes the sentences in df_tr as bag-of-words tf-idf features '''

    X = pd.DataFrame()
    _len = len(df_tr)

    for i,(txt,label) in tqdm(enumerate(zip(df_tr.Question, df_tr.label))):

        for k,v in replacements.items():
            txt = txt.replace(k,v)
        for k,v in regex_replacements.items():
            txt = re.sub(k,v,txt)

        doc = nlp(txt)

        toks = []
        for tok in doc:

            is_stop = tok.is_stop
            is_bad_pos = tok.pos_ in bad_pos

            if is_bad_pos or is_stop: continue

            toks.append(tok.lemma_)

        def get_n_grams(toks, n, sep = " "):
            ''' for a list of one-gram tokens, returns all n-grams '''

            out = []
            _len = len(toks)
            for i in range(_len - n - 1):
                out.append(sep.join(toks[i:i+n]))

            return out

        n_grams = toks[:]
        for n in range(2, max_n_gram+1):
            n_grams += get_n_grams(toks, n=n)

        term_freq = count(n_grams)

        # append to the dataframe
        row = pd.DataFrame()
        row["label"] = [label]
        for k in term_freq:
            row[k] = [term_freq[k]]

        X = pd.concat((X,row))
        
        # output chunks to improve runtime efficiency
        i += 1
        if i % chunk_size == 0:
            X.to_csv(f"temp-data/{i}.csv", index = False)
            X = pd.DataFrame()

    # read and merge chunks
    for i in tqdm(range(chunk_size,_len+1,chunk_size)):
        df_sub = pd.read_csv(f"temp-data/{i}.csv")
        X = pd.concat((X, df_sub))

    # erase all chunks
    run("rm -r temp-data")
    run("mkdir temp-data")
    
    X.fillna(0, inplace = True)

    y, X = X.label, X.drop("label", axis = 1)

    return X, y

process_data = True
if process_data:
    chunk_size = 1000
    X, y = pd.DataFrame(), pd.DataFrame()
    for start_idx in range(0, len(df_combined), chunk_size):
        end_idx = start_idx + chunk_size
        print(f"idx: {start_idx}-{end_idx}")
        X_sub, y_sub = preprocess(df_combined.iloc[start_idx: end_idx])
        X, y = pd.concat((X, X_sub)), pd.concat((y, y_sub))

    X.fillna(0, inplace = True)

idx: 0-1000


1000it [00:59, 16.75it/s]
100%|██████████| 20/20 [00:11<00:00,  1.72it/s]


idx: 1000-2000


1000it [01:01, 16.28it/s]
100%|██████████| 20/20 [00:12<00:00,  1.63it/s]


idx: 2000-3000


1000it [00:58, 17.24it/s]
100%|██████████| 20/20 [00:11<00:00,  1.75it/s]


idx: 3000-4000


1000it [01:01, 16.37it/s]
100%|██████████| 20/20 [00:12<00:00,  1.56it/s]


idx: 4000-5000


1000it [01:00, 16.56it/s]
100%|██████████| 20/20 [00:11<00:00,  1.75it/s]


idx: 5000-6000


1000it [00:57, 17.25it/s]
100%|██████████| 20/20 [00:11<00:00,  1.76it/s]


idx: 6000-7000


1000it [00:58, 17.19it/s]
100%|██████████| 20/20 [00:11<00:00,  1.78it/s]


idx: 7000-8000


1000it [00:59, 16.80it/s]
100%|██████████| 20/20 [00:11<00:00,  1.69it/s]


idx: 8000-9000


1000it [00:58, 17.19it/s]
100%|██████████| 20/20 [00:11<00:00,  1.69it/s]


idx: 9000-10000


1000it [00:58, 17.08it/s]
100%|██████████| 20/20 [00:11<00:00,  1.74it/s]


idx: 10000-11000


1000it [01:01, 16.20it/s]
100%|██████████| 20/20 [00:11<00:00,  1.67it/s]


idx: 11000-12000


1000it [01:00, 16.49it/s]
100%|██████████| 20/20 [00:11<00:00,  1.70it/s]


idx: 12000-13000


1000it [00:58, 17.03it/s]
100%|██████████| 20/20 [00:12<00:00,  1.66it/s]


idx: 13000-14000


233it [00:13, 17.41it/s]
100%|██████████| 4/4 [00:01<00:00,  3.64it/s]


In [143]:
## partition input data into (1) validation, (2) train, and (3) semi-supervised learning sets
# 2037 = the length of the original validation partition

X_v, X_tr, X_ss = X.iloc[:2037], X.iloc[2037:len(df_tr)], X.iloc[len(df_tr):]
y = df_combined[["label"]]
y_v, y_tr, y_ss = y.iloc[:2037], y.iloc[2037:len(df_tr)], y.iloc[len(df_tr):]

### Feature Selection

In [73]:
## feature selection

from math import log

def entropy(x):
  out = 0
  for label in [0,1]:
    p = (x == label).mean()
    if p == 0: continue
    p = p * log(p) / log(2)
    out -= p
  return out

df_ig = pd.DataFrame()

# consolidating data sources
print("Consolidating data sources...")
is_aug = False
if is_aug:
    X_ig = pd.concat((X_tr,X_v)).fillna(0)
    y_ig = pd.concat((y_tr,y_v))
else:
    X_ig, y_ig = X,y

for label in range(0,8):
    print(f"label = {label}")

    # reduce to a two-class problem
    y_ = (y_ig == label).astype(int)
    y_ = y_.to_numpy()
    entropy0 = entropy(y_)

    df = pd.DataFrame()
    scores = np.zeros(len(X_ig.columns))

    for i,col in tqdm(enumerate(X_ig.columns)):

        predictor = X_ig[col]

        y1, y2 = y_[predictor != 0], y_[predictor == 0]
        l0, l1, l2 = len(y_), len(y1), len(y2)

        IG = entropy0 - (l1/l0 * entropy(y1)) - (l2 / l0 * entropy(y2))

        scores[i] = IG

    if len(df_ig) == 0:
        df_ig["col"] = X_ig.columns

    df_ig[f"IG{label}"] = scores

def row_avg(row):
    n_classes = 8

    out = 0
    for label in range(n_classes):
        out += row[f"IG{label}"] / n_classes

    return out

df_ig["IG_ovr"] = df_ig.apply(row_avg, axis = 1)
df_ig.sort_values("IG_ovr", ascending = False)

Consolidating data sources...
label = 0


276580it [00:34, 8062.28it/s]


label = 1


276580it [00:35, 7780.88it/s]


label = 2


276580it [00:35, 7761.41it/s]


label = 3


276580it [00:36, 7673.22it/s]


label = 4


276580it [00:36, 7597.76it/s]


label = 5


276580it [00:36, 7654.23it/s]


label = 6


276580it [00:36, 7572.05it/s]


label = 7


276580it [00:37, 7452.34it/s]


Unnamed: 0,col,IG0,IG1,IG2,IG3,IG4,IG5,IG6,IG7,IG_ovr
745,triangle,2.661484e-02,9.379578e-02,0.009042,0.000878,1.595433e-02,6.060825e-03,0.000447,0.000283,0.019135
1690,angle,2.307671e-02,8.172594e-02,0.005957,0.001515,1.295765e-02,1.128059e-02,0.000074,0.000241,0.017103
1291,Solve,5.834442e-02,1.680542e-02,0.005889,0.002404,9.689066e-03,1.224856e-02,0.000726,0.000580,0.013336
814,area,1.411493e-02,6.480048e-02,0.001539,0.002003,1.041737e-02,1.019673e-02,0.000001,0.000672,0.012968
26,probability,8.086339e-03,6.012319e-03,0.002552,0.078521,4.571190e-03,1.122884e-03,0.000091,0.000256,0.012651
...,...,...,...,...,...,...,...,...,...,...
98910,integer u,2.070815e-06,5.626584e-07,0.000027,0.000019,3.622029e-06,2.351520e-06,0.000006,0.000005,0.000008
28958,z <,7.831249e-06,4.277333e-06,0.000017,0.000022,5.450229e-07,1.209817e-07,0.000007,0.000005,0.000008
14944,wind,7.831249e-06,4.277333e-06,0.000017,0.000022,5.450229e-07,1.209817e-07,0.000007,0.000005,0.000008
67245,leqslant b,1.833787e-08,1.093577e-06,0.000003,0.000032,4.657621e-06,6.773424e-06,0.000009,0.000008,0.000008


In [161]:
# select features, update all datasets accordingly
print("selecting features...")
columns = select_top_n(df_ig, by_class = True, n = 5000)
new_cols = []
for col in tqdm(columns):
    if col in X.columns:
        new_cols.append(col)
columns = new_cols
print("modifying the datasets...")
X_v_, X_tr_, X_ss_ = X_v[columns], X_tr[columns], X_ss[columns]

X_add, y_add = pd.DataFrame(), pd.DataFrame()

# define model class
# model = LGBMClassifier(
#     reg_alpha = 0.5,
#     verbose = 4,
#     max_depth = -1
# )

model = LogisticRegression(C = 0.5)

for i in range(10):

    ovr_metrics = {}

    X_tr1, y_tr1 = pd.concat((X_tr_, X_add)), pd.concat((y_tr, y_add))

    # train the model and render predictions
    print("training the model...")
    model.fit(X_tr1.to_numpy(), y_tr1)
    y_pred_t = model.predict(X_tr1)
    y_pred = model.predict(X_v_)

    # score the model
    print("getting metrics...")
    score = f1(y_v, y_pred, average = "micro")
    score_t = f1(y_tr1, y_pred_t, average = "micro")

    ovr_metrics["test_f1"] = score
    ovr_metrics["train_f1"] = score_t

    retrain = False
    if retrain:
        # retrain model on the full dataset, save the model with metrics
        print("retraining the model on the full dataset...")
        X_combined, y_combined = pd.concat((X_tr_, X_v_)), pd.concat((y_tr, y_v))
        model.fit(X_combined.to_numpy(), y_combined)
        score = round(ovr_metrics["test_f1"],3)

        # identify model architecture via class membership
        arch = "other"
        architectures = {
            "lr" : LogisticRegression,
            "ridge" : RidgeClassifier,
            "lgbm" : LGBMClassifier
        }
        for k,v in architectures.items():
            if isinstance(model, v):
                arch = k

        with open(f"models/{arch}-{score}.pkl", "wb") as f:
            pkl.dump(model, f)
        with open(f"models/{arch}-{score}-cols.pkl", "wb") as f:
            pkl.dump(columns, f)

    print(ovr_metrics)

    ## semi-supervised learning
    df_ss = pd.DataFrame()
    X_ssv, y_ssv = pd.concat((X_ss_, X_v_)), pd.concat((y_ss, y_v))
    X_ssv = X_ssv.reset_index().drop("level_0", axis = 1)

    df_ss["pred_label"] = model.predict(X_ssv)
    df_ss["true_label"] = y_ssv.label.tolist()
    df_ss["confidence"] = model.predict_proba(X_ssv).max(axis = 1)
    df_ss = df_ss[df_ss.confidence >= 0.98] # features with a label accuracy of 97.5%

    X_add = X_ssv.iloc[df_ss.index]
    y_add = df_ss[["pred_label"]].rename({"pred_label" : "label"}, axis = 1)
    print(f"adding {len(y_add)} examples to the train set...")

selecting features...


100%|██████████| 28325/28325 [00:00<00:00, 2328159.69it/s]

modifying the datasets...





training the model...
getting metrics...
{'test_f1': 0.7933235149729995, 'train_f1': 0.961972522080471}
adding 2096 examples to the train set...
training the model...
getting metrics...
{'test_f1': 0.7928325969563083, 'train_f1': 0.9670179547228728}
adding 2153 examples to the train set...
training the model...
getting metrics...
{'test_f1': 0.7879234167893961, 'train_f1': 0.9686559922367782}
adding 2171 examples to the train set...
training the model...


KeyboardInterrupt: 

In [160]:
df_ss

Unnamed: 0,pred_label,true_label,confidence
2,0,-1,0.980104
5,1,-1,0.985114
8,0,-1,0.997987
15,1,-1,1.000000
18,1,-1,1.000000
...,...,...,...
5070,2,2,0.999999
5073,1,1,1.000000
5075,0,0,0.998984
5076,0,0,0.996567


In [157]:
df_ss

Unnamed: 0,pred_label,true_label,confidence
0,0,-1,0.989976
5,1,-1,0.996326
7,1,-1,0.982933
15,1,-1,0.999214
18,1,-1,0.999215
...,...,...,...
5066,0,0,0.990143
5072,0,0,0.991501
5073,1,1,0.999534
5075,0,0,0.992176


In [158]:
y_add

Unnamed: 0,label
0,0
5,1
7,1
15,1
18,1
...,...
5066,0
5072,0
5073,1
5075,0
