## Imports, data prep

In [2]:
import config
import preprocess

import pickle
import pandas as pd
import numpy as np

np.random.seed(config.SEED)
train = pd.read_pickle(config.TRAIN)
test = pd.read_pickle(config.TRAIN)

X_train = train["problem_statement"]
y_train = train[train.columns[1:]]
X_test = test["problem_statement"]
y_test = test[test.columns[1:]]

In [2]:
X_train

850B      element possibl 17a allow inform common arpa t...
587B      linein extrem case consist 000 strang numberth...
65C       direct veloc possibl devic quidditch allow inf...
1440C1    case consist easi possibl cell string allow th...
1185B     consist pairsth possibl string appear press gu...
                                ...                        
1034D     length exact second union calcul help getfor s...
1166D     valid hold consist possibl satisfi posit itth ...
786A      element case endsimilar possibl initi arrang i...
914F      typeal lettersth case consist string initi pro...
766A      consist charactersif name string mahmoud appea...
Name: problem_statement, Length: 6143, dtype: object

In [3]:
y_train

Unnamed: 0,implementation,math,greedy,dp,datastructures,bruteforce,graphs,strings,dfsandsimilar,trees,geometry
850B,1,0,0,0,0,0,0,0,0,0,0
587B,0,0,0,1,0,0,0,0,0,0,0
65C,0,0,0,0,0,0,0,0,0,0,1
1440C1,1,0,0,0,0,0,0,0,0,0,0
1185B,1,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1034D,0,0,0,0,1,0,0,0,0,0,0
1166D,0,1,1,0,0,1,0,0,0,0,0
786A,0,0,0,1,0,0,0,0,1,0,0
914F,0,0,0,0,1,1,0,1,0,0,0


In [4]:
# vectorize text
from sklearn.feature_extraction.text import TfidfVectorizer
X_train = TfidfVectorizer(
    max_features=config.MAX_FEATURES).fit_transform(X_train)
X_test = TfidfVectorizer(
    max_features=config.MAX_FEATURES).fit_transform(X_test)

# y_train as int for fitting
y_train = y_train.astype("int")

## Classifiers

In [5]:
import sklearn.linear_model as linear_model
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
import xgboost as xgb

from utils import confusion_matrix_reg

tags = y_train.columns
tags_results = {}
clfs = [
    ("LinearRegression",linear_model.LinearRegression()),
    ("LogisticRegression",linear_model.LogisticRegression(
        random_state=config.SEED)),
    ("RidgeClassifier",linear_model.RidgeClassifier(
        random_state=config.SEED)),
    ("SVC",SVC(
        random_state=config.SEED)),
    ("XGBClassifier", xgb.XGBClassifier(
        random_state=config.SEED)),
    ("KNeighborsClassifier",KNeighborsClassifier()),
    ("DummyClassifier",DummyClassifier(
        strategy="uniform",random_state=config.SEED)),
]

In [6]:
%%time

fitted_clfs = []

for tag in tags:
    tags_results[tag] = pd.DataFrame(
        columns=["accuracy","precision","recall","f1-score"])
    print(tag+"...")
    for name, clf in clfs:
        clf.fit(X_train, y_train[tag])
        y_pred = clf.predict(X_test)
        # print(tag,name+"...")
        tp,tn,fp,fn = confusion_matrix_reg(
            y_test[tag],y_pred, 0.7)

        # calculate metrics
        # save metrics
        try:
            acc = (tp+tn)/(tp+tn+fp+fn)
            tags_results[tag].loc[name,"accuracy"] = acc
        except: pass
        try:
            preci = tp/(tp+fp)
            tags_results[tag].loc[name,"precision"] = preci
        except: pass
        try:
            recall = tp/(tp+fn) 
            tags_results[tag].loc[name,"recall"] = recall
        except: pass
        try:
            tags_results[tag].loc[name,"f1-score"] = 2*(preci*recall)/(preci+recall)
        except: pass

        # append fitted (clf,acc)
        fitted_clfs.append((clf,acc))

implementation...
math...
greedy...
dp...
datastructures...
bruteforce...
graphs...
strings...
dfsandsimilar...
trees...
geometry...
CPU times: user 6min 59s, sys: 7.64 s, total: 7min 7s
Wall time: 3min 2s


In [7]:
for tag in tags:
    print(tag)
    display(tags_results[tag])

implementation


Unnamed: 0,accuracy,precision,recall,f1-score
LinearRegression,0.715448,0.727273,0.004564,0.00907
LogisticRegression,0.725053,0.593567,0.115801,0.193795
RidgeClassifier,0.725704,0.60625,0.110667,0.187168
SVC,0.832492,0.987871,0.41814,0.587575
XGBClassifier,0.971187,0.98642,0.91158,0.947524
KNeighborsClassifier,0.784796,0.717019,0.406161,0.518572
DummyClassifier,0.497477,0.282026,0.492299,0.358612


math


Unnamed: 0,accuracy,precision,recall,f1-score
LinearRegression,0.738401,0.833333,0.018394,0.035993
LogisticRegression,0.758261,0.622483,0.227468,0.333184
RidgeClassifier,0.753703,0.613462,0.195586,0.296606
SVC,0.883445,0.970195,0.578786,0.725038
XGBClassifier,0.98307,0.981096,0.954629,0.967682
KNeighborsClassifier,0.817516,0.754491,0.463519,0.57425
DummyClassifier,0.499756,0.264379,0.496015,0.344916


greedy


Unnamed: 0,accuracy,precision,recall,f1-score
LinearRegression,0.752076,0.863636,0.012346,0.024343
LogisticRegression,0.76575,0.59542,0.202729,0.302472
RidgeClassifier,0.762331,0.596107,0.159194,0.251282
SVC,0.873352,0.981037,0.504224,0.666094
XGBClassifier,0.980791,0.983005,0.939571,0.960797
KNeighborsClassifier,0.82712,0.730435,0.491228,0.587413
DummyClassifier,0.488686,0.238235,0.473684,0.317025


dp


Unnamed: 0,accuracy,precision,recall,f1-score
LinearRegression,0.779098,,0.0,0.0
LogisticRegression,0.780563,0.547368,0.03832,0.071625
RidgeClassifier,0.7804,0.642857,0.013265,0.025993
SVC,0.845515,0.997561,0.3014,0.462932
XGBClassifier,0.986163,0.990741,0.946205,0.967961
KNeighborsClassifier,0.829399,0.711354,0.383198,0.498084
DummyClassifier,0.501709,0.221569,0.499632,0.306996


datastructures


Unnamed: 0,accuracy,precision,recall,f1-score
LinearRegression,0.83526,,0.0,0.0
LogisticRegression,0.839329,0.598425,0.075099,0.13345
RidgeClassifier,0.839004,0.744681,0.034585,0.0661
SVC,0.881654,0.996516,0.282609,0.440339
XGBClassifier,0.987628,0.9875,0.936759,0.96146
KNeighborsClassifier,0.868305,0.734411,0.314229,0.440138
DummyClassifier,0.501546,0.165033,0.499012,0.248035


bruteforce


Unnamed: 0,accuracy,precision,recall,f1-score
LinearRegression,0.849748,,0.0,0.0
LogisticRegression,0.849748,,0.0,0.0
RidgeClassifier,0.849748,,0.0,0.0
SVC,0.858864,1.0,0.060672,0.114402
XGBClassifier,0.977047,0.98875,0.856988,0.918166
KNeighborsClassifier,0.860817,0.645299,0.163597,0.26102
DummyClassifier,0.504314,0.153268,0.508126,0.235501


graphs


Unnamed: 0,accuracy,precision,recall,f1-score
LinearRegression,0.886538,,0.0,0.0
LogisticRegression,0.896793,0.726619,0.144907,0.241627
RidgeClassifier,0.88784,0.75,0.017217,0.033661
SVC,0.935862,0.990291,0.439024,0.60835
XGBClassifier,0.995279,0.985465,0.97274,0.979061
KNeighborsClassifier,0.903793,0.648876,0.33142,0.438746
DummyClassifier,0.500081,0.112092,0.492109,0.182592


strings


Unnamed: 0,accuracy,precision,recall,f1-score
LinearRegression,0.923816,0.727273,0.016913,0.033058
LogisticRegression,0.936839,0.65233,0.384778,0.484043
RidgeClassifier,0.932606,0.67052,0.245243,0.359133
SVC,0.97607,0.960452,0.718816,0.822249
XGBClassifier,0.996419,0.98081,0.972516,0.976645
KNeighborsClassifier,0.950676,0.715736,0.596195,0.650519
DummyClassifier,0.505291,0.080719,0.522199,0.139825


dfsandsimilar


Unnamed: 0,accuracy,precision,recall,f1-score
LinearRegression,0.908188,,0.0,0.0
LogisticRegression,0.910142,0.611111,0.058511,0.106796
RidgeClassifier,0.908514,1.0,0.003546,0.007067
SVC,0.92756,1.0,0.210993,0.348463
XGBClassifier,0.991861,0.970696,0.939716,0.954955
KNeighborsClassifier,0.920072,0.697297,0.228723,0.344459
DummyClassifier,0.498291,0.088562,0.480496,0.149558


trees


Unnamed: 0,accuracy,precision,recall,f1-score
LinearRegression,0.921862,,0.0,0.0
LogisticRegression,0.924141,0.659091,0.060417,0.110687
RidgeClassifier,0.921862,,0.0,0.0
SVC,0.945304,1.0,0.3,0.461538
XGBClassifier,0.997721,0.993644,0.977083,0.985294
KNeighborsClassifier,0.938955,0.79661,0.29375,0.429224
DummyClassifier,0.502523,0.079085,0.504167,0.136723


geometry


Unnamed: 0,accuracy,precision,recall,f1-score
LinearRegression,0.955722,,0.0,0.0
LogisticRegression,0.960606,0.767857,0.158088,0.262195
RidgeClassifier,0.956048,1.0,0.007353,0.014599
SVC,0.978187,1.0,0.507353,0.673171
XGBClassifier,0.99886,0.992565,0.981618,0.987061
KNeighborsClassifier,0.967443,0.805085,0.349265,0.487179
DummyClassifier,0.508709,0.051307,0.577206,0.094238


In [8]:
best_clf = xgb.XGBClassifier(random_state=config.SEED)
best_clf

In [9]:
%%time
from preprocess import preprocess_text

X_test = [
# https://codeforces.com/problemset/problem/1765/N
# greedy
"""
You are given a positive integer x.

You can apply the following operation to the number: remove one occurrence of any digit in such a way that the resulting number does not contain any leading zeroes and is still a positive integer. For example, 10142 can be converted to 1142, 1042, 1012 or 1014 (note that 0142 is not a valid outcome); 10 can be converted to 1 (but not to 0 since it is not positive).

Your task is to find the minimum positive integer that you can obtain from x if you can apply the aforementioned operation exactly k times.
"""
]
X_test = [preprocess_text(x) for x in X_test]
X_test = TfidfVectorizer(
    max_features=config.MAX_FEATURES).fit_transform(X_test)
X_test = [np.pad(X_test.toarray(),(0,config.MAX_FEATURES-X_test.shape[1]))[0]]

name,best_clf = clfs[4]
y_preds = []
for tag in tags:
    best_clf.fit(X_train, y_train[tag])
    y_pred = best_clf.predict(X_test)
    y_preds.append((tag,y_pred))
    # if y_pred==[1]: print(tag)
print(*y_preds,sep='\n')


('implementation', array([0]))
('math', array([0]))
('greedy', array([0]))
('dp', array([0]))
('datastructures', array([1]))
('bruteforce', array([0]))
('graphs', array([0]))
('strings', array([0]))
('dfsandsimilar', array([0]))
('trees', array([1]))
('geometry', array([0]))
CPU times: user 4min 9s, sys: 2.51 s, total: 4min 11s
Wall time: 31.8 s


## Regressors

In [4]:
np.random.seed(config.SEED)
train = pd.read_pickle(config.TRAIN)
test = pd.read_pickle(config.TRAIN)

X_train = train["problem_statement"]
y_train = train[train.columns[1:]]
X_test = test["problem_statement"]
y_test = test[test.columns[1:]]

# vectorize text
from sklearn.feature_extraction.text import TfidfVectorizer
X_train = TfidfVectorizer(
    max_features=config.MAX_FEATURES).fit_transform(X_train)
X_test = TfidfVectorizer(
    max_features=config.MAX_FEATURES).fit_transform(X_test)

# y_train as int for fitting
y_train = y_train.astype("int")

In [5]:
import sklearn.linear_model as linear_model
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.dummy import DummyRegressor
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor

from utils import confusion_matrix_reg

tags = y_train.columns
tags_results = {}
regs = [
    ("SVR", SVR()),
    ("XGBRegressor", xgb.XGBRegressor(
        random_state=config.SEED)),
    ("KNeighborsRegressor", KNeighborsRegressor()),
    ("RandomForest",RandomForestRegressor(
        random_state=config.SEED)),
    # ("DummyRegressor", DummyRegressor()),
]

In [6]:
%%time
# choose top-k tags
reg = KNeighborsRegressor()
y_preds = []

for tag in tags:
    reg.fit(X_train, y_train[tag])
    y_pred = reg.predict(X_test)
    confusion_matrix_reg(y_test,y_pred)
    y_preds.append(y_pred)

y_preds = np.array(y_preds).T
y_test = np.array(y_test)
y_preds.shape, y_test.shape

CPU times: user 24.4 s, sys: 3.18 s, total: 27.6 s
Wall time: 27.8 s


((6143, 11), (6143, 11))

In [7]:
# check first 10 results
for i in range(10):
    print(y_preds[i], y_test[i])

[0.4 0.  0.4 0.  0.2 0.  0.4 0.  0.  0.  0. ] [1 0 0 0 0 0 0 0 0 0 0]
[0.4 0.6 0.4 0.2 0.  0.2 0.  0.  0.  0.  0. ] [0 0 0 1 0 0 0 0 0 0 0]
[0.4 0.  0.  0.2 0.  0.2 0.  0.  0.  0.  0.4] [0 0 0 0 0 0 0 0 0 0 1]
[0.8 0.  0.  0.  0.6 0.  0.  0.6 0.  0.  0. ] [1 0 0 0 0 0 0 0 0 0 0]
[0.8 0.  0.2 0.  0.  0.2 0.2 0.8 0.  0.  0. ] [1 0 0 0 0 0 0 1 0 0 0]
[0.2 0.2 0.4 0.8 0.4 0.  0.  0.  0.  0.  0. ] [1 0 0 1 1 0 0 0 0 0 0]
[0.4 0.2 0.8 0.2 0.  0.2 0.  0.  0.  0.  0. ] [0 0 1 0 0 0 0 0 0 0 0]
[0.4 0.2 0.4 0.2 0.4 0.  0.  0.  0.  0.  0. ] [0 0 0 1 0 0 0 0 0 0 0]
[0.4 0.  0.2 0.  0.  0.4 0.2 0.4 0.2 0.  0. ] [0 0 0 0 0 0 1 0 0 0 0]
[0.4 0.  0.4 0.6 0.2 0.4 0.2 0.4 0.2 0.  0. ] [1 0 1 1 0 0 0 0 0 0 0]


In [8]:
# doesnt work for multilabled data
## https://scikit-learn.org/stable/modules/model_evaluation.html#top-k-accuracy-score
from sklearn.metrics import top_k_accuracy_score
# top_k_accuracy_score(y_test,y_preds, k=2)

In [9]:
from sklearn.metrics import coverage_error
# https://scikit-learn.org/stable/modules/model_evaluation.html#coverage-error
coverage_error(y_test,y_preds)

3.4212925280807425

In [10]:
%%time

tags_results_regs = {}
fitted_regs = []

for tag in tags:
    tags_results_regs[tag] = pd.DataFrame(
        columns=["accuracy","precision","recall","f1-score"])
    print(tag+"...")
    for name, reg in regs:
        reg.fit(X_train, y_train[tag])
        y_pred = reg.predict(X_test)
        tp,tn,fp,fn = confusion_matrix_reg(
            y_test[tag],y_pred, 0.7)

        try:
            acc = (tp+tn)/(tp+tn+fp+fn)
            tags_results_regs[tag].loc[name,"accuracy"] = acc
        except: pass
        try:
            preci = tp/(tp+fp)
            tags_results_regs[tag].loc[name,"precision"] = preci
        except: pass
        try:
            recall = tp/(tp+fn) 
            tags_results_regs[tag].loc[name,"recall"] = recall
        except: pass
        try:
            tags_results_regs[tag].loc[name,"f1-score"] = 2*(preci*recall)/(preci+recall)
        except: pass

        # append fitted (reg,acc)
        fitted_regs.append((reg,acc))

implementation...


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices