## Imports, data prep

In [1]:
import config
import preprocess

import pickle
import pandas as pd
import numpy as np

np.random.seed(config.SEED)
train = pd.read_pickle(config.TRAIN)
test = pd.read_pickle(config.TRAIN)

X_train = train["problem_statement"]
y_train = train[train.columns[1:]]
X_test = test["problem_statement"]
y_test = test[test.columns[1:]]

In [2]:
X_train

678D     long g0x java given typeprint find onli defin ...
915F     106 second given path vertic number two differ...
708B     000the given nonneg number string find least d...
256B     number least happyth field get top singl one e...
1535E    java given standard number onlin mean buffered...
                               ...                        
990C     exceed second given number string insert oper ...
1154C    number possibl three rabbit eatenin one exampl...
339C     specifi play given number string consist help ...
909D     given number string leftmost dont arrang singl...
582E     given number mean element boolean also corresp...
Name: problem_statement, Length: 6272, dtype: object

In [3]:
y_train

Unnamed: 0,math,datastructures,graphs,greedy,dp,strings,geometry,constructivealgorithms
678D,1,0,0,0,0,0,0,0
915F,0,1,1,0,0,0,0,0
708B,1,0,0,1,0,0,0,1
256B,1,0,0,0,0,0,0,0
1535E,0,1,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...
990C,0,0,0,0,0,0,0,0
1154C,1,0,0,0,0,0,0,0
339C,0,0,1,1,1,0,0,1
909D,0,1,0,1,0,0,0,0


In [4]:
# vectorize text
from sklearn.feature_extraction.text import TfidfVectorizer
X_train = TfidfVectorizer(
    max_features=config.MAX_FEATURES).fit_transform(X_train)
X_test = TfidfVectorizer(
    max_features=config.MAX_FEATURES).fit_transform(X_test)

# y_train as int for fitting
y_train = y_train.astype("int")

## Models

In [5]:
%%time

import sklearn.linear_model as linear_model
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
import xgboost as xgb

from utils import confusion_matrix_reg

tags = y_train.columns
tags_results = {}
clfs = [
    ("LinearRegression",linear_model.LinearRegression()),
    ("LogisticRegression",linear_model.LogisticRegression(
        random_state=config.SEED)),
    ("RidgeClassifier",linear_model.RidgeClassifier(
        random_state=config.SEED)),
    ("SVC",SVC(
        random_state=config.SEED)),
    ("XGBClassifier", xgb.XGBClassifier(
        random_state=config.SEED)),
    ("KNeighborsClassifier",KNeighborsClassifier()),
    ("DummyClassifier",DummyClassifier(
        strategy="uniform",random_state=config.SEED)),
]

for tag in tags:
    tags_results[tag] = pd.DataFrame(
        columns=["accuracy","precision","recall","f1-score"])
    print(tag+"...")
    for name, clf in clfs:
        clf.fit(X_train, y_train[tag])
        y_pred_math = clf.predict(X_test)
        # print(tag,name+"...")
        tp,tn,fp,fn = confusion_matrix_reg(
            y_test[tag],y_pred_math, 0.7)

        acc = (tp+tn)/(tp+tn+fp+fn)
        recall = tp/(tp+fn) 
        preci = tp/(tp+fp)
        tags_results[tag].loc[name,"accuracy"] = acc
        tags_results[tag].loc[name,"precision"] = preci
        tags_results[tag].loc[name,"recall"] = recall
        tags_results[tag].loc[name,"f1-score"] = 2*(preci*recall)/(preci+recall)

math...
datastructures...
graphs...
greedy...
dp...
strings...
geometry...
constructivealgorithms...
CPU times: user 15min 36s, sys: 23.3 s, total: 15min 59s
Wall time: 5min 1s


precision: 0인데 1로 예측하는걸 줄이는지  
recall: 1인데 0으로 예측하는걸 줄이는지 확인  

precision>recall on all tags   
= 아닌걸 맞다고 예측하는건 거의 없다.  
= 맞는걸 아니라고 예측하는 경우가 좀 더 많다.

In [6]:
for tag in tags:
    print(tag)
    display(tags_results[tag])

math


Unnamed: 0,accuracy,precision,recall,f1-score
LinearRegression,0.754783,0.797203,0.07024,0.129105
LogisticRegression,0.786671,0.702703,0.304375,0.424764
RidgeClassifier,0.787309,0.700974,0.310536,0.430401
SVC,0.917251,0.979167,0.695009,0.812973
XGBClassifier,0.98007,0.981362,0.94085,0.960679
KNeighborsClassifier,0.757175,0.890625,0.07024,0.130211
DummyClassifier,0.493463,0.251598,0.484904,0.331299


datastructures


Unnamed: 0,accuracy,precision,recall,f1-score
LinearRegression,0.841199,0.884615,0.044402,0.084559
LogisticRegression,0.856983,0.749104,0.201737,0.317871
RidgeClassifier,0.855548,0.748092,0.189189,0.302003
SVC,0.916773,0.990458,0.500965,0.665385
XGBClassifier,0.986288,0.978831,0.937259,0.957594
KNeighborsClassifier,0.844866,0.879518,0.070463,0.130474
DummyClassifier,0.497449,0.161765,0.488417,0.243036


graphs


Unnamed: 0,accuracy,precision,recall,f1-score
LinearRegression,0.897959,0.894231,0.128809,0.225182
LogisticRegression,0.911671,0.75,0.34903,0.476371
RidgeClassifier,0.911352,0.735795,0.358726,0.482309
SVC,0.962213,0.967245,0.695291,0.809025
XGBClassifier,0.995695,0.984658,0.977839,0.981237
KNeighborsClassifier,0.897321,0.882353,0.124654,0.218447
DummyClassifier,0.508291,0.122442,0.530471,0.198961


greedy


Unnamed: 0,accuracy,precision,recall,f1-score
LinearRegression,0.75845,0.865169,0.048734,0.092271
LogisticRegression,0.786511,0.681203,0.286709,0.403563
RidgeClassifier,0.789222,0.686957,0.3,0.417621
SVC,0.914381,0.986021,0.66962,0.797588
XGBClassifier,0.981505,0.983487,0.942405,0.962508
KNeighborsClassifier,0.771205,0.887701,0.105063,0.187889
DummyClassifier,0.501594,0.252877,0.500633,0.336024


dp


Unnamed: 0,accuracy,precision,recall,f1-score
LinearRegression,0.791932,0.759259,0.030758,0.05912
LogisticRegression,0.805006,0.682119,0.154539,0.251988
RidgeClassifier,0.805325,0.664706,0.169542,0.270173
SVC,0.892379,0.982405,0.502626,0.665012
XGBClassifier,0.984694,0.988161,0.939235,0.963077
KNeighborsClassifier,0.801977,0.858268,0.08177,0.149315
DummyClassifier,0.500159,0.211957,0.497374,0.297243


strings


Unnamed: 0,accuracy,precision,recall,f1-score
LinearRegression,0.93479,0.833333,0.150538,0.255009
LogisticRegression,0.950733,0.767123,0.48172,0.59181
RidgeClassifier,0.950096,0.72093,0.533333,0.613103
SVC,0.981186,0.941476,0.795699,0.862471
XGBClassifier,0.996652,0.984716,0.969892,0.977248
KNeighborsClassifier,0.938297,0.854545,0.202151,0.326957
DummyClassifier,0.498246,0.071292,0.47957,0.12413


geometry


Unnamed: 0,accuracy,precision,recall,f1-score
LinearRegression,0.95727,1.0,0.053004,0.100671
LogisticRegression,0.970185,0.893443,0.385159,0.538272
RidgeClassifier,0.967474,0.89899,0.314488,0.465969
SVC,0.98868,0.995327,0.75265,0.857143
XGBClassifier,0.998565,0.992806,0.975265,0.983957
KNeighborsClassifier,0.96014,0.883721,0.134276,0.233129
DummyClassifier,0.501435,0.045396,0.501767,0.08326


constructivealgorithms


Unnamed: 0,accuracy,precision,recall,f1-score
LinearRegression,0.85236,1.0,0.019068,0.037422
LogisticRegression,0.865115,0.705882,0.177966,0.284264
RidgeClassifier,0.859694,0.686047,0.125,0.21147
SVC,0.915816,0.997608,0.441737,0.612335
XGBClassifier,0.984056,0.969933,0.922669,0.945711
KNeighborsClassifier,0.861767,0.818182,0.104873,0.185915
DummyClassifier,0.492666,0.142263,0.471398,0.218566
