In [1]:
import config
import preprocess

import pickle
import pandas as pd
import numpy as np

In [2]:
train = pd.read_pickle(config.TRAIN)
test = pd.read_pickle(config.TRAIN)

X_train = train["problem_statement"]
y_train = train[train.columns[1:]]
X_test = test["problem_statement"]
y_test = test[test.columns[1:]]

In [3]:
X_train

678D     four 1018 long let use function fgn linear con...
915F     second follow let function guarante equal two ...
708B     imposs doesnt equal 000the number output nonne...
256B     row 1in happyth probleminiti number figur get ...
1535E    tweak decreas queri money root price number re...
                               ...                        
990C     second 111 sequenc origin follow transform ans...
1154C    correspondinglyprint containspolycarp optimall...
339C     assum equal 1000in number goe greater left wei...
909D     rightyou would number oper left leftmost color...
582E     count look tupl number variabl oper valu boole...
Name: problem_statement, Length: 6272, dtype: object

In [4]:
y_train
y_test

Unnamed: 0,math,datastructures,graphs,greedy,dp,strings,geometry,constructivealgorithms
678D,1,0,0,0,0,0,0,0
915F,0,1,1,0,0,0,0,0
708B,1,0,0,1,0,0,0,1
256B,1,0,0,0,0,0,0,0
1535E,0,1,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...
990C,0,0,0,0,0,0,0,0
1154C,1,0,0,0,0,0,0,0
339C,0,0,1,1,1,0,0,1
909D,0,1,0,1,0,0,0,0


In [5]:
# change text numerically
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
X_train = TfidfVectorizer(
    max_features=config.MAX_FEATURES).fit_transform(X_train)
X_test = TfidfVectorizer(
    max_features=config.MAX_FEATURES).fit_transform(X_test)

### `sklearn.linear_model` Regressors

In [7]:
import sklearn.linear_model as linear_model
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

from utils import confusion_matrix_reg

tags = y_train.columns
tags_results = {}


regs = [
    ("LinearRegression",linear_model.LinearRegression()),
    ("LogisticRegression",linear_model.LogisticRegression(
        random_state=config.SEED)),
    ("RidgeClassifier",linear_model.RidgeClassifier()),
    ("SVC",SVC(
        random_state=config.SEED)),
]


for tag in tags:
    tags_results[tag] = pd.DataFrame(
        columns=["accuracy","precision","recall","f1"],)
    for name, reg in regs:
        # print(X_train.shape, y_train.math.dtype, y_train["math"].shape)
        y_train = y_train.astype("int")
        reg.fit(X_train, y_train[tag])
        y_pred_math = reg.predict(X_test)
        print(tag,name+"...")
        tp,tn,fp,fn = confusion_matrix_reg(
            y_test[tag],y_pred_math, 0.7)
        # print(f"accuracy:{(tp+tn)/(tp+tn+fp+fn)}")
        # print(f"recall:{tp/(tp+fn)}")
        # print(f"precision:{tp/(tp+fp)}")
        acc = (tp+tn)/(tp+tn+fp+fn)
        recall = tp/(tp+fn) 
        preci = tp/(tp+fp)
        tags_results[tag].loc[name,"accuracy"] = acc
        tags_results[tag].loc[name,"precision"] = preci
        tags_results[tag].loc[name,"recall"] = recall

tags_results

math LinearRegression...
math LogisticRegression...
math RidgeClassifier...
math SVC...
datastructures LinearRegression...
datastructures LogisticRegression...
datastructures RidgeClassifier...
datastructures SVC...
graphs LinearRegression...
graphs LogisticRegression...
graphs RidgeClassifier...
graphs SVC...
greedy LinearRegression...
greedy LogisticRegression...
greedy RidgeClassifier...
greedy SVC...
dp LinearRegression...
dp LogisticRegression...
dp RidgeClassifier...
dp SVC...
strings LinearRegression...
strings LogisticRegression...
strings RidgeClassifier...
strings SVC...
geometry LinearRegression...
geometry LogisticRegression...
geometry RidgeClassifier...
geometry SVC...
constructivealgorithms LinearRegression...
constructivealgorithms LogisticRegression...
constructivealgorithms RidgeClassifier...
constructivealgorithms SVC...


{'math':                     accuracy precision    recall   f1
 LinearRegression    0.754783  0.797203   0.07024  NaN
 LogisticRegression  0.786671  0.702703  0.304375  NaN
 RidgeClassifier     0.787309  0.700974  0.310536  NaN
 SVC                 0.917251  0.979167  0.695009  NaN,
 'datastructures':                     accuracy precision    recall   f1
 LinearRegression    0.841199  0.884615  0.044402  NaN
 LogisticRegression  0.856983  0.749104  0.201737  NaN
 RidgeClassifier     0.855548  0.748092  0.189189  NaN
 SVC                 0.916773  0.990458  0.500965  NaN,
 'graphs':                     accuracy precision    recall   f1
 LinearRegression    0.897959  0.894231  0.128809  NaN
 LogisticRegression  0.911671      0.75   0.34903  NaN
 RidgeClassifier     0.911352  0.735795  0.358726  NaN
 SVC                 0.962213  0.967245  0.695291  NaN,
 'greedy':                     accuracy precision    recall   f1
 LinearRegression     0.75845  0.865169  0.048734  NaN
 LogisticRegress

In [8]:
tags_results["dp"]

Unnamed: 0,accuracy,precision,recall,f1
LinearRegression,0.791932,0.759259,0.030758,
LogisticRegression,0.805006,0.682119,0.154539,
RidgeClassifier,0.805325,0.664706,0.169542,
SVC,0.892379,0.982405,0.502626,


In [9]:
tags_results["math"]

Unnamed: 0,accuracy,precision,recall,f1
LinearRegression,0.754783,0.797203,0.07024,
LogisticRegression,0.786671,0.702703,0.304375,
RidgeClassifier,0.787309,0.700974,0.310536,
SVC,0.917251,0.979167,0.695009,


In [10]:
tags_results["greedy"]

Unnamed: 0,accuracy,precision,recall,f1
LinearRegression,0.75845,0.865169,0.048734,
LogisticRegression,0.786511,0.681203,0.286709,
RidgeClassifier,0.789222,0.686957,0.3,
SVC,0.914381,0.986021,0.66962,


In [11]:
tags_results["geometry"]

Unnamed: 0,accuracy,precision,recall,f1
LinearRegression,0.95727,1.0,0.053004,
LogisticRegression,0.970185,0.893443,0.385159,
RidgeClassifier,0.967474,0.89899,0.314488,
SVC,0.98868,0.995327,0.75265,


In [12]:
tags_results["constructivealgorithms"]

Unnamed: 0,accuracy,precision,recall,f1
LinearRegression,0.85236,1.0,0.019068,
LogisticRegression,0.865115,0.705882,0.177966,
RidgeClassifier,0.859694,0.686047,0.125,
SVC,0.915816,0.997608,0.441737,
