## Imports

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score
from sklearn.model_selection import cross_val_score
from tpot import TPOTClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from jcopml.tuning import bayes_search_params as bsp

## Data Import and Ptfidfvectorizer

In [7]:
### read data
data = pd.read_csv("/Users/timcerta/code/jbaccarin/xref/raw_data/gcj2008.csv")
# Remove NAs
data = data.dropna()
# Remove code with less than x characters
data = data.loc[data['flines'].str.len() > 5]
# Remove users with entries < 25
data["username"].value_counts()
data = data[data['username'].map(data['username'].value_counts()) > 25].reset_index(drop = True)
# when there are more than 1 submissions, keep only the last one
data = data.drop_duplicates(subset=['year', 'round', 'username', 'task'], keep='first')


## Overview

In [8]:
data.head()

Unnamed: 0.1,Unnamed: 0,year,round,username,task,solution,file,full_path,flines
0,20,2008,32002,KOTEHOK,24445,0,x.java,gcj/2008/32002/KOTEHOK/24445/0/extracted/x.java,import java.util.*;\n import java.io.*;\n \n p...
1,21,2008,32002,KOTEHOK,24443,1,x.java,gcj/2008/32002/KOTEHOK/24443/1/extracted/x.java,import java.util.*;\n import java.io.*;\n \n p...
3,23,2008,32002,KOTEHOK,24444,0,x.java,gcj/2008/32002/KOTEHOK/24444/0/extracted/x.java,import java.util.*;\n import java.io.*;\n \n p...
4,24,2008,32002,KOTEHOK,24446,0,x.java,gcj/2008/32002/KOTEHOK/24446/0/extracted/x.java,import java.util.*;\n import java.io.*;\n \n p...
5,104,2008,32002,Huayang,24445,1,p.cpp,gcj/2008/32002/Huayang/24445/1/extracted/p.cpp,#include <iostream>\n #include <cmath>\n #incl...


In [223]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2130 entries, 0 to 3890
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  2130 non-null   int64 
 1   year        2130 non-null   int64 
 2   round       2130 non-null   int64 
 3   username    2130 non-null   object
 4   task        2130 non-null   int64 
 5   solution    2130 non-null   int64 
 6   file        2130 non-null   object
 7   full_path   2130 non-null   object
 8   flines      2130 non-null   object
dtypes: int64(5), object(4)
memory usage: 166.4+ KB


# Modeling tfidfvectorizeraration

## tfidfvectorizerare X and y

In [12]:
target_encoder = LabelEncoder().fit(data['username']) 
y = target_encoder.transform(data['username'])
y

array([23, 23, 23, ..., 25, 25, 25])

In [11]:
X = data["flines"]
X

0       import java.util.*;\n import java.io.*;\n \n p...
1       import java.util.*;\n import java.io.*;\n \n p...
3       import java.util.*;\n import java.io.*;\n \n p...
4       import java.util.*;\n import java.io.*;\n \n p...
5       #include <iostream>\n #include <cmath>\n #incl...
                              ...                        
3885    #include <iostream>\n #include <string>\n \n u...
3886    #include <iostream>\n #include <algorithm>\n \...
3887    import sys\n import math\n \n MOD = 1000000007...
3888    import sys\n import math\n \n def do_test(inpu...
3890    import sys\n import math\n \n def do_test(inpu...
Name: flines, Length: 2130, dtype: object

## Build TfIdf
Actually, we don't need it, since we put the vectorizer in a pipeline

In [228]:
# Instantiating the TfidfVectorizer
tf_idf_vectorizer = TfidfVectorizer(min_df = 5)

# Training it on the texts
X_tfidf = pd.DataFrame(tf_idf_vectorizer.fit_transform(X).toarray(),
                 columns = tf_idf_vectorizer.get_feature_names_out())

X_tfidf

Unnamed: 0,00,000,0000,000000,01,02,027,02d,03,03d,...,zexp,zfact,zinv,zip,zipwith,zlog,zmul,zpow,zz,zzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2127,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2128,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [233]:
# ngram tfidf
tf_idf_vectorizer = TfidfVectorizer(min_df = 5, analyzer = "word", ngram_range =(1,6))

# Training it on the texts
X_ngram_tfidf = pd.DataFrame(tf_idf_vectorizer.fit_transform(X).toarray(),
                 columns = tf_idf_vectorizer.get_feature_names_out())

X_ngram_tfidf

Unnamed: 0,00,000,000 000,000 000 takes,000 000 takes about,000 000 takes about 5s,000 000 takes about 5s bool,000 takes,000 takes about,000 takes about 5s,...,zz fprintf,zz fprintf stderr,zz fprintf stderr working,zz fprintf stderr working on,zz fprintf stderr working on zz,zz int,zz res,zz result,zz scanf,zzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2127,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2128,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [230]:
# train-test-split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.33, random_state=42)

In [231]:
nb = MultinomialNB()


cv_results = cross_val_score(nb, X_test, y_test, cv = 5, error_score='raise')
print(cv_results)
average_recall = cv_results.mean()
np.round(average_recall,2)



[0.35460993 0.31914894 0.40425532 0.42142857 0.37857143]


0.38

In [232]:
nb.fit(X_train, y_train)

nb.fit(X_train,y_train)
res = nb.score(X_test,y_test)
#nb.predict(X_test)
res

0.49359886201991465

In [9]:
# train-test-split
X_train, X_test, y_train, y_test = train_test_split(X_ngram_tfidf, y, test_size=0.33)

NameError: name 'X_ngram_tfidf' is not defined

# Modeling: Naive Bayes

## instantiate NB model and get cross validation score

In [235]:
nb = MultinomialNB()

cv_results = cross_val_score(nb, X_train, y_train, cv = 5, error_score='raise')
print(cv_results)
average_recall = cv_results.mean()
np.round(average_recall,2)



[0.82517483 0.82867133 0.83859649 0.84210526 0.82105263]


0.83

## Fit NB model & get score

In [236]:
nb.fit(X_train,y_train)
res = nb.score(X_test,y_test)
res

0.8193456614509246

In [262]:
pred = nb.predict(X_test)




ValueError: could not convert string to float: '#include <iostream>\n #include <fstream>\n #include <sstream>\n #include <vector>\n #include <string>\n #include <algorithm>\n #include <cmath>\n //#include <utility>\n //#include <set>\n //#include <map>\n //#include <queue>\n using namespace std;\n \n #define mset(A,B) memset(A,B,sizeof(A));\n #define mcpy(A,B) memcpy(A,B,sizeof(B));\n typedef long long ll;\n typedef long double ld;\n typedef vector<int> vint;\n //typedef vector<string> vstr;\n #define FI(I,L,U) for (int I=L;I<U;I++)\n #define sqr(x) ((x)*(x))\n \n int a[11000][2];\n int g[11000];\n int c[11000];\n \n inline void upd(int& x, int c1, int c2, int q)\n {\n \tif (c1 >= 0 && c2 >= 0)\n \t{\n \t\tq += c1+c2;\n \t\tif (x < 0 || q < x) x = q;\n \t}\n }\n \n int main()\n {\n \tint tcase = 0;\n \tifstream fin("z.in");\n \tofstream fout("z.out");\n \tfin >> tcase;\n \tfor (int tind = 1; tind <= tcase; tind++)\n \t{\n \t\t//istringstream strin();\n \t\tint m,v;\n \t\tfin >> m >> v;\n \t\tmset(a, 255);\n \t\tFI(i,1,(m-1)/2+1)\n \t\t\tfin >> g[i] >> c[i];\n \t\tint j = (m-1)/2+1;\n \t\tint k;\n \t\tFI(i,0,(m+1)/2)\n \t\t{\n \t\t\tfin >> k;\n \t\t\ta[i+j][k] = 0;\n \t\t}\n \t\tfor (int i = (m-1)/2; i >= 1; i--)\n \t\t{\n \t\t\tif (g[i] == 1 || c[i] == 1)\n \t\t\t{\t// and\n \t\t\t\tint q = 0;\n \t\t\t\tif (g[i] != 1) q = 1;\n \t\t\t\tupd(a[i][0], a[i*2][0], a[i*2+1][0], q);\n \t\t\t\tupd(a[i][0], a[i*2][0], a[i*2+1][1], q);\n \t\t\t\tupd(a[i][0], a[i*2][1], a[i*2+1][0], q);\n \t\t\t\tupd(a[i][1], a[i*2][1], a[i*2+1][1], q);\n \t\t\t}\n \t\t\tif (g[i] == 0 || c[i] == 1)\n \t\t\t{\t// or\n \t\t\t\tint q = 0;\n \t\t\t\tif (g[i] != 0) q = 1;\n \t\t\t\tupd(a[i][0], a[i*2][0], a[i*2+1][0], q);\n \t\t\t\tupd(a[i][1], a[i*2][0], a[i*2+1][1], q);\n \t\t\t\tupd(a[i][1], a[i*2][1], a[i*2+1][0], q);\n \t\t\t\tupd(a[i][1], a[i*2][1], a[i*2+1][1], q);\n \t\t\t}\n \t\t}\n \n \t\tint ans = a[1][v];\n \t\tif (ans < 0)\n \t\t\tfout << "Case #" << tind << ": " << "IMPOSSIBLE" << endl;\n \t\telse\n \t\t\tfout << "Case #" << tind << ": " << ans << endl;\n \t}\n \treturn 0;\n }\n'

In [238]:
y_test

array([109,  17,  66,  43,  38,  71, 105,  77,   8,  38, 111,  95, 115,
       104,  51,  64,  89,  22,  13,  43,  23, 119,  11,  96,  11, 115,
        63,  90,  91,  90, 114,  54,  64,  15, 112,  67,   0,  80,  21,
        81,  53,  86,  23,  16,   9,   9,  59,  48,  80,  55, 105,  23,
        66, 120,  99,  54,  19,  87,  31, 114,  27,  41,  82,  22,  50,
        98,  76,  60,  12,  61, 120,  64,  35,  65,   0,  93,  39,  57,
        45,  55, 101,  41,  47,  37,  37,  29,  70,  75, 118,  68, 101,
       117,  76, 109,  14,  27,   6,  56, 102,  90,  18,  99,  34,  79,
        76,  72,   4,  23,  42,  12,  53,   0,  46,  29,  19,  80,  37,
        73,   0,   8,  43,  44,  19,  83,  68,  94,  39, 114, 109, 105,
        47,  17,  74,  62,  47,  23,  41,  91,  44,   7, 111, 116,  77,
       100,   2,  30,  72,  77,  38,  47,  85,  63, 120,  97, 102, 102,
        89,  30,  32,   2,  64, 105,  64,  68,   2,  26, 110,  87,  78,
        11,  37,  26,  17,  45,  77,   3,  64,  49,  33,  51,  6

## Build tfidfvectorizer + NB Pipeline

In [13]:
# train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

In [244]:
# Pipeline vectorizer + Naive Bayes
pipeline_nb = make_pipeline(
    TfidfVectorizer(), 
    MultinomialNB()
)

In [239]:
# Cross-validation
cv_results = cross_validate(pipeline_nb, X, y, cv = 5)
average_recall = cv_results["test_score"].mean()
np.round(average_recall,2)



0.64

## GridSearch for NB Pipeline

Do train_test_split for usual X and y - rest will be done from the pipeline

In [240]:
from sklearn.model_selection import GridSearchCV

# Define the grid of parameters
parameters = {
    'tfidfvectorizer__ngram_range': ((1,1), (2,2), (3, 3), (4, 4), (5, 5), (1, 2), (1, 3), (1, 4), (1, 5)),
    'multinomialnb__alpha': (0.1,1)
}

# Perform Grid Search
grid_search = GridSearchCV(
    pipeline_naive_bayes,
    parameters,
    cv = 5,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X, y)

# Best score
print(f"Best Score = {grid_search.best_score_}")

# Best params
print(f"Best params = {grid_search.best_params_}")

Fitting 5 folds for each of 18 candidates, totalling 90 fits




Best Score = 0.9187793427230048
Best params = {'multinomialnb__alpha': 0.1, 'tfidfvectorizer__ngram_range': (4, 4)}


In [245]:
pipeline_nb.get_params()

{'memory': None,
 'steps': [('tfidfvectorizer', TfidfVectorizer()),
  ('multinomialnb', MultinomialNB())],
 'verbose': False,
 'tfidfvectorizer': TfidfVectorizer(),
 'multinomialnb': MultinomialNB(),
 'tfidfvectorizer__analyzer': 'word',
 'tfidfvectorizer__binary': False,
 'tfidfvectorizer__decode_error': 'strict',
 'tfidfvectorizer__dtype': numpy.float64,
 'tfidfvectorizer__encoding': 'utf-8',
 'tfidfvectorizer__input': 'content',
 'tfidfvectorizer__lowercase': True,
 'tfidfvectorizer__max_df': 1.0,
 'tfidfvectorizer__max_features': None,
 'tfidfvectorizer__min_df': 1,
 'tfidfvectorizer__ngram_range': (1, 1),
 'tfidfvectorizer__norm': 'l2',
 'tfidfvectorizer__preprocessor': None,
 'tfidfvectorizer__smooth_idf': True,
 'tfidfvectorizer__stop_words': None,
 'tfidfvectorizer__strip_accents': None,
 'tfidfvectorizer__sublinear_tf': False,
 'tfidfvectorizer__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tfidfvectorizer__tokenizer': None,
 'tfidfvectorizer__use_idf': True,
 'tfidfvectorizer__vocab

In [246]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# create params
# log-uniform: understand as search over p = exp(x) by varying x
bs_opt = BayesSearchCV(
    pipeline_nb,
     {
         'multinomialnb__alpha': Real(0.01, 1, prior='log-uniform'),
         'tfidfvectorizer__min_df': Integer(low=0, high=150, prior='uniform'),
         'tfidfvectorizer__max_df': Real(low=0.2, high=0.35, prior='uniform'),
         #'tfidfvectorizer__ngram_range':  Categorical([(1,1), (1,2)])
         #'tfidfvectorizer__ngram_range': Categorical([(1,1), (1,2), (1,3), (1,4), (1,5),(2, 2), (3,3), (4,4), (5,5)])
     },
     n_iter=32,
     random_state=0
 )

In [247]:
# Execute Bayesian OPtimization
res = bs_opt.fit(X_train, y_train)




In [248]:
# Best score
print(f"Best Score = {bs_opt.best_score_}")

# Best params
print(f"Best params = {bs_opt.best_params_}")

Best Score = 0.7484860048034836
Best params = OrderedDict([('multinomialnb__alpha', 0.01), ('tfidfvectorizer__max_df', 0.35), ('tfidfvectorizer__min_df', 11)])


In [249]:
bs_opt.best_params_["tfidfvectorizer__max_df"]

0.35

In [250]:
bs_opt_tuned = bs_opt.best_estimator_

In [251]:
# Obtain result
print(bs_opt.score(X_test, y_test))

0.7605633802816901


In [252]:
# Perform Grid Search
grid_search = GridSearchCV(
    pipeline_nb,
    {
    'tfidfvectorizer__ngram_range': [(2,2), (3, 3), (4, 4), (5, 5), (1, 2), (1, 3), (1, 4), (1, 5)],
    'multinomialnb__alpha': [bs_opt.best_params_["multinomialnb__alpha"]],
    'tfidfvectorizer__min_df': [bs_opt.best_params_["tfidfvectorizer__min_df"]],
    'tfidfvectorizer__max_df': [bs_opt.best_params_["tfidfvectorizer__max_df"]],
    },
    cv = 5,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_test, y_test)

# Best score
print(f"Best Score = {grid_search.best_score_}")

# Best params
print(f"Best params = {grid_search.best_params_}")

Fitting 5 folds for each of 8 candidates, totalling 40 fits




Best Score = 0.8200541338582678
Best params = {'multinomialnb__alpha': 0.01, 'tfidfvectorizer__max_df': 0.35, 'tfidfvectorizer__min_df': 11, 'tfidfvectorizer__ngram_range': (1, 4)}


In [253]:
print(grid_search.score(X_test, y_test))

0.9953051643192489


In [254]:
bs_opt_tuned = grid_search.best_estimator_

# Linear SVC

In [255]:
# Pipeline vectorizer + linear SVC
pipeline_svc = make_pipeline(
    TfidfVectorizer(), 
    LinearSVC()
)

pipeline_svc.get_params()

{'memory': None,
 'steps': [('tfidfvectorizer', TfidfVectorizer()), ('linearsvc', LinearSVC())],
 'verbose': False,
 'tfidfvectorizer': TfidfVectorizer(),
 'linearsvc': LinearSVC(),
 'tfidfvectorizer__analyzer': 'word',
 'tfidfvectorizer__binary': False,
 'tfidfvectorizer__decode_error': 'strict',
 'tfidfvectorizer__dtype': numpy.float64,
 'tfidfvectorizer__encoding': 'utf-8',
 'tfidfvectorizer__input': 'content',
 'tfidfvectorizer__lowercase': True,
 'tfidfvectorizer__max_df': 1.0,
 'tfidfvectorizer__max_features': None,
 'tfidfvectorizer__min_df': 1,
 'tfidfvectorizer__ngram_range': (1, 1),
 'tfidfvectorizer__norm': 'l2',
 'tfidfvectorizer__preprocessor': None,
 'tfidfvectorizer__smooth_idf': True,
 'tfidfvectorizer__stop_words': None,
 'tfidfvectorizer__strip_accents': None,
 'tfidfvectorizer__sublinear_tf': False,
 'tfidfvectorizer__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tfidfvectorizer__tokenizer': None,
 'tfidfvectorizer__use_idf': True,
 'tfidfvectorizer__vocabulary': None,
 'li

In [256]:
svc_params = {
 'linearsvc__C': Real(low=0.001, high=10, prior='log-uniform', transform='identity'),
 'tfidfvectorizer__min_df': Integer(low=0, high=150, prior='uniform'),
 'tfidfvectorizer__max_df': Real(low=0.2, high=0.35, prior='uniform'),
}

bs_opt = BayesSearchCV(
    pipeline_svc,
     svc_params,
     n_iter=32,
     random_state=0
 )

In [257]:
# Execute Bayesian OPtimization
res = bs_opt.fit(X_train, y_train)



In [258]:
# Best score
print(f"Best Score = {bs_opt.best_score_}")

# Best params
print(f"Best params = {bs_opt.best_params_}")

Best Score = 0.8128773764898656
Best params = OrderedDict([('linearsvc__C', 0.9074587572376299), ('tfidfvectorizer__max_df', 0.35), ('tfidfvectorizer__min_df', 0)])


In [259]:
# Perform Grid Search
grid_search = GridSearchCV(
    pipeline_svc,
    {
    'tfidfvectorizer__ngram_range': [(2,2), (3, 3), (4, 4), (5, 5), (1, 2), (1, 3), (1, 4), (1, 5)],
    'linearsvc__C': [bs_opt.best_params_["linearsvc__C"]],
    'tfidfvectorizer__min_df': [bs_opt.best_params_["tfidfvectorizer__min_df"]],
    'tfidfvectorizer__max_df': [bs_opt.best_params_["tfidfvectorizer__max_df"]],
    },
    cv = 5,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

# Best score
print(f"Best Score = {grid_search.best_score_}")

# Best params
print(f"Best params = {grid_search.best_params_}")

Fitting 5 folds for each of 8 candidates, totalling 40 fits




Best Score = 0.9456734977890507
Best params = {'linearsvc__C': 0.9074587572376299, 'tfidfvectorizer__max_df': 0.35, 'tfidfvectorizer__min_df': 0, 'tfidfvectorizer__ngram_range': (4, 4)}


In [260]:
print(grid_search.score(X_test, y_test))

0.94679186228482


# XGBoost
Currently not working, needs tooooo much time

In [16]:
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size = 0.5, random_state = 42
)

# Pipeline vectorizer + XGBoots
pipeline_xgb = make_pipeline(
    TfidfVectorizer(), 
    XGBClassifier(n_jobs=-1, 
                  random_state=42,
                  max_depth=10, 
                  n_estimators=100, 
                  learning_rate=0.1, 
                  early_stopping_rounds=5, 
                  eval_set=[(X_train, y_train), (X_val, y_val)]
                  )
    )

In [17]:
pipeline_xgb.fit(X_train, y_train,
    # evaluate loss at each iteration
    # stop iterating when eval loss increases 5 times in a row
)

xgb_y_pred = xgb_reg.predict(X_val)
print(pipeline_xgb.score(X_test, y_test))


ValueError: Pipeline.fit does not accept the eval_set parameter. You can pass parameters to specific steps of your pipeline using the stepname__parameter format, e.g. `Pipeline.fit(X, y, logisticregression__sample_weight=sample_weight)`.

In [276]:
grid_search = GridSearchCV(
    pipeline_xgb,
    {
    'tfidfvectorizer__ngram_range': [(3, 3), (4, 4), (5, 5), (1, 3), (1, 4), (1, 5)],
    },
    cv = 5,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

bs_opt = BayesSearchCV(
    {
  'xgbclassifier__colsample_bytree': Real(low=0.1, high=1, prior='uniform', transform='identity'),
  'xgbclassifier__gamma': Integer(low=1, high=10, prior='uniform', transform='identity'),
  'xgbclassifier__learning_rate': Real(low=0.01, high=1, prior='log-uniform', transform='identity'),
  'xgbclassifier__max_depth': Integer(low=1, high=10, prior='uniform', transform='identity'),
  'xgbclassifier__n_estimators': Integer(low=100, high=200, prior='uniform', transform='identity'),
  'xgbclassifier__reg_alpha': Real(low=0.001, high=10, prior='log-uniform', transform='identity'),
  'xgbclassifier__reg_lambda': Real(low=0.001, high=10, prior='log-uniform', transform='identity'),
  'xgbclassifier__subsample': Real(low=0.3, high=0.8, prior='uniform', transform='identity'),
  'tfidfvectorizer__min_df': Integer(low=5, high=150, prior='uniform'),
  'tfidfvectorizer__max_df': Real(low=0.2, high=0.35, prior='uniform'),
  'tfidfvectorizer__ngram_range': [grid_search.best_params_["tfidfvectorizer__ngram_range"]]
},
     svc_params,
     n_iter=32,
     random_state=0,
     n_jobs=-1,
     verbose=1
 )

bs_opt.fit(X_train, y_train)

# Best score
print(f"Best Score = {bs_opt.best_score_}")

# Best params
print(f"Best params = {bs_opt.best_params_}")

print(bs_opt.score(X_test, y_test))

Fitting 5 folds for each of 6 candidates, totalling 30 fits


6 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/timcerta/.pyenv/versions/3.8.12/envs/xref/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/timcerta/.pyenv/versions/3.8.12/envs/xref/lib/python3.8/site-packages/sklearn/pipeline.py", line 382, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/timcerta/.pyenv/versions/3.8.12/envs/xref/lib/python3.8/site-packages/xgboost/core.py", line 575, in inner_f
    return f(**kwargs)
  File "/Users/timcerta/.pyenv/versions/3.8.12/envs/

TypeError: estimator should be an estimator implementing 'fit' method, {'xgbclassifier__colsample_bytree': Real(low=0.1, high=1, prior='uniform', transform='identity'), 'xgbclassifier__gamma': Integer(low=1, high=10, prior='uniform', transform='identity'), 'xgbclassifier__learning_rate': Real(low=0.01, high=1, prior='log-uniform', transform='identity'), 'xgbclassifier__max_depth': Integer(low=1, high=10, prior='uniform', transform='identity'), 'xgbclassifier__n_estimators': Integer(low=100, high=200, prior='uniform', transform='identity'), 'xgbclassifier__reg_alpha': Real(low=0.001, high=10, prior='log-uniform', transform='identity'), 'xgbclassifier__reg_lambda': Real(low=0.001, high=10, prior='log-uniform', transform='identity'), 'xgbclassifier__subsample': Real(low=0.3, high=0.8, prior='uniform', transform='identity'), 'tfidfvectorizer__min_df': Integer(low=5, high=150, prior='uniform', transform='identity'), 'tfidfvectorizer__max_df': Real(low=0.2, high=0.35, prior='uniform', transform='identity'), 'tfidfvectorizer__ngram_range': [(3, 3)]} was passed