# 1. Imports

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score
from sklearn.model_selection import cross_val_score
from tpot import TPOTClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from jcopml.tuning import bayes_search_params as bsp
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
import xgboost as xgb
from sklearn.metrics import accuracy_score
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# 2. Load and preprocess Data

In [12]:
### read data
data = pd.read_csv("/Users/timcerta/code/jbaccarin/xref/raw_data/gcj2008.csv")
# Remove NAs
data = data.dropna()
# Remove code with less than x characters
data = data.loc[data['flines'].str.len() > 5]
# Remove users with entries < 25
data["username"].value_counts()
data = data[data['username'].map(data['username'].value_counts()) > 25].reset_index(drop = True)
# when there are more than 1 submissions, keep only the last one
data = data.drop_duplicates(subset=['year', 'round', 'username', 'task'], keep='first')

In [13]:
target_encoder = LabelEncoder().fit(data['username']) 
y = target_encoder.transform(data['username'])

X = data["flines"]

# 3. Train-test-split

In [14]:
# train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

# 4. Build Pipeline

In [15]:
# Pipeline vectorizer + linear SVC
pipeline_svc = make_pipeline(
    TfidfVectorizer(), 
    LinearSVC()
)

pipeline_svc.get_params()

# 5. Cross-validate

In [16]:
# Cross-validation
cv_results = cross_validate(pipeline_svc, X_test, y_test, cv = 5)
average_score = cv_results["test_score"].mean()
np.round(average_score,2)



0.27

# 6. Build untuned model

In [17]:
pipeline_svc.fit(X_train,y_train)
res = pipeline_svc.score(X_test,y_test)
res

0.5352112676056338

# 7. Tune model

In [18]:
pipeline_svc.get_params()

{'memory': None,
 'steps': [('tfidfvectorizer', TfidfVectorizer()),
  ('multinomialnb', MultinomialNB())],
 'verbose': False,
 'tfidfvectorizer': TfidfVectorizer(),
 'multinomialnb': MultinomialNB(),
 'tfidfvectorizer__analyzer': 'word',
 'tfidfvectorizer__binary': False,
 'tfidfvectorizer__decode_error': 'strict',
 'tfidfvectorizer__dtype': numpy.float64,
 'tfidfvectorizer__encoding': 'utf-8',
 'tfidfvectorizer__input': 'content',
 'tfidfvectorizer__lowercase': True,
 'tfidfvectorizer__max_df': 1.0,
 'tfidfvectorizer__max_features': None,
 'tfidfvectorizer__min_df': 1,
 'tfidfvectorizer__ngram_range': (1, 1),
 'tfidfvectorizer__norm': 'l2',
 'tfidfvectorizer__preprocessor': None,
 'tfidfvectorizer__smooth_idf': True,
 'tfidfvectorizer__stop_words': None,
 'tfidfvectorizer__strip_accents': None,
 'tfidfvectorizer__sublinear_tf': False,
 'tfidfvectorizer__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tfidfvectorizer__tokenizer': None,
 'tfidfvectorizer__use_idf': True,
 'tfidfvectorizer__vocab

## 7.1 do bayesian optimization on every param except ngram_range

In [19]:
# create params
# log-uniform: understand as search over p = exp(x) by varying x
bs_opt = BayesSearchCV(
    pipeline_svc,
     {
         'multinomialnb__alpha': Real(0.01, 1, prior='log-uniform'),
         'tfidfvectorizer__min_df': Integer(low=0, high=150, prior='uniform'),
         'tfidfvectorizer__max_df': Real(low=0.2, high=0.35, prior='uniform'),
         #'tfidfvectorizer__ngram_range':  Categorical([(1,1), (1,2)])
         #'tfidfvectorizer__ngram_range': Categorical([(1,1), (1,2), (1,3), (1,4), (1,5),(2, 2), (3,3), (4,4), (5,5)])
     },
     n_iter=32,
     random_state=0
 )

In [20]:
# Execute Bayesian OPtimization
res = bs_opt.fit(X_train, y_train)
res



In [21]:
# Perform Grid Search
grid_search = GridSearchCV(
    pipeline_svc,
    {
    'tfidfvectorizer__ngram_range': [(2,2), (3, 3), (4, 4), (5, 5), (1, 2), (1, 3), (1, 4), (1, 5)],
    'linearsvc__alpha': [bs_opt.best_params_["multinomialnb__alpha"]],
    'tfidfvectorizer__min_df': [bs_opt.best_params_["tfidfvectorizer__min_df"]],
    'tfidfvectorizer__max_df': [bs_opt.best_params_["tfidfvectorizer__max_df"]],
    },
    cv = 5,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

# Best score
print(f"Best Score = {grid_search.best_score_}")

# Best params
print(f"Best params = {grid_search.best_params_}")

NameError: name 'GridSearchCV' is not defined

## Result of tuned NB model

In [None]:
print(grid_search.score(X_test, y_test))

In [None]:
bs_opt_tuned = grid_search.best_estimator_

# 8. Confusion Matrix

In [None]:
#X, y = make_classification(random_state=0)
#X_train, X_test, y_train, y_test = train_test_split(
#X, y, random_state=0)
#clf = SVC(random_state=0)
#clf.fit(X_train, y_train)
#SVC(random_state=0)
plot_confusion_matrix(bs_opt_tuned, X_test, y_test)
plt.show()