In [1]:
import itertools

import polars as pl

from sklearn.neighbors import KNeighborsClassifier

# 1. Modeling

## 1.1 TF-IDF

In [2]:
# Load the train/test TF-IDF tables with polars, then hand them over as
# pandas DataFrames; the closing expression shows both shapes.
train_tfidf, test_tfidf = (
    pl.read_csv(csv_path).to_pandas()
    for csv_path in ('output/tf_idf/train.csv', 'output/tf_idf/test.csv')
)
(train_tfidf.shape, test_tfidf.shape)

((11413, 28651), (4024, 28651))

### 1.1.1 KNN

In [3]:
import time

from collections import defaultdict

from sklearn.metrics import f1_score
from sklearn.model_selection import KFold

In [30]:
# Baseline estimator built with the library's default hyper-parameters.
knn = KNeighborsClassifier()

# Hyper-parameter grid. Every key must be an actual KNeighborsClassifier
# argument: the former 'metrics' entry was not one (the argument is spelled
# 'metric'), so its extra candidate ('manhattan') is folded into 'metric'.
# Key order matters: the grid is expanded with itertools.product in key
# order, so each combination reads (n_neighbors, metric, weights).
knn_params = {
    'n_neighbors': range(3, 5),  # candidate k values: 3 and 4
    'metric': ('cosine', 'euclidean', 'manhattan'),
    'weights': ('uniform', 'distance'),
}

In [31]:
# Expand the grid into an explicit list of value tuples, one per combination,
# with values ordered like the dict keys. The name is rebound from dict to
# list, so guard on the type: re-running this cell is then a no-op instead of
# a TypeError caused by indexing the already-expanded list with a tuple.
if isinstance(knn_params, dict):
    knn_params = list(itertools.product(*knn_params.values()))

In [22]:
def f1(y_true, y_pred, results):
    """Append three F1 variants for one set of predictions to ``results``.

    ``f1_score`` is called once per averaging mode and the value is appended
    to the list stored under the matching key:

    * ``'f1'``       -> ``average=None``
    * ``'f1_micro'`` -> ``average='micro'``
    * ``'f1_macro'`` -> ``average='macro'``

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        results: Mapping from metric name to a list of scores. It is mutated
            in place and must provide a list for each key above (for example
            a ``defaultdict(list)``).

    Returns:
        The same ``results`` object that was passed in.
    """
    averaging_by_key = {'f1': None, 'f1_micro': 'micro', 'f1_macro': 'macro'}
    for key, averaging in averaging_by_key.items():
        value = f1_score(y_true, y_pred, average=averaging)
        results[key].append(value)
    return results

In [23]:
def kfold(estimator, X, y, cv):
    """Cross-validate ``estimator`` and collect per-fold F1 scores and timings.

    Args:
        estimator: Object exposing ``fit(X, y)`` and ``predict(X)``. The same
            instance is re-fitted on every fold.
        X: Feature matrix supporting ``X[rows, :]`` indexing (e.g. a NumPy
            array).
        y: Labels supporting ``y[rows]`` indexing, aligned with ``X``.
        cv: Splitter whose ``split(X)`` yields ``(train_idx, test_idx)`` pairs.

    Returns:
        A ``(results, exec_time)`` tuple. ``results`` is a
        ``defaultdict(list)`` holding one entry per fold for every key that
        ``f1`` appends to, plus the fit + predict duration in seconds under
        ``'exec_time'``. ``exec_time`` is the sum of those durations.
    """
    results = defaultdict(list)

    for train_idx, test_idx in cv.split(X):

        # splitting data
        X_train, y_train = X[train_idx, :], y[train_idx]
        X_test, y_test = X[test_idx, :], y[test_idx]

        # training and predictions; perf_counter is monotonic, so the
        # measured duration cannot be skewed by wall-clock adjustments
        started = time.perf_counter()
        estimator.fit(X_train, y_train)
        pred = estimator.predict(X_test)
        elapsed = time.perf_counter() - started

        # validating: append (not overwrite) so every fold's timing is kept
        results['exec_time'].append(elapsed)
        results = f1(y_test, pred, results)

    # Previously this returned an undefined name and only worked when a
    # leftover kernel global happened to exist.
    exec_time = sum(results['exec_time'])
    return results, exec_time

In [44]:
# 5-fold cross-validation of a single KNN configuration. A fixed random_state
# makes the shuffled folds, and therefore the scores, identical across runs.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
a = kfold(
    KNeighborsClassifier(n_neighbors=3, metric='euclidean'),
    train_tfidf.iloc[:, 2:].values,  # columns 2+ are used as the features
    train_tfidf.iloc[:, 0].values,   # column 0 is used as the label
    kf)

In [45]:
# Display the value returned by the kfold(...) call assigned to `a`.
a

(defaultdict(list,
             {'f1': [array([0.76220472, 0.57142857, 0.06896552, 0.14634146, 0.29411765,
                      0.        , 0.63157895, 0.5       , 0.62745098, 0.66666667,
                      0.14285714, 0.19047619, 0.        , 0.45      , 0.        ,
                      0.62295082, 0.        , 0.19672131, 0.        , 0.91545687,
                      0.66666667, 0.22222222, 0.31111111, 0.59574468, 0.18181818,
                      0.        , 1.        , 0.28571429, 1.        , 0.        ,
                      0.66666667, 0.43939394, 0.66666667, 0.33333333, 0.        ,
                      0.66666667, 0.        , 0.        , 0.        , 0.14814815,
                      0.        , 0.33333333, 0.45522388, 0.70833333, 0.        ,
                      0.11764706, 0.        , 0.        , 0.08888889, 0.4       ,
                      0.28571429, 0.        , 0.        , 0.        , 0.        ,
                      0.55555556, 0.5       , 0.10526316, 0.8       , 0. 

In [None]:
# (result key, `average` argument) pairs evaluated with f1_score on each fold.
f1_scores = [('f1', None), ('f1_micro', 'micro'), ('f1_macro', 'macro')]

# Fixed seed: every hyper-parameter combination is scored on the same folds
# and the whole run is reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Per-combination fold scores, keyed by (n_neighbors, metric, weights).
knn_results = {}

# Each grid entry is a tuple of values in the key order of the parameter
# dict: (n_neighbors, metric, weights, ...). Any trailing extras are ignored,
# so the unpacking no longer fails on tuples longer than expected.
for n_neighbors, metric, weights, *_ in knn_params:

    knn = KNeighborsClassifier(n_neighbors=n_neighbors, metric=metric, weights=weights)

    results = {'f1': [], 'f1_macro': [], 'f1_micro': []}

    for train, test in kf.split(train_tfidf):

        # columns 2+ are used as features, column 0 as the label
        X_train, y_train = train_tfidf.iloc[train, 2:].values, train_tfidf.iloc[train, 0].values
        X_test, y_test = train_tfidf.iloc[test, 2:].values, train_tfidf.iloc[test, 0].values

        # Time fit + predict only (same span that kfold() measures); the
        # DataFrame slicing above is deliberately left out of the timing.
        start = time.perf_counter()
        knn.fit(X_train, y_train)
        pred = knn.predict(X_test)
        end = time.perf_counter()

        print(f'tempo de execução = {end - start}')

        # Distinct loop names: the original reused `k` here, shadowing the
        # outer loop variable.
        for name, average in f1_scores:
            score = f1_score(y_test, pred, average=average)
            results[name].append(score)

    knn_results[(n_neighbors, metric, weights)] = results

    print('_' * 80)
    # NOTE(review): stops after the first combination — looks like a
    # smoke-test leftover; remove this `break` to evaluate the full grid.
    break