In [59]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [60]:
# Split points handed to YouTubeDataset.
# NOTE(review): presumably thresholds that bin the target into classes — confirm in dataset.py.
splits = [10,20,30]
# One more class than there are split points; train_model reads this global
# when it calls get_scores.
num_classes = len(splits)+1
from dataset import YouTubeDataset
dataset = YouTubeDataset(splits)

In [113]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile

from inference import eval, get_scores
from torch.nn.functional import cross_entropy
from sklearn.svm import SVC, LinearSVC
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier

def train_model(dataset, train_val_split = None):
    """Fit a TF-IDF + one-vs-rest LinearSVC classifier and print its metrics.

    Parameters
    ----------
    dataset
        Object supporting ``len()``, with a ``text`` attribute indexable by a
        single id and a ``label`` attribute indexable by a sequence of ids
        whose result exposes ``.numpy()`` (presumably a torch tensor —
        TODO confirm against dataset.py).
    train_val_split : tuple, optional
        ``(train_ids, val_ids)``. When omitted, the model is trained on every
        sample and no validation scoring is performed.

    Returns
    -------
    tuple
        ``(val_accuracy, val_precision, val_recall, val_f1)`` as produced by
        ``get_scores``. All four entries are ``None`` when no validation split
        was supplied.

    Notes
    -----
    Reads the module-level ``num_classes`` when calling ``get_scores``.
    """
    has_validation = train_val_split is not None
    if has_validation:
        train_ids, val_ids = train_val_split
    else:
        train_ids = list(range(len(dataset)))
        val_ids = None

    X_train = [dataset.text[i] for i in train_ids]
    y_train = dataset.label[train_ids].numpy()

    # The vocabulary and IDF weights are learned from the training fold only.
    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2)
    X_train = vectorizer.fit_transform(X_train).toarray()

    # Fit the model
    base_clf = LinearSVC(penalty='l2', loss='squared_hinge', dual=False)
    model = OneVsRestClassifier(base_clf)
    model.fit(X_train, y_train)

    # Score the predictions on the training set.
    y_pred = model.predict(X_train)
    train_accuracy, train_precision, train_recall, train_f1 = get_scores(y_train, y_pred, num_classes)
    print(f"    Training Set - accuracy: {train_accuracy:.2f}, precision: {train_precision:.2f}, recall: {train_recall:.2f}, f1-score: {train_f1:.2f},")

    if not has_validation:
        # Nothing to validate against; previously this path raised TypeError
        # while iterating over val_ids=None.
        return None, None, None, None

    # Transform the validation texts with the vectorizer fitted on the training fold.
    X_val = vectorizer.transform([dataset.text[i] for i in val_ids])
    y_val = dataset.label[val_ids].numpy()
    y_pred_val = model.predict(X_val)

    val_accuracy, val_precision, val_recall, val_f1 = get_scores(y_val, y_pred_val, num_classes)
    print(f"    Validation Set - accuracy: {val_accuracy:.2f}, precision: {val_precision:.2f}, recall: {val_recall:.2f}, f1-score: {val_f1:.2f},")
    return val_accuracy, val_precision, val_recall, val_f1

In [107]:
from sklearn.model_selection import KFold
import numpy as np

def train_model_cv5(dataset):
    """Run 5-fold cross-validation of ``train_model`` and print the mean scores.

    Folds come from ``KFold(n_splits=5)`` with its default settings. For each
    fold a header with the first and last validation index is printed, then
    ``train_model`` is called with that fold's index arrays. Finally the mean
    validation accuracy, precision, recall and f1 over the folds are printed
    on one line. Nothing is returned.
    """
    splitter = KFold(n_splits=5)
    fold_scores = []
    for fold_no, (train_index, val_index) in enumerate(splitter.split(dataset), start=1):
        print("Fold "+str(fold_no)+" (val", val_index[0],"-",str(val_index[-1])+")")
        accuracy, precision, recall, f1 = train_model(
            dataset=dataset,
            train_val_split=(train_index, val_index),
        )
        fold_scores.append((accuracy, precision, recall, f1))

    # Regroup the per-fold tuples into one list per metric before averaging.
    accuracies, precisions, recalls, f1s = (list(column) for column in zip(*fold_scores))
    mean_accuracy = np.array(accuracies).mean()
    mean_precision = np.array(precisions).mean()
    mean_recall = np.array(recalls).mean()
    mean_f1 = np.array(f1s).mean()

    print(f"{mean_accuracy: .3f} {mean_precision: .3f} {mean_recall: .3f} {mean_f1:.3f}")

In [108]:
# Cross-validate on the current global `dataset`; prints per-fold metrics and
# the fold means (accuracy, precision, recall, f1).
# NOTE(review): this cell's execution count (108) is lower than that of the
# train_model definition cell (113), so the output below may come from an
# earlier version of train_model — re-run top to bottom to confirm.
train_model_cv5(dataset)

Fold 1 (val 0 - 42)
    Training Set - accuracy: 1.00, precision: 1.00, recall: 1.00, f1-score: 1.00,
    Validation Set - accuracy: 0.79, precision: 0.32, recall: 0.32, f1-score: 0.32,
Fold 2 (val 43 - 84)
    Training Set - accuracy: 1.00, precision: 1.00, recall: 1.00, f1-score: 1.00,
    Validation Set - accuracy: 0.67, precision: 0.29, recall: 0.31, f1-score: 0.30,
Fold 3 (val 85 - 126)
    Training Set - accuracy: 1.00, precision: 1.00, recall: 1.00, f1-score: 1.00,
    Validation Set - accuracy: 0.40, precision: 0.21, recall: 0.27, f1-score: 0.20,
Fold 4 (val 127 - 168)
    Training Set - accuracy: 1.00, precision: 1.00, recall: 1.00, f1-score: 1.00,
    Validation Set - accuracy: 0.50, precision: 0.25, recall: 0.31, f1-score: 0.26,
Fold 5 (val 169 - 210)
    Training Set - accuracy: 1.00, precision: 1.00, recall: 1.00, f1-score: 1.00,
    Validation Set - accuracy: 0.48, precision: 0.26, recall: 0.30, f1-score: 0.25,
 0.568  0.266  0.299 0.267


In [114]:
# Repeat the cross-validation with two split points (three classes).
# Rebinds the globals `splits`, `num_classes` and `dataset`.
splits = [10,20]
num_classes = len(splits)+1
from dataset import YouTubeDataset
dataset = YouTubeDataset(splits)
train_model_cv5(dataset)
# Recorded fold means (accuracy, precision, recall, f1): 0.577  0.466  0.439 0.405

Fold 1 (val 0 - 42)
    Training Set - accuracy: 1.00, precision: 1.00, recall: 1.00, f1-score: 1.00,
    Validation Set - accuracy: 0.81, precision: 0.60, recall: 0.60, f1-score: 0.60,
Fold 2 (val 43 - 84)
    Training Set - accuracy: 1.00, precision: 1.00, recall: 1.00, f1-score: 1.00,
    Validation Set - accuracy: 0.67, precision: 0.39, recall: 0.41, f1-score: 0.40,
Fold 3 (val 85 - 126)
    Training Set - accuracy: 1.00, precision: 1.00, recall: 1.00, f1-score: 1.00,
    Validation Set - accuracy: 0.43, precision: 0.63, recall: 0.39, f1-score: 0.33,
Fold 4 (val 127 - 168)
    Training Set - accuracy: 1.00, precision: 1.00, recall: 1.00, f1-score: 1.00,
    Validation Set - accuracy: 0.50, precision: 0.33, recall: 0.41, f1-score: 0.35,
Fold 5 (val 169 - 210)
    Training Set - accuracy: 1.00, precision: 1.00, recall: 1.00, f1-score: 1.00,
    Validation Set - accuracy: 0.48, precision: 0.38, recall: 0.39, f1-score: 0.35,
 0.577  0.466  0.439 0.405


In [115]:
# Repeat the cross-validation with a single split point (two classes).
# Rebinds the globals `splits`, `num_classes` and `dataset`.
splits = [10]
num_classes = len(splits)+1
from dataset import YouTubeDataset
dataset = YouTubeDataset(splits)
train_model_cv5(dataset)
# Recorded fold means (accuracy, precision, recall, f1): 0.597  0.572  0.578 0.555

Fold 1 (val 0 - 42)
    Training Set - accuracy: 1.00, precision: 1.00, recall: 1.00, f1-score: 1.00,
    Validation Set - accuracy: 0.72, precision: 0.60, recall: 0.66, f1-score: 0.61,
Fold 2 (val 43 - 84)
    Training Set - accuracy: 1.00, precision: 1.00, recall: 1.00, f1-score: 1.00,
    Validation Set - accuracy: 0.64, precision: 0.57, recall: 0.57, f1-score: 0.57,
Fold 3 (val 85 - 126)
    Training Set - accuracy: 1.00, precision: 1.00, recall: 1.00, f1-score: 1.00,
    Validation Set - accuracy: 0.43, precision: 0.44, recall: 0.44, f1-score: 0.43,
Fold 4 (val 127 - 168)
    Training Set - accuracy: 1.00, precision: 1.00, recall: 1.00, f1-score: 1.00,
    Validation Set - accuracy: 0.57, precision: 0.62, recall: 0.59, f1-score: 0.56,
Fold 5 (val 169 - 210)
    Training Set - accuracy: 1.00, precision: 1.00, recall: 1.00, f1-score: 1.00,
    Validation Set - accuracy: 0.62, precision: 0.61, recall: 0.62, f1-score: 0.61,
 0.597  0.572  0.578 0.555
