# Unsupervised learning (one-class SVM)

In [1]:
# Load libraries
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
# Read the 2003 dataset from disk.
df_2003 = pd.read_csv('data/2003/clean_labeled_zero_2003.csv')
# Keep only the columns used below: five feature columns plus `Type`,
# which the one-class SVM experiment uses as the class label.
df_sub_2003 = df_2003.loc[:, ['BC', 'ID', 'OD', 'CC', 'DC', 'Type']]

In [3]:
def ocsvm(df, train_label):
    """Grid-search a one-class SVM trained on one class and score it on every class.

    For every combination of training-set size, ``nu`` and ``gamma``, an RBF
    ``OneClassSVM`` is fitted on a random sample (fixed seed) of the rows whose
    ``Type`` equals ``train_label``. The fitted model then predicts on all rows
    of each distinct ``Type`` value, and the percentage of rows predicted as
    inliers (``+1``) is recorded.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Type`` column; every other column is used as a feature.
    train_label
        Value of ``Type`` identifying the class the model is trained on.

    Returns
    -------
    pandas.DataFrame
        One row per (train_size, nu, gamma, test label) with the columns
        ``train_label``, ``test_label``, ``percentage_inliers``, ``gamma``,
        ``nu`` and ``train_size``.

    Raises
    ------
    ValueError
        If ``df`` has no rows with ``Type == train_label``. ``train_test_split``
        also raises ``ValueError`` when a training size is not smaller than the
        number of rows available for the training class.

    Notes
    -----
    The rows sampled for fitting are not removed from the evaluation set of
    ``train_label``, so the inlier percentage reported for that class is
    partly in-sample.
    """
    train_sizes = [10, 20, 30, 40, 50, 100, 300, 500]
    nu_values = [0.1, 0.05, 0.01, 0.005, 0.001]
    gamma_values = np.arange(0.1, 2, 0.1)

    train_features = df[df['Type'] == train_label].drop(columns=['Type'])
    if train_features.empty:
        raise ValueError(f"no rows with Type == {train_label!r} to train on")

    # The evaluation subset for each label does not depend on the
    # hyper-parameters, so build each one once instead of once per model.
    test_sets = [(label, df[df['Type'] == label].drop(columns=['Type']))
                 for label in df.Type.unique()]

    records = {'train_label': [],
               'test_label': [],
               'percentage_inliers': [],
               'gamma': [],
               'nu': [],
               'train_size': []}

    for size in train_sizes:
        # Only the sampled training part is needed; the fixed seed keeps the
        # sample for a given size reproducible across runs.
        xtrain, _ = train_test_split(train_features, train_size=size, random_state=42)
        for nu in nu_values:
            for gamma in gamma_values:
                clf = OneClassSVM(kernel='rbf', gamma=gamma, nu=nu, shrinking=True).fit(xtrain)
                for label, test in test_sets:
                    # predict() yields +1 for inliers and -1 for outliers.
                    inlier_count = np.count_nonzero(clf.predict(test) == 1)
                    percent = np.round(np.divide(100 * inlier_count, test.shape[0]), decimals=1)
                    records['train_label'].append(train_label)
                    records['test_label'].append(label)
                    records['percentage_inliers'].append(percent)
                    records['gamma'].append(gamma)
                    records['nu'].append(nu)
                    records['train_size'].append(size)

    return pd.DataFrame(records)

In [4]:
# Classes (values of `Type`) that each get their own one-class SVM sweep.
labels = [1, 2, 3]

# Run the sweep once per training class and gather the per-class tables.
per_label_results = []
for i in labels:
    results = ocsvm(df_sub_2003, i)
    per_label_results.append(results)

# Concatenate once at the end rather than growing a frame inside the loop,
# which recopies all earlier rows on every pass and starts from an empty frame.
# Row labels are kept as returned by each run, so they restart at 0 for every
# training class.
final_results = pd.concat(per_label_results)

In [5]:
# Show the combined results table as the cell output.
final_results

Unnamed: 0,train_label,test_label,percentage_inliers,gamma,nu,train_size
0,1,0,0.0,0.1,0.100,10
1,1,1,93.2,0.1,0.100,10
2,1,2,83.6,0.1,0.100,10
3,1,3,0.0,0.1,0.100,10
4,1,0,0.0,0.2,0.100,10
...,...,...,...,...,...,...
3035,3,3,93.9,1.8,0.001,500
3036,3,0,0.0,1.9,0.001,500
3037,3,1,0.0,1.9,0.001,500
3038,3,2,1.6,1.9,0.001,500


In [21]:
# Save the combined results; index=False leaves the row labels out of the CSV.
final_results.to_csv("data/ocsvm_results.csv", index=False)