In [1]:
import pandas as pd
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import LocalOutlierFactor

In [2]:
# Load the tab-separated feature table, then split it on the 'type' column:
#   train_df keeps every row whose type is not 'con',
#   test_df  keeps every row whose type is not 'nat'.
# NOTE(review): presumably 'nat' = natural and 'con' = constructed languages - confirm.
df = pd.read_csv('./clf_df', sep='\t')
train_df = df.loc[df['type'].ne('con')]
test_df = df.loc[df['type'].ne('nat')]

In [3]:
# Feature matrices: every column from position 2 onward, rounded to 3 decimals.
nat_features = train_df.iloc[:, 2:].values.round(decimals=3)
con_features = test_df.iloc[:, 2:].values.round(decimals=3)

# Standardisation statistics are learned from the training rows only and then
# reused unchanged for the held-out rows.
scaler = StandardScaler().fit(nat_features)
X_nat = scaler.transform(nat_features)
X_con = scaler.transform(con_features)

# Display both scaled matrices as the cell output.
X_nat, X_con

(array([[-0.35412459, -0.73496098, -0.55547899, -0.7576809 , -0.2455142 ],
        [-2.07457989, -0.43412244, -1.9525467 , -1.41243624, -1.13825629],
        [-0.12394361,  0.69307232,  1.10182682,  0.41046215,  1.19307592],
        [ 1.17451322, -0.67725352, -0.69244641, -0.40426182, -0.59459457],
        [-0.14755191,  0.40616317,  0.29371903, -0.16988917,  0.03870573],
        [-0.79678033, -0.49363891, -0.39111808, -0.36333961, -0.59022196],
        [ 1.46371497,  0.80360289,  1.0607366 ,  1.02801549,  1.00578227],
        [ 0.36592874, -1.53960908, -0.19936369,  2.15523633, -1.32627871],
        [ 0.49282339,  1.97674654,  1.33467144, -0.48610624,  1.6573018 ]]),
 array([[-0.56069727,  0.10224932, -0.77462687, -0.41542242, -0.4502983 ],
        [ 0.73775956, -0.93123867, -2.02103041, -1.35663322, -1.61414248],
        [ 0.26854448, -0.61176733, -2.07581738, -0.91392933, -1.16230567],
        [-7.82615344, -1.17002813, -2.55520336, -0.70187788, -2.39975552]]))

In [4]:
# One-class SVM fitted on the scaled training matrix only (X_nat).
svm = OneClassSVM(
    nu=0.05,
    gamma='scale',
    kernel='rbf',
)
svm.fit(X_nat)

In [5]:
# Score the held-out rows with the fitted one-class SVM and print the
# predictions keyed by each row's 'corpus' label.
svm_pred = svm.predict(X_con)
print(dict(zip(test_df['corpus'], svm_pred)))

{'eo': -1, 'lfn': -1, 'ia': -1, 'io': -1}


In [6]:
# Local Outlier Factor in novelty mode: fitted on X_nat only, then used to
# score unseen rows via predict().
#
# The originally requested 18 neighbours exceeds the training set size (the
# displayed X_nat has 9 rows). scikit-learn would emit a UserWarning and
# silently fall back to n_samples - 1, so the cap is applied explicitly here
# to make the value actually used visible. The floor of 1 keeps the argument
# valid for a degenerate single-row training set.
lof_neighbors = max(1, min(18, len(X_nat) - 1))
lof = LocalOutlierFactor(n_neighbors=lof_neighbors, algorithm='auto', novelty=True, contamination=0.1)
lof.fit(X_nat)



In [7]:
# Score the held-out rows with the fitted LOF model and print the
# predictions keyed by each row's 'corpus' label.
lof_pred = lof.predict(X_con)
print(dict(zip(test_df['corpus'], lof_pred)))

{'eo': 1, 'lfn': 1, 'ia': 1, 'io': -1}
