### Imports

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Read Data

In [4]:
df = pd.read_csv('data/data3.csv', index_col=0)
df.head()

Unnamed: 0_level_0,catala,espanyol,alemany,frances,polones,portugues,rus,italia,suec
angles,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
as,com,como,wie,comme,jak,Como,в качестве,come,som
his,seva,su,seine,le sien,jego,seu,его,il suo,hans
that,que,que,das,ce,że,este,что,Quello,den där
he,ell,él,er,il,on,ele,он,lui,han
was,era,estaba,war,a été,był,foi,был,era,var


### Preprocessing

In [5]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.decomposition import PCA

In [6]:
# Separate features from target
X = df.drop(['word', 'lang'], axis=1)
y = df['lang']

KeyError: "['word', 'lang'] not found in axis"

In [None]:
# Split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43) # try different test_size

In [None]:
# Scale Data: https://towardsdatascience.com/feature-scaling-and-normalisation-in-a-nutshell-5319af86f89b
scaler = StandardScaler() # try different scalers
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Dimensionality Reduction: https://www.mikulskibartosz.name/pca-how-to-choose-the-number-of-components/
pca = PCA(n_components=0.99, random_state=43) # try different n_components
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

### Model

In [None]:
from sklearn import svm

In [None]:
parameters = {'C': [0.1, 1, 5, 10], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['linear', 'rbf'], "decision_function_shape": ["ovo", "ovr"]}
grid = GridSearchCV(svm.SVC(), parameters, refit=True, verbose=3)
grid.fit(X_train_pca, y_train)
print(grid.best_params_)
print(grid.best_estimator_)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
[CV 1/5] END C=0.1, decision_function_shape=ovo, gamma=1, kernel=linear;, score=0.464 total time=   1.0s
[CV 2/5] END C=0.1, decision_function_shape=ovo, gamma=1, kernel=linear;, score=0.494 total time=   1.0s
[CV 3/5] END C=0.1, decision_function_shape=ovo, gamma=1, kernel=linear;, score=0.485 total time=   1.0s
[CV 4/5] END C=0.1, decision_function_shape=ovo, gamma=1, kernel=linear;, score=0.477 total time=   0.9s
[CV 5/5] END C=0.1, decision_function_shape=ovo, gamma=1, kernel=linear;, score=0.485 total time=   0.9s
[CV 1/5] END C=0.1, decision_function_shape=ovo, gamma=1, kernel=rbf;, score=0.256 total time=   2.3s
[CV 2/5] END C=0.1, decision_function_shape=ovo, gamma=1, kernel=rbf;, score=0.250 total time=   3.0s
[CV 3/5] END C=0.1, decision_function_shape=ovo, gamma=1, kernel=rbf;, score=0.249 total time=   2.4s
[CV 4/5] END C=0.1, decision_function_shape=ovo, gamma=1, kernel=rbf;, score=0.249 total time=   2.3s
[CV 5

In [None]:
# Create Models
models_pca = [
    svm.SVC(kernel='linear', probability=True, random_state=43),
    svm.SVC(kernel='poly', probability=True, random_state=43),
    svm.SVC(kernel='rbf', probability=True, random_state=43),
    svm.SVC(C=10, kernel='rbf', decision_function_shape='ovo', gamma=0.01)
]

for model_pca in models_pca:
    model_pca.fit(X_train_pca, y_train)

#### Check Results

In [None]:
for i, model_pca in enumerate(models_pca):
    score_pca = model_pca.score(X_test_pca, y_test).__round__(4) * 100
    print(f'Model accuracy: {score_pca}%')

Model accuracy: 49.34%
Model accuracy: 42.91%
Model accuracy: 49.19%
Model accuracy: 47.67%


| test_size | Scaler              | PCA  | Kernel | Score  |
|-----------|---------------------|------|--------|--------|
| 0.2       | Standard            | 0.95 | linear | 49.24% |
| 0.2       | Standard            | 0    | linear | 48.29% |
| 0.2       | MinMax              | 0.95 | linear | 45.7%  |
| 0.2       | MinMax              | 0    | linear | 48.23% |
| 0.2       | MaxAbs              | 0.95 | linear | 45.7%  |
| 0.2       | MaxAbs              | 0    | linear | 48.08% |
| 0.2       | RobustScaler        | 0.95 | linear | 47.47% |
| 0.2       | RobustScaler        | 0    | linear | 49.39% |
| 0.2       | Normalizer          | 0.95 | linear | 44.94% |
| 0.2       | Normalizer          | 0    | linear | 47.06% |
| 0.2       | QuantileTransformer | 0.95 | linear | 47.22% |
| 0.2       | QuantileTransformer | 0    | linear | 48.58% |
| 0.2       | PowerTransformer    | 0.95 | linear | 47.57% |
| 0.2       | PowerTransformer    | 0    | linear | 47.27% |

pca = 46.834
no_pca = 48.12

| test_size | Scaler              | PCA  | Kernel | Score  |
|-----------|---------------------|------|--------|--------|
| 0.2       | RobustScaler        | 0    | linear | 49.39% |
| 0.2       | Standard            | 0.95 | linear | 49.24% |
| 0.2       | QuantileTransformer | 0    | linear | 48.58% |
| 0.2       | Standard            | 0    | linear | 48.29% |
| 0.2       | MinMax              | 0    | linear | 48.23% |
| 0.2       | MaxAbs              | 0    | linear | 48.08% |
| 0.2       | PowerTransformer    | 0.95 | linear | 47.57% |
| 0.2       | RobustScaler        | 0.95 | linear | 47.47% |
| 0.2       | PowerTransformer    | 0    | linear | 47.27% |
| 0.2       | QuantileTransformer | 0.95 | linear | 47.22% |
| 0.2       | Normalizer          | 0    | linear | 47.06% |
| 0.2       | MinMax              | 0.95 | linear | 45.7%  |
| 0.2       | MaxAbs              | 0.95 | linear | 45.7%  |
| 0.2       | Normalizer          | 0.95 | linear | 44.94% |
