# 4. Support Vector Machines (SVMs)

### Preprocesarea datelor (standardizare)

In [15]:
from sklearn import preprocessing, svm
from sklearn.metrics import f1_score
import numpy as np
import os

In [10]:
x_train = np.array([[1, -1, 2], [2, 0, 0], [0, 1, -1]], dtype=np.float64)
x_test = np.array([[-1, 1, 0]], dtype=np.float64)
scaler = preprocessing.StandardScaler()
scaler.fit(x_train)
print(scaler.mean_, end='\n\n') 
print(scaler.scale_, end='\n\n') 

scaled_x_train = scaler.transform(x_train)
print(scaled_x_train, end='\n\n') 

scaled_x_test = scaler.transform(x_test)
print(scaled_x_test, end='\n\n')

[1.         0.         0.33333333]

[0.81649658 0.81649658 1.24721913]

[[ 0.         -1.22474487  1.33630621]
 [ 1.22474487  0.         -0.26726124]
 [-1.22474487  1.22474487 -1.06904497]]

[[-2.44948974  1.22474487 -0.26726124]]


### Bag of Words (BoW) & SVM

In [5]:
train_sentences = np.load(os.path.join('data', 'training_sentences.npy'), allow_pickle=True)
train_labels = np.load(os.path.join('data', 'training_labels.npy'))
test_sentences = np.load(os.path.join('data', 'test_sentences.npy'), allow_pickle=True)
test_labels = np.load(os.path.join('data', 'test_labels.npy'))

In [2]:
def normalize_data(train_data, test_data, type=None):
    
    scaled_train = train_data
    scaled_test = test_data
    
    if type == 'standard':
        scaler = preprocessing.StandardScaler()
        scaler.fit(train_data)
        
        scaled_train = scaler.transform(train_data)
        scaled_test = scaler.transform(test_data)
        
    elif type == 'l1':
        scaled_train = train_data / np.expand_dims(np.sum(abs(train_data), axis=1), axis=1)
        scaled_test = test_data / np.expand_dims(np.sum(abs(test_data), axis=1), axis=1)
        
    elif type == 'l2':
        scaled_train = train_data / np.expand_dims(np.sqrt(np.sum(train_data ** 2, axis=1)), axis=1)
        scaled_test = test_data / np.expand_dims(np.sqrt(np.sum(test_data ** 2, axis=1)), axis=1)
        
    return scaled_train, scaled_test

In [9]:
class BagOfWords:
    
    def __init__(self):
        self.vocabulary = {}
        self.vocab_size = 0
        self.word_list = []
        
    def build_vocabulary(self, data):
        for sample in data:
            for word in sample:
                if word not in self.vocabulary.keys():
                    self.vocabulary[word] = self.vocab_size
                    self.vocab_size += 1
                    self.word_list.append(word)
                    
    def get_features(self, data):
        
        num_samples = np.shape(data)[0]
        features = np.zeros(shape=(num_samples, self.vocab_size))
        
        for sample_id, sample in enumerate(data):
            for word in sample:
                if word in self.vocabulary.keys():
                    features[sample_id][self.vocabulary[word]] += 1
                
        return features
                    

In [None]:
bow = BagOfWords()
bow.build_vocabulary(train_sentences)
x_train = bow.get_features(train_sentences)
x_test = bow.get_features(test_sentences)

normalized_x_train, normalized_x_test = normalize_data(x_train, x_test, 'l2')

In [14]:
svm_classifier = svm.SVC(C=1, kernel='linear')
svm_classifier.fit(normalized_x_train, train_labels)
predicted_labels = svm_classifier.predict(normalized_x_test)

# accuracy
print(svm_classifier.score(test_labels))

print(f1_score(np.asarray(test_labels), predicted_labels))

coefs = np.squeeze(np.asarray(svm_classifier.coef_))
indexes = np.argsort(coefs)


ValueError: Input X contains NaN.
SVC does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values