# A Handbuilt Soft-SVM 

## Creating the soft-SVM classifier

In [7]:
import numpy as np
import pandas as pd
from collections import defaultdict

def _char_ngrams(word, ngram_range):
    """
    Generate every character n-gram of a word for each length in the range.

    Parameters:
    word (str): The word to slice into n-grams.
    ngram_range (tuple): Inclusive (lower, upper) bounds on the n-gram length.

    Returns:
    list of str: N-grams ordered by length, then by start position. Repeated
                 n-grams are kept; a word shorter than a length yields none of
                 that length.
    """
    lower, upper = ngram_range
    return [
        word[start:start + n]
        for n in range(lower, upper + 1)
        for start in range(len(word) - n + 1)
    ]


def _build_vocabulary(words, ngram_range):
    """
    Assign a column index to every distinct n-gram found in the given words.

    Parameters:
    words (list of str): Words to collect n-grams from.
    ngram_range (tuple): Inclusive (lower, upper) bounds on the n-gram length.

    Returns:
    dict: Mapping from n-gram to column index, numbered in first-seen order.
    """
    vocab = {}
    for word in words:
        for ngram in _char_ngrams(word, ngram_range):
            # setdefault leaves an existing entry alone, so each n-gram keeps
            # the index it received the first time it was seen.
            vocab.setdefault(ngram, len(vocab))
    return vocab


def _featurize(words, vocab, ngram_range):
    """
    Encode words as a binary n-gram presence matrix.

    Parameters:
    words (list of str): Words to encode.
    vocab (dict): Mapping from n-gram to column index.
    ngram_range (tuple): Inclusive (lower, upper) bounds on the n-gram length.

    Returns:
    numpy.ndarray: Float matrix of shape (number of words, size of vocabulary)
                   holding 1 where the word contains the column's n-gram and 0
                   elsewhere. N-grams missing from the vocabulary are ignored.
    """
    features = np.zeros((len(words), len(vocab)))
    for row, word in enumerate(words):
        for ngram in _char_ngrams(word, ngram_range):
            column = vocab.get(ngram)
            if column is not None:
                features[row, column] = 1
    return features


def _train_soft_svm(X_train, y_train, epochs, learning_rate):
    """
    Fit a linear soft-SVM weight vector by per-sample subgradient descent.

    Parameters:
    X_train (numpy.ndarray): Feature matrix, one row per training sample.
    y_train (numpy.ndarray): Targets, +1 or -1, one per row of X_train.
    epochs (int): Number of full passes over the training samples.
    learning_rate (float): Step size applied to every update.

    Returns:
    numpy.ndarray: Weight vector with one entry per feature column. It is all
                   zeros when there are no training samples.
    """
    C = 1  # Weight of the hinge-loss term
    lambda_ = 0.01  # Extra weight-scaling factor used when the margin is met
    n_samples = len(X_train)
    # shape[1] rather than len(X_train[0]) so an empty training set is allowed.
    w = np.zeros(X_train.shape[1])

    for _ in range(epochs):
        for x_i, y_i in zip(X_train, y_train):
            margin = y_i * np.dot(x_i, w)
            if margin < 1:
                # Margin violated: shrink the weights and step towards the sample.
                w = w - learning_rate * (2 * w / n_samples - C * y_i * x_i)
            else:
                # NOTE(review): `- lambda_ * w` offsets the `2 * w / n_samples`
                # shrinkage. Once n_samples exceeds 2 / lambda_ (200) the
                # combined factor is negative and this step grows the weights
                # instead of shrinking them. Left unchanged to keep the trained
                # model identical; confirm whether `+ lambda_ * w` (or no extra
                # term) was intended.
                subgradient = 2 * w / n_samples - lambda_ * w
                if margin == 1:
                    # Exactly on the margin the hinge term is still applied.
                    subgradient += -C * y_i * x_i
                w = w - learning_rate * subgradient
    return w


def classify(train_words, train_labels, test_words, *, ngram_range=(2, 5),
             epochs=1000, learning_rate=0.01):
    """
    Classify the given test words into 'spanish' or 'french' based on the training data.

    Each word is encoded as a binary vector marking which character n-grams of
    the training vocabulary it contains. A linear soft-SVM is then trained on
    those vectors with subgradient descent. Training words and labels are
    paired by position.

    Parameters:
    train_words (list of str): List of words in the training set.
    train_labels (list of str): List of labels for the training set. 'spanish'
                                is the positive class; any other value is
                                treated as the negative ('french') class.
    test_words (list of str): List of words in the test set to classify.
    ngram_range (tuple): Inclusive (lower, upper) bounds on the n-gram length.
    epochs (int): Number of passes over the training set.
    learning_rate (float): Step size of the subgradient descent.

    Returns:
    list of str: Predicted labels ('spanish' or 'french') for the test set, in
                 the order of test_words. A word whose score is exactly 0 is
                 labelled 'spanish'. That includes any word that shares no
                 n-gram with the training vocabulary, and therefore every word
                 when the training set is empty.

    Raises:
    ValueError: If train_words and train_labels differ in length.
    """
    train_words = list(train_words)
    train_labels = list(train_labels)
    test_words = list(test_words)
    if len(train_words) != len(train_labels):
        raise ValueError(
            "train_words and train_labels must have the same length "
            f"(got {len(train_words)} and {len(train_labels)})"
        )

    vocab = _build_vocabulary(train_words, ngram_range)
    X_train = _featurize(train_words, vocab, ngram_range)
    y_train = np.array([1 if label == 'spanish' else -1 for label in train_labels])

    w = _train_soft_svm(X_train, y_train, epochs, learning_rate)

    # Looking up a test word's own n-grams in the vocabulary marks exactly the
    # vocabulary n-grams that occur in the word, without testing every
    # vocabulary entry against every word.
    X_test = _featurize(test_words, vocab, ngram_range)

    # Predict the labels for the test set
    return ['spanish' if np.dot(row, w) >= 0 else 'french' for row in X_test]


## Example usage of the SVM

In [8]:
# Train on the labelled words in train.csv, then measure how many of a small
# hand-written list of labelled words the classifier gets right.
train = pd.read_csv('train.csv')

# Evaluation set: each word is paired with its true language.
labelled_words = [
    ("amigo", "spanish"), ("bonjour", "french"),
    ("casa", "spanish"), ("ecole", "french"),
    ("libro", "spanish"), ("chien", "french"),
    ("coche", "spanish"), ("fromage", "french"),
    ("mesa", "spanish"), ("jardin", "french"),
    ("mar", "spanish"), ("soleil", "french"),
    ("rapido", "spanish"), ("fleur", "french"),
    ("familia", "spanish"), ("ville", "french"),
    ("musica", "spanish"), ("lait", "french"),
    ("comida", "spanish"), ("rouge", "french"),
    ("playa", "spanish"), ("neige", "french"),
    ("salud", "spanish"), ("ami", "french"),
    ("verde", "spanish"), ("livre", "french"),
    ("gato", "spanish"), ("pomme", "french"),
    ("escuela", "spanish"), ("eau", "french"),
]
new_words = [word for word, _ in labelled_words]
new_labels = [language for _, language in labelled_words]

testing = classify(train['word'], train['label'], new_words)

assert len(testing) == len(new_labels)

# Accuracy is the fraction of predictions that equal the true label.
total_predictions = len(testing)
correct_predictions = sum(
    predicted == actual for predicted, actual in zip(testing, new_labels)
)
accuracy = correct_predictions / total_predictions

print("Accuracy:", accuracy)

Accuracy: 0.9333333333333333
