# Homework 4 - LING 6300

## **Karan Praharaj**

In [9]:
import glob
import os
import random
from collections import defaultdict

import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from unidecode import unidecode

For our training features, we will extract character n-grams (bigrams and trigrams) from the words in our datasets. This method of feature extraction is not linguistically rich; however, it can be argued that even something as simple as learning the patterns of character sequences can make for a decent naive model. This is just a hypothesis, and we will revisit this approach if the performance is less than satisfactory.

For example, the features generated from the word 'karan' would be:

_'kar ara ran ka ar ra an'_

We will join these constituent patterns into a single space-separated string, and treat it like a sentence of 'words'.

In [2]:
## Feature extraction

def constituents(word):
    """Return the character n-grams of `word` as one space-separated string.

    All trigrams come first (left to right), followed by all bigrams, e.g.
    'karan' -> 'kar ara ran ka ar ra an'. A word shorter than two characters
    produces an empty string.
    """
    pieces = [
        word[start:start + width]
        for width in (3, 2)  # trigrams first, then bigrams
        for start in range(len(word) - width + 1)
    ]
    return ' '.join(pieces)

In [3]:
# Build the central store for all our data: one pandas DataFrame per dataset file.

def generate_table(data_dir="hw4data"):
    """Load every tab-separated ``*.txt`` file in `data_dir` into a DataFrame.

    Each file is read without a header row into two columns, ``label`` and
    ``word``. The ``word`` column is lower-cased and a ``word_constituents``
    column is added holding ``constituents(word)`` for every row.

    Parameters
    ----------
    data_dir : str, optional
        Directory searched for ``*.txt`` files. Defaults to ``"hw4data"``.

    Returns
    -------
    dict
        Maps each file's base name without its extension (e.g. ``"de_v"``
        for ``hw4data/de_v.txt``) to its DataFrame. Empty if no file matches.
    """
    tables = {}
    for path in glob.glob(os.path.join(data_dir, "*.txt")):
        # basename/splitext cope with both '/' and '\\' separators, which
        # splitting the path on '/' does not.
        name = os.path.splitext(os.path.basename(path))[0]
        frame = pd.read_csv(path, sep='\t', header=None, names=['label', 'word'])

        # Lower-casing and word constituent generation.
        frame['word'] = frame['word'].str.lower()
        frame['word_constituents'] = frame['word'].apply(constituents)
        tables[name] = frame

    return tables

data = generate_table()

In [4]:
corpus = {}

def generate_corpora():
    """Fill the module-level ``corpus`` dict from ``data`` and return it.

    For every dataset name in ``data``, the frame's ``word_constituents``
    column is stored as a plain Python list under the same name.
    """
    corpus.update(
        (name, list(frame['word_constituents'])) for name, frame in data.items()
    )
    return corpus

corpus = generate_corpora()

In [18]:
# Split every dataset 90/10 into train and test portions, then turn the
# n-gram "sentences" into count features.

# Per-dataset containers, all keyed by dataset name.
X = {}
y = {}
X_train = {}
y_train = {}
X_train_cts = {}
X_test = {}
y_test = {}
X_test_cts = {}
vectorizer = {}

def data_splitter():
    """Populate the module-level split and feature dicts for every dataset.

    For each name in ``data``: the n-gram strings from ``corpus`` and the
    ``label`` column are split 90/10 with a fixed ``random_state``; a fresh
    ``CountVectorizer`` is fitted on the training portion only and then
    applied to the test portion.

    Returns the tuple ``(X_train_cts, y_train, X_test_cts, y_test)``.
    """
    for name, frame in data.items():
        X[name] = corpus[name]
        y[name] = frame['label']

        parts = train_test_split(X[name], y[name], train_size=0.90, test_size=0.10, random_state=101)
        X_train[name], X_test[name], y_train[name], y_test[name] = parts

        counter = CountVectorizer()
        vectorizer[name] = counter
        X_train_cts[name] = counter.fit_transform(X_train[name])
        X_test_cts[name] = counter.transform(X_test[name])

    return X_train_cts, y_train, X_test_cts, y_test

# NOTE: from here on X_train / X_test refer to the vectorized count matrices,
# not the raw n-gram strings.
X_train, y_train, X_test, y_test = data_splitter()
    

In [7]:
## Sanity check: each dataset must have exactly one training label per training feature row.

for name in X_train:
    n_rows = X_train[name].shape[0]
    n_labels = y_train[name].shape[0]
    print(f"DATASET: {name}")
    print("Training features shape: ", X_train[name].shape)
    print("Training labels shape: ", y_train[name].shape)
    print("==========================================")
    # Fail loudly rather than relying on someone eyeballing the printed shapes.
    assert n_rows == n_labels, f"{name}: {n_rows} feature rows but {n_labels} labels"

DATASET: de_v
Training features shape:  (1643, 2545)
Training labels shape:  (1643,)
DATASET: es_v
Training features shape:  (3469, 2595)
Training labels shape:  (3469,)
DATASET: fi_v
Training features shape:  (6344, 3461)
Training labels shape:  (6344,)
DATASET: fi_na
Training features shape:  (5399, 4402)
Training labels shape:  (5399,)
DATASET: de_n
Training features shape:  (2307, 3780)
Training labels shape:  (2307,)


**We will use LinearSVC as our first choice of model. If it ends up not delivering a good enough performance, we will switch to Multinomial Naive Bayes, or some type of Kernelized SVM.**

In [17]:
# Train one LinearSVC per dataset, then report its accuracy on that dataset's test split.

clf = {}
print("================ TEST ACCURACY by dataset ================")
for i in data:
    clf[i] = LinearSVC(random_state=42, tol=1e-5, max_iter=1000, C=0.1)
    clf[i].fit(X_train[i], y_train[i])
    y_pred = clf[i].predict(X_test[i])
    # accuracy_score's documented signature is (y_true, y_pred). Accuracy is
    # symmetric, so the number is unchanged, but the correct order keeps this
    # line safe to adapt to asymmetric metrics later.
    print(i, ":", accuracy_score(y_test[i], y_pred))

de_v : 0.7868852459016393
es_v : 0.9326424870466321
fi_v : 0.9404255319148936
fi_na : 0.7216666666666667
de_n : 0.7042801556420234


**Even with a relatively naive approach, our accuracies are pretty respectable, especially when compared to the state-of-the-art scores of 80% for _de_n_ and 99% for _es_v_.**