In [1]:
import sys, os, random, re
import numpy as np
import pickle as pkl
from featurizer import LexicalFeaturizer
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer

from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [2]:
def n_grams(texts, low, high):
    """Count word n-grams in ``texts`` and print the learned vocabulary.

    Parameters
    ----------
    texts : iterable of str
        Documents to vectorize; each one becomes a row of the result.
    low, high : int
        Inclusive lower and upper n-gram sizes, passed to
        ``CountVectorizer(ngram_range=(low, high))``.

    Returns
    -------
    numpy.ndarray
        Dense integer matrix with one row per document and one column per
        vocabulary entry, in the same order as the printed feature names.
    """
    n_vectorizer = CountVectorizer(ngram_range=(low, high))
    counts = n_vectorizer.fit_transform(texts)

    # ``get_feature_names`` was removed from recent scikit-learn releases in
    # favour of ``get_feature_names_out``; fall back for older installations.
    if hasattr(n_vectorizer, 'get_feature_names_out'):
        feature_names = n_vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = n_vectorizer.get_feature_names()

    # Function-call form works on both Python 2 and Python 3.
    print(feature_names)
    return counts.toarray().astype(int)

In [3]:
# Vectorize each toy document on its own first (only the printed vocabulary
# is of interest there), then both together so the joint count matrix is
# displayed as the cell result.
for single_doc_corpus in (['dog dog went on'], ['dog went by']):
    n_grams(single_doc_corpus, 1, 2)
n_grams(['dog went by', 'dog dog went on'], 1, 2)

[u'dog', u'dog dog', u'dog went', u'on', u'went', u'went on']
[u'by', u'dog', u'dog went', u'went', u'went by']
[u'by', u'dog', u'dog dog', u'dog went', u'on', u'went', u'went by', u'went on']


array([[1, 1, 0, 1, 0, 1, 1, 0],
       [0, 2, 1, 1, 1, 1, 0, 1]])

## Data Parsing

In [4]:
# Root of the ICNALE written-essay corpus, relative to the notebook's working directory.
data_dir = '../data/ICNALE_Written_Essays_2.3'
# Directory of plain-text files; this is the one listed and read by parse_merged_plain_v1.
merged_plain_dir = '{}/Merged/Plain Text'.format(data_dir)
# Directory of the tagged variant; not referenced by any of the visible cells.
merged_tagged_dir = '{}/Merged/Tagged'.format(data_dir)

### Define Classes

In [5]:
# Maps the level suffix of a corpus file name (4th and 5th '_'-separated
# fields, joined by '_') to an integer class id. Class id 0 is not listed
# here: the parser assigns it to files whose name has only four fields.
level_mapping = {
    'A2_0': 4,
    'B1_1': 3,
    'B1_2': 2,
    'B2_0': 1,
}

In [6]:
# Separator characters used to split raw text into words: slash, period,
# exclamation mark, comma, question mark and any whitespace. A raw string is
# used so that ``\s`` reaches the regex engine without relying on Python
# tolerating an unknown string escape.
punct_regex = re.compile(r"[/.!,?\s]")
# Matches a single comma; not referenced by any of the visible cells.
grammar_regex = re.compile(r"[,]")

### Feature extraction

In [7]:
def _iter_samples(file_path):
    """Yield every non-blank line of ``file_path`` without its trailing newline.

    The file is decoded as ``utf-8-sig``. Lines consisting solely of a newline
    character are skipped.
    """
    with open(file_path, 'r', encoding='utf-8-sig') as handle:
        for line in handle:
            if line == '\n':
                continue
            yield line.strip('\n')


def _level_from_file_name(file_name):
    """Return the integer class id encoded in an extension-less file name.

    Names with exactly four '_'-separated fields map to class 0; otherwise the
    4th and 5th fields are joined with '_' and looked up in ``level_mapping``.
    """
    attributes = file_name.split('_')
    if len(attributes) == 4:
        return 0
    return level_mapping['{}_{}'.format(attributes[3], attributes[4])]


def parse_merged_plain_v1(max_word_features=150, count_scale=100.0):
    """Build one feature vector and one label per sample in ``merged_plain_dir``.

    Every non-blank line of every file is treated as one sample. A sample's
    vector is the output of ``LexicalFeaturizer.featurize`` followed by the
    scaled corpus frequency of each of its words, in order of appearance.

    Parameters
    ----------
    max_word_features : int, optional
        Number of word-frequency slots per sample. Longer samples are
        truncated and shorter ones are zero-padded, so every sample gets the
        same number of word-frequency features.
    count_scale : float, optional
        Divisor applied to raw unigram counts.

    Returns
    -------
    data : list of numpy.ndarray
        One feature vector per sample.
    labels : list of int
        Class id of the file each sample came from, aligned with ``data``.
    """
    featurizer = LexicalFeaturizer()

    # Pass 1: read the corpus once and count unigrams over ALL files, so that
    # a sample's frequency features do not depend on how many files happened
    # to be processed before it.
    corpus = []
    unigram_dict = Counter()
    for path in sorted(os.listdir(merged_plain_dir)):
        # splitext tolerates names that do not contain exactly one dot.
        file_name = os.path.splitext(path)[0]
        level = _level_from_file_name(file_name)
        samples = list(_iter_samples(os.path.join(merged_plain_dir, path)))
        corpus.append((level, samples))

        for sample in samples:
            for token in sample.split():
                # Only the first non-empty piece of each whitespace-delimited
                # token is counted.
                pieces = [w for w in punct_regex.split(token.lower()) if w]
                if pieces:
                    unigram_dict[pieces[0]] += 1

    # Pass 2: featurize every sample against the corpus-wide counts.
    data = []
    labels = []
    for level, samples in corpus:
        for sample in samples:
            # NOTE(review): assumes featurize() returns a list so it can be
            # concatenated with the word features below - confirm.
            p_features = featurizer.featurize(sample)

            word_features = []
            for word in punct_regex.split(sample):
                if not word:
                    continue
                count = unigram_dict[word.lower()]
                if count != 0:
                    # Float divisor keeps the result identical on Python 2 and 3.
                    word_features.append(count / count_scale)

            # Fixed width: truncate, then pad with zeros.
            word_features = word_features[:max_word_features]
            word_features.extend([0.0] * (max_word_features - len(word_features)))

            # TODO: add avg_sent_len and number of sentences
            data.append(np.array(p_features + word_features))
            labels.append(level)
    return data, labels
data, labels = parse_merged_plain_v1()

IOError: [Errno 2] No such file or directory: '../data/common_freq.csv'

### Trim class distribution

In [None]:
# Cap every class at MAX_PER_LABEL samples so no class dominates training.
MAX_PER_LABEL = 400

# Group samples by class id, keeping their original order within each class.
data_by_label = {}
for sample, label in zip(data, labels):
    data_by_label.setdefault(label, []).append(sample)

trimmed_data = []
trimmed_labels = []
for label, members in data_by_label.items():
    kept = members[:MAX_PER_LABEL]
    trimmed_data.extend(kept)
    # Emit exactly one label per kept sample; a class with fewer than
    # MAX_PER_LABEL samples must not receive surplus labels.
    trimmed_labels.extend([label] * len(kept))

assert len(trimmed_data) == len(trimmed_labels)

### Split into train/val/test

In [None]:
# Fixed seed so the same train/validation/test partition is produced on every
# Restart & Run All.
SPLIT_SEED = 42

# First hold out 20% of the data as the test set ...
X_trainval, X_test, y_trainval, y_test = train_test_split(
    trimmed_data, trimmed_labels, test_size=0.20, random_state=SPLIT_SEED)
# ... then hold out 20% of the remainder as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.20, random_state=SPLIT_SEED)

## Logistic Regression

In [None]:
# Multinomial logistic regression trained with the SAGA solver. SAGA shuffles
# the training data, so random_state is fixed to make the fitted model
# reproducible across runs.
clf_logreg = LogisticRegression(solver="saga",
                         multi_class="multinomial",
                         max_iter=1000,
                         random_state=42,
                         verbose=1)
clf_logreg.fit(X_train, y_train)

In [None]:
# Display names indexed by class id (0 = Native, 1..4 as in level_mapping).
labels_names = ['Native', 'B2_0', 'B1_2', 'B1_1', 'A2_0']
# Pin the label set so confusion-matrix rows and per-class F1 scores always
# line up with labels_names, even if a class is missing from a split.
label_ids = list(range(len(labels_names)))

y_pred_train = clf_logreg.predict(X_train)
y_pred = clf_logreg.predict(X_val)

report_splits = [
    ('TRAIN', y_train, y_pred_train),
    ('VALIDATION', y_val, y_pred),
]
for split_index, (split_name, y_true, y_hat) in enumerate(report_splits):
    if split_index > 0:
        # Visual gap between the two reports.
        print('\n')

    print(split_name)
    print(confusion_matrix(y_true, y_hat, labels=label_ids))
    scores = f1_score(y_true, y_hat, labels=label_ids, average=None)

    print()
    print('F1 VALUES')
    for label_name, score in zip(labels_names, scores):
        print('{0:}:\t {1:.2f}'.format(label_name, score))
    # The macro average is left over the labels actually present in the split.
    print('Total:\t {0:.2f}'.format(f1_score(y_true, y_hat, average='macro')))

In [None]:
# Held-out evaluation of the logistic-regression model: confusion matrix
# followed by the macro-averaged F1 score.
print('TEST')
y_pred = clf_logreg.predict(X_test)
test_confusion = confusion_matrix(y_test, y_pred)
print(test_confusion)
test_macro_f1 = f1_score(y_test, y_pred, average='macro')
print('F1: {}'.format(test_macro_f1))

## Decision Tree

In [None]:
# Depth-limited decision tree. Features are randomly permuted at each split,
# so random_state is fixed to make the fitted tree reproducible across runs.
clf_dectree = tree.DecisionTreeClassifier(max_depth=10, random_state=42)
clf_dectree.fit(X_train, y_train)

In [None]:
# Display names indexed by class id (0 = Native, 1..4 as in level_mapping).
labels_names = ['Native', 'B2_0', 'B1_2', 'B1_1', 'A2_0']
# Pin the label set so confusion-matrix rows and per-class F1 scores always
# line up with labels_names, even if a class is missing from a split.
label_ids = list(range(len(labels_names)))

y_pred_train = clf_dectree.predict(X_train)
y_pred = clf_dectree.predict(X_val)

report_splits = [
    ('TRAIN', y_train, y_pred_train),
    ('VALIDATION', y_val, y_pred),
]
for split_index, (split_name, y_true, y_hat) in enumerate(report_splits):
    if split_index > 0:
        # Visual gap between the two reports.
        print('\n')

    print(split_name)
    print(confusion_matrix(y_true, y_hat, labels=label_ids))
    scores = f1_score(y_true, y_hat, labels=label_ids, average=None)

    print()
    print('F1 VALUES')
    for label_name, score in zip(labels_names, scores):
        print('{0:}:\t {1:.2f}'.format(label_name, score))
    # The macro average is left over the labels actually present in the split.
    print('Total:\t {0:.2f}'.format(f1_score(y_true, y_hat, average='macro')))

In [None]:
# Held-out evaluation of the decision tree: confusion matrix, per-class F1
# and macro-averaged F1.
print('TEST')
# Pin the label set so matrix rows and per-class scores line up with
# labels_names even if a class is missing from the test split.
label_ids = list(range(len(labels_names)))
y_pred = clf_dectree.predict(X_test)
print(confusion_matrix(y_test, y_pred, labels=label_ids))
scores = f1_score(y_test, y_pred, labels=label_ids, average=None)
print()
print('F1 VALUES')
for label_name, score in zip(labels_names, scores):
    print('{0:}:\t {1:.2f}'.format(label_name, score))
# The macro average is left over the labels actually present in the split.
print('Total:\t {0:.2f}'.format(f1_score(y_test, y_pred, average='macro')))