In [1]:
import sys, os, random, re
import numpy as np
import pickle as pkl
from featurizer import LexicalFeaturizer
from sklearn import tree
from sklearn.metrics import confusion_matrix
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

## Data Parsing

In [2]:
data_dir = '../data/ICNALE_Written_Essays_2.3'
merged_plain_dir = '{}/Merged/Plain Text'.format(data_dir)
merged_tagged_dir = '{}/Merged/Tagged'.format(data_dir)

### Define Classes

In [125]:
level_mapping = {
    'A2_0': 4,
    'B1_1': 3,
    'B1_2': 2,
    'B2_0': 1,
}

In [157]:
punct_regex = re.compile("[/.!,?\s]")
grammar_regex = re.compile("[,]")

### Feature extraction

In [158]:
def parse_merged_plain_v1():
    script_length_dict = {} 
    
    unigram_dict = Counter()
    unigram_POS = Counter()

    bigram_dict = Counter()
    bigram_POS = Counter()    
    
    data = []
    labels = []
    featurizer = LexicalFeaturizer()
    
    # Begins reading the merged plain file
    for path in sorted(os.listdir(merged_plain_dir)):
        file_name, file_ext = path.split('.')
        attributes = file_name.split('_')

        if len(attributes) == 4:
            level = 0
        else:
            level = level_mapping['{}_{}'.format(attributes[3], attributes[4])]
            
        sample_counter = 0
        sample_avg_words = 0
        with open('{}/{}'.format(merged_plain_dir, path), 'r', encoding='utf-8-sig') as file:
            for sample in file:
                if sample == '\n': continue
                sample = sample.strip('\n')
                
                sample_words = sample.split()
                paragraph_len = len(sample_words)
                sample_avg_words += paragraph_len
                paragraph_gram_len = len(sample_words) + 1
                for i in range(paragraph_len):
                    cur_word = sample_words[i].lower()
                    cur_word = [ w for w in punct_regex.split(cur_word) if w]
                    if len(cur_word) <= 0:
                        continue
                    else:
                        cur_word = cur_word[0]
                    unigram_dict[cur_word] += 1
                    
                    if i == 0:
                        bigram = "<s>"
                    else:
                        bigram = sample_words[i - 1].lower()
                    bigram += " " + cur_word
                    bigram_dict[bigram] += 1
                    if i == paragraph_len - 1:
                        final_bigram = cur_word + " </s>"
                        bigram_dict[final_bigram] += 1

                script_length_dict[file_name+str(sample_counter)] = (paragraph_len, paragraph_gram_len)
                sample_counter += 1

        with open('{}/{}'.format(merged_plain_dir, path), 'r', encoding='utf-8-sig') as file:
            for sample in file:
                if sample == '\n': continue
                sample = sample.strip('\n')

                p_features = featurizer.featurize(sample)
                word_features = []
                sample_words = sample.split()
                words = [ w for w in punct_regex.split(sample) if w]
                paragraph_len = len(words)
                most_common = Counter(words).most_common(200)
                for i in range(paragraph_len):
                    cur_word = words[i]
                    count = unigram_dict[cur_word.lower()]
                    if count != 0:
                        word_features.append(count/100)
                data.append(np.array(word_features[:150] + p_features)) #TODO: add avg_sent_len and number of sentence))
                labels.append(level)
    return data, labels
data, labels = parse_merged_plain_v1()

### Trim class distribution

In [159]:
data_by_label = {}
for i in range(len(data)):
    if labels[i] not in data_by_label:
        data_by_label[labels[i]] = [data[i]]
    else:
        data_by_label[labels[i]].append(data[i])
trimmed_data = []
trimmed_labels = []
for label in data_by_label:
    trimmed_data.extend(data_by_label[label][:400])
    trimmed_labels.extend([label for i in range(400)])

### Split into train/val/test

In [162]:
X_train, X_test, y_train, y_test = train_test_split(
            trimmed_data, trimmed_labels, test_size=0.20)
X_train, X_val, y_train, y_val = train_test_split(
            X_train, y_train, test_size=0.20)

# Logistic Regression

In [163]:
clf_logreg = LogisticRegression(solver="saga", 
                         multi_class="multinomial", 
                         max_iter=10000,
                         verbose=1)
clf_logreg.fit(X_train, y_train)

convergence after 6739 epochs took 42 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   41.3s finished


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=10000, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='saga',
          tol=0.0001, verbose=1, warm_start=False)

In [164]:
print('TRAIN')
y_pred_train = clf_logreg.predict(X_train)
print(confusion_matrix(y_train, y_pred_train))
print('F1: {}'.format(f1_score(y_train, y_pred_train, average='macro')))
print('VAL')
y_pred = clf_logreg.predict(X_val)
print(confusion_matrix(y_val, y_pred))
print('F1: {}'.format(f1_score(y_val, y_pred, average='macro')))

TRAIN
[[129  15  34  38  43]
 [ 10 221  11   4   6]
 [ 32  17 104  53  46]
 [ 29  12  34 140  43]
 [ 18  30  13  84 114]]
F1: 0.5489325353996829
VAL
[[10  8 18 12 25]
 [ 6 30  9  3 10]
 [10  7 19 23 12]
 [ 6  5 12 21 13]
 [12  6  7 20 16]]
F1: 0.3038135793734537


In [168]:
print('TEST')
y_pred = clf_dectree.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print('F1: {}'.format(f1_score(y_test, y_pred, average='macro')))

TEST
[[45  1 12  5  5]
 [ 3 55 16 12  4]
 [13  6 34 19  5]
 [ 8  7 12 51  7]
 [ 5  2  6 10 57]]
F1: 0.6064224394911235


## Decision Tree

In [165]:
clf_dectree = tree.DecisionTreeClassifier(max_depth=10)
clf_dectree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [166]:
print('TRAIN')
y_pred_train = clf_dectree.predict(X_train)
print(confusion_matrix(y_train, y_pred_train))
print('F1: {}'.format(f1_score(y_train, y_pred_train, average='macro')))
print('VAL')
y_pred = clf_dectree.predict(X_val)
print(confusion_matrix(y_val, y_pred))
print('F1: {}'.format(f1_score(y_val, y_pred, average='macro')))

TRAIN
[[240   3  12   4   0]
 [  4 234   9   4   1]
 [ 10   3 229  10   0]
 [ 14  10  10 223   1]
 [  6   4   7  10 232]]
F1: 0.9050669374580924
VAL
[[40  7 18  7  1]
 [ 5 36 12  3  2]
 [10  7 32 15  7]
 [ 4  4 12 35  2]
 [ 5  1  5  6 44]]
F1: 0.5938037714585331


In [167]:
print('TEST')
y_pred = clf_dectree.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print('F1: {}'.format(f1_score(y_test, y_pred, average='macro')))

TEST
[[45  1 12  5  5]
 [ 3 55 16 12  4]
 [13  6 34 19  5]
 [ 8  7 12 51  7]
 [ 5  2  6 10 57]]
F1: 0.6064224394911235
