In [55]:
import sys, os, random, re
import numpy as np
import pickle as pkl
from featurizer import LexicalFeaturizer
from sklearn import tree
from sklearn.metrics import confusion_matrix
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression


# import tensorflow.contrib.learn as skflow

In [5]:
data_dir = '../data/ICNALE_Written_Essays_2.3'
merged_plain_dir = '{}/Merged/Plain Text'.format(data_dir)
merged_tagged_dir = '{}/Merged/Tagged'.format(data_dir)

In [6]:
level_mapping = {
    'A2': 3,
    'B1': 2,
    'B2': 1,
}

In [23]:
unigram_dict = Counter()
unigram_POS = Counter()

bigram_dict = Counter()
bigram_POS = Counter()

script_length_dict = {}   #should have actual and POS one (POS adds an extra length whenever a sentence ends or grammar)

punct_regex = re.compile("[/.!,?\s]")  # end of sentence
grammar_regex = re.compile("[,]")

In [92]:
def parse_merged_plain_v1():
    data = []
    labels = []
    featurizer = LexicalFeaturizer()
    
    # Begins reading the merged plain file
    for path in sorted(os.listdir(merged_plain_dir)):
        file_name, file_ext = path.split('.')
        attributes = file_name.split('_')

        if attributes[3] in level_mapping:
            level = level_mapping[attributes[3]]
        else: 
            level = 0
            
#         if level == 1: print(path)
        sample_counter = 0
        sample_avg_words = 0
        with open('{}/{}'.format(merged_plain_dir, path), 'r', encoding='utf-8-sig') as file:
            for sample in file:
                if sample == '\n': continue
                sample = sample.strip('\n')
                
                sample_words = sample.split()
                paragraph_len = len(sample_words)
                sample_avg_words += paragraph_len
                paragraph_gram_len = len(sample_words) + 1
                for i in range(paragraph_len):
                    cur_word = sample_words[i].lower()
                    cur_word = [ w for w in punct_regex.split(cur_word) if w]
                    if len(cur_word) <= 0:
                        continue
                    else:
                        cur_word = cur_word[0]
#                     if punct_regex.search(cur_word):
#                         paragraph_gram_len += 3
#                     elif grammar_regex.search(cur_word):
#                         paragraph_gram_len += 1
                    unigram_dict[cur_word] += 1
                    
                    if i == 0:
                        bigram = "<s>"
                    else:
                        bigram = sample_words[i - 1].lower()
                    bigram += " " + cur_word
                    bigram_dict[bigram] += 1
                    if i == paragraph_len - 1:
                        final_bigram = cur_word + " </s>"
                        bigram_dict[final_bigram] += 1

                script_length_dict[file_name+str(sample_counter)] = (paragraph_len, paragraph_gram_len)
                sample_counter += 1
#         print(sample_counter)
#         print(sample_avg_words)
#         print(sample_avg_words/sample_counter)

        with open('{}/{}'.format(merged_plain_dir, path), 'r', encoding='utf-8-sig') as file:
            for sample in file:
                if sample == '\n': continue
                sample = sample.strip('\n')
                
                p_features = featurizer.featurize(sample)
                word_features = []
                sample_words = sample.split()
                paragraph_len = len(sample_words)
                words = [ w for w in punct_regex.split(sample) if w]
                most_common = Counter(words).most_common(20)
                for i in range(len(most_common)):
                    word, count = most_common[i]
                    denom = unigram_dict[word.lower()]
                    if denom == 0:
                        print("\n<<<<<<<-----------------")
                        print(unigram_dict[word.lower()])
                        print(word.lower())
                        word_features.append(1)
                        print("\n\n")
                    else:
                        word_features.append(count/denom + 1)
                data.append(np.array(p_features))# + p_features))  #TODO: add avg_sent_len and number of sentence))
                labels.append(level)

            
    return data, labels

In [26]:
f1_scorer = make_scorer(f1_score, average="macro")

In [96]:
# data, labels = parse_merged_plain_v1()
X_train, X_test, y_train, y_test = train_test_split(
            data, labels, test_size=0.33, random_state=42)

# parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
# parameters = {'max_depth': range(10,50, 10), 'splitter': ('best', 'random')}
# clf = GridSearchCV(LogisticRegression(), parameters, scoring=f1_scorer)
clf = LogisticRegression(solver="saga", multi_class="multinomial", max_iter=100, verbose=1)
# clf = tree.DecisionTreeClassifier(max_depth=8)
clf.fit(X_train, y_train)

max_iter reached after 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='saga',
          tol=0.0001, verbose=1, warm_start=False)

In [97]:
# y_true = labels
# cnt = Counter(labels)
# print(cnt)
# print(clf.get_params(deep=False))
y_pred_train = clf.predict(X_train)
print(confusion_matrix(y_train, y_pred_train))
# print(clf.best_params_)
print(f1_score(y_train, y_pred_train, average='macro'))

[[   1    0  276    0]
 [   0    0  299    0]
 [   0    0 2378    1]
 [   0    0  675    0]]
0.19973430278583854


  'precision', 'predicted', average, warn_for)


In [98]:
# y_true = labels
# cnt = Counter(labels)
# print(cnt)
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(f1_score(y_test, y_pred, average='macro'))

[[   3    0  120    0]
 [   0    0  165    0]
 [   1    0 1165    0]
 [   0    0  334    1]]
0.21075674597878657


  'precision', 'predicted', average, warn_for)


In [None]:
clf.predict([featurizer.featurize("I agree that it is important for college students to have apart time job. Nowadays, a large number of college students are having a part time job. Some of them hold that part time jobs can help them to adapt to the society well and give them many experiences. Take a friend of mine for example, when Lily was a college student, she went to supermarket as a promoter or went to be a family teacher every weekend. Then owing to her experiences, she quickly got a good job after graduating from university. in the other students' opinions, they think that they can buy goods they want their parents cannot afford. I know a student that he goes to a restaurant as a waiter at his part time to earn enough money what a computer needs. Thus, it is important for college students to have a part time job. However, our parents always don't agree us to get part time jobs. They are afraid that our study and safety. In my opinion, having a good part time job is good for students. We should pay attention to the advantages of part time jobs and make the most of them. Meanwhile, we should learn to get knowledge from the part time jobs and make them a helpful tool for our development.")])

In [None]:
sample = "In my opinion, I am strongly agree with the idea that, it is important for our college students to have a part-time job. Now we can see everywhere that student take part –jobs like private teacher, seller and so on. So why they do these jobs? What advantages benefit us? I think the main reason is money. With the improvement of our living standards, a lot of study material are expensive than before. Then the student has to do some jobs to reduce considerable financial pressure. By doing so, we can also have the ability to travel or buy some items we like. We also hope that through this way, we can no longer dependent on our parents. In addition, we can also accumulate some social experience. From kindergarten to high school, what we learned is totally the knowledge from books. In this way, it does a lot benefits to our future jobs. In a word, there are many advantages for our college students to do some jobs. Not only make money, but also develop our independence. So if you a lot of free time after class, try to look for a part-time job and you will not regret about it."
sampleb2 = "Whether college students should take part-time jobs has aroused great public concern. As far as I'm concerned, part-time job plays an increasing important role in college life and I'm for the idea. Nevertheless, several people argue that it is a waste of time to take a part-time job and obviously, college students ought to put more emphasis on their study. In addition, part-time job not only takes up much of their free time but also means little to them. On the contrary, others think that it is acceptable, reasonable, and beneficial to take a part-time job. The reasons are as follows. First of all, it is undeniable that college students can accumulate certain experience through part-time jobs. Then, as a result of doing part-time jobs, we can earn some pocket money in order to reduce parents' burden. What's more, admittedly, if you do some more work at your free time, you will find life is not so dull and you can do something meaningful to enjoy your college life. In a word, from my perspective, part-time job, to some extent, is indispensable to our college life. Regardless of other people's ideas, I support my own opinion and I think it is essential for college students to take part-time job."
beginner = "My family is big family. I have one elder sister and two younger sister. My elder sister is nurse. She very friendly. She have a one’s child. He name is Jong Youn. Jong Youn is very cute, so we are very happy. Jong Youn is very smiling. He looks good. My one’s younger sister draw very well. She’s name is Su Jeong. Su Jeong is small body. She dream is artist so she is everyday draw. My one’s younger sister name is Suhyun. She is 16 years old. She is student. So everyday study. My father is very busy. My mother too. My mother everyday cooking. Sometimes, my father help them. I have a my husband."
intermediate = "My home country is Japan, and I was born Tokyo in Japan. Tokyo is a capital city in Japan, so there are many people live in and enjoy their life. I’m Japanese, but I don’t know much about Japan, because I hate history, so I talk about the capital city of Tokyo. My hometown was a small country city, and there were nothing except for a small market and a convenience store. Also, the convenience store was far from my house, it took 30 minutes walk by myself. My friends and me were always played our school ground or a park that near from our house. I think, Tokyo is very famous for have a many entertainments. However, that is only center city of Tokyo. When I tell people who is from other countries, about my countries, they said Tokyo!! I know Tokyo. Although, that is totally different that their image or thought of Tokyo."
advanced = "I don’t usually drive to the campus, but the other day I woke up really late and I was going to miss my class. I took my morning shower, put on my clothes in five minutes and jumped into my car. After three minutes I arrived to the parking lot next to the Butler building. As expected, it was totally jammed. After circulating the area for more than ten times I managed to squeeze my car between a Mustang and a truck. After class I wanted to go out for a break. I started backing of the parking space looking to left. I just did not want to scratch that beautiful Mustang on my left side. And while I’m staring at it. All of a sudden I heard a crack sound. I looked to right to see that my right side mirror was totally in the truck taillight. I panicked for a moment. That was my first accident."
unique = "In my opinion, I am strongly agree with the idea that,"

In [None]:
featurizer = LexicalFeaturizer()
featurizer.featurize(beginner)

In [None]:
featurizer.featurize(intermediate)

In [None]:
featurizer.featurize(advanced)

In [None]:
featurizer.featurize(sample)

In [None]:
featurizer.featurize(sampleb2)

In [None]:
featurizer.featurize(unique)

In [None]:
import re, string
pattern = re.compile('([^\s\w]|_)+')
print(re.sub(pattern, '', unique))

In [None]:
print(len("I a b c d e g h i j k l m n o p q r s t u v w z x".split()))