In [1]:
import pandas as pd
import numpy as np
from joblib import dump, load

from sklearn import metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, KFold
from sklearn.linear_model import ElasticNet, LinearRegression, LogisticRegression

from datetime import datetime


import spacy
from spacy.lang.en.stop_words import STOP_WORDS

import language_check
from string import punctuation

In [2]:
# kappa metric for measuring agreement of automatic to human scores
from sklearn.metrics import cohen_kappa_score, confusion_matrix

def kappa(y_true, y_pred, weights=None, allow_off_by_one=False):
    """
    From https://github.com/EducationalTestingService/skll/blob/master/skll/metrics.py
    
    Calculates the kappa inter-rater agreement between two the gold standard
    and the predicted ratings. Potential values range from -1 (representing
    complete disagreement) to 1 (representing complete agreement).  A kappa
    value of 0 is expected if all agreement is due to chance.
    In the course of calculating kappa, all items in ``y_true`` and ``y_pred`` will
    first be converted to floats and then rounded to integers.
    It is assumed that y_true and y_pred contain the complete range of possible
    ratings.
    This function contains a combination of code from yorchopolis's kappa-stats
    and Ben Hamner's Metrics projects on Github.
    Parameters
    ----------
    y_true : array-like of float
        The true/actual/gold labels for the data.
    y_pred : array-like of float
        The predicted/observed labels for the data.
    weights : str or np.array, optional
        Specifies the weight matrix for the calculation.
        Options are ::
            -  None = unweighted-kappa
            -  'quadratic' = quadratic-weighted kappa
            -  'linear' = linear-weighted kappa
            -  two-dimensional numpy array = a custom matrix of
        weights. Each weight corresponds to the
        :math:`w_{ij}` values in the wikipedia description
        of how to calculate weighted Cohen's kappa.
        Defaults to None.
    allow_off_by_one : bool, optional
        If true, ratings that are off by one are counted as
        equal, and all other differences are reduced by
        one. For example, 1 and 2 will be considered to be
        equal, whereas 1 and 3 will have a difference of 1
        for when building the weights matrix.
        Defaults to False.
    Returns
    -------
    k : float
        The kappa score, or weighted kappa score.
    Raises
    ------
    AssertionError
        If ``y_true`` != ``y_pred``.
    ValueError
        If labels cannot be converted to int.
    ValueError
        If invalid weight scheme.
    """

    # Ensure that the lists are both the same length
    assert(len(y_true) == len(y_pred))

    # This rather crazy looking typecast is intended to work as follows:
    # If an input is an int, the operations will have no effect.
    # If it is a float, it will be rounded and then converted to an int
    # because the ml_metrics package requires ints.
    # If it is a str like "1", then it will be converted to a (rounded) int.
    # If it is a str that can't be typecast, then the user is
    # given a hopefully useful error message.
    try:
        y_true = [int(np.round(float(y))) for y in y_true]
        y_pred = [int(np.round(float(y))) for y in y_pred]
    except ValueError:
        raise ValueError("For kappa, the labels should be integers or strings "
                         "that can be converted to ints (E.g., '4.0' or '3').")

    # Figure out normalized expected values
    min_rating = min(min(y_true), min(y_pred))
    max_rating = max(max(y_true), max(y_pred))

    # shift the values so that the lowest value is 0
    # (to support scales that include negative values)
    y_true = [y - min_rating for y in y_true]
    y_pred = [y - min_rating for y in y_pred]

    # Build the observed/confusion matrix
    num_ratings = max_rating - min_rating + 1
    observed = confusion_matrix(y_true, y_pred,
                                labels=list(range(num_ratings)))
    num_scored_items = float(len(y_true))

    # Build weight array if weren't passed one
    if isinstance(weights, str):
        wt_scheme = weights
        weights = None
    else:
        wt_scheme = ''
    if weights is None:
        weights = np.empty((num_ratings, num_ratings))
        for i in range(num_ratings):
            for j in range(num_ratings):
                diff = abs(i - j)
                if allow_off_by_one and diff:
                    diff -= 1
                if wt_scheme == 'linear':
                    weights[i, j] = diff
                elif wt_scheme == 'quadratic':
                    weights[i, j] = diff ** 2
                elif not wt_scheme:  # unweighted
                    weights[i, j] = bool(diff)
                else:
                    raise ValueError('Invalid weight scheme specified for '
                                     'kappa: {}'.format(wt_scheme))

    hist_true = np.bincount(y_true, minlength=num_ratings)
    hist_true = hist_true[: num_ratings] / num_scored_items
    hist_pred = np.bincount(y_pred, minlength=num_ratings)
    hist_pred = hist_pred[: num_ratings] / num_scored_items
    expected = np.outer(hist_true, hist_pred)

    # Normalize observed array
    observed = observed / num_scored_items

    # If all weights are zero, that means no disagreements matter.
    k = 1.0
    if np.count_nonzero(weights):
        k -= (sum(sum(weights * observed)) / sum(sum(weights * expected)))

    return k

In [3]:
essay_sets = pd.read_pickle('training_features.pkl')

In [4]:
all_features = [  
                'word_count',
                'corrections',
                'similarity',
                'token_count',
                'unique_token_count',
                'nostop_count',
                'sent_count',
                'ner_count',
                'comma',
                'question',
                'exclamation',
                'quotation',
                'organization',
                'caps',
                'person',
                'location',
                'money',
                'time',
                'date',
                'percent',
                'noun',
                'adj',
                'pron',
                'verb',
                'cconj',
                'adv',
                'det',
                'propn',
                'num',
                'part',
                'intj',
                ]

In [5]:
essay_sets[all_features]

Unnamed: 0,word_count,corrections,similarity,token_count,unique_token_count,nostop_count,sent_count,ner_count,comma,question,...,adj,pron,verb,cconj,adv,det,propn,num,part,intj
0,338,11,0.953891,396,181,204,19,3,18,2,...,18,36,51,14,18,32,5,0,16,2
1,419,19,0.954198,456,206,237,23,12,14,1,...,23,35,83,18,26,45,7,5,10,1
2,279,9,0.951935,305,162,153,23,5,9,0,...,19,20,40,16,13,32,1,3,10,0
3,524,35,0.966408,579,266,332,35,15,14,1,...,41,21,80,17,25,55,29,0,24,2
4,465,17,0.955189,516,211,252,30,6,13,0,...,27,30,80,16,40,60,3,3,21,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12971,845,15,0.925195,954,347,449,51,41,50,0,...,41,125,134,55,76,79,32,16,24,5
12972,546,41,0.957361,644,230,329,55,24,22,10,...,37,67,82,28,54,64,12,5,17,5
12973,817,23,0.952395,954,381,513,52,29,47,7,...,38,68,122,39,64,112,36,5,21,5
12974,562,13,0.969016,666,258,355,44,10,40,2,...,44,77,78,22,62,67,11,4,10,1


In [6]:
X = essay_sets[all_features]
y = essay_sets['adjusted_score'].astype(np.float64)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=26)

In [8]:
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

dump(sc, 'std_scaler.bin', compress=True)

['std_scaler.bin']

In [9]:
sc=load('std_scaler.bin')

In [10]:
y_train

4000     100.00
8867      25.75
9439     100.00
10093     75.25
7481      75.25
          ...  
9318       1.00
10177    100.00
9648      75.25
5894      34.00
4917      34.00
Name: adjusted_score, Length: 9083, dtype: float64

In [11]:
X_train_std

array([[-0.21215784, -0.47114778,  0.16915286, ..., -0.65016721,
        -0.50962189, -0.41331823],
       [-0.85046801, -1.11636985, -0.68560563, ..., -0.65016721,
        -0.87507064,  0.38273128],
       [ 0.20188119, -0.68622181,  0.78143018, ...,  0.46459801,
        -0.38780564, -0.41331823],
       ...,
       [-0.40192573, -0.36361077, -0.12389136, ..., -0.65016721,
        -0.38780564, -0.41331823],
       [-1.01723373, -0.5786848 , -0.82877173, ..., -0.65016721,
        -0.75325439, -0.41331823],
       [-0.22365892, -0.68622181,  0.62496141, ..., -0.0927846 ,
        -0.50962189, -0.41331823]])

In [19]:
t0 = datetime.now()

paramgrid = {'l1_ratio': [.01, .1, .5, .9], 'alpha': [0.01, .1, 1]}

gs = GridSearchCV(ElasticNet(max_iter=100000, random_state=26),
                  param_grid=paramgrid,
                  cv=5)

gs.fit(X_train, y_train)
dump(gs, 'elasticnet.joblib') 

y_pred = gs.predict(X_test)

print('Accuracy: ', gs.best_score_)
print(' ')

print('kappa = ', kappa(y_pred, y_test, weights='quadratic'))

t1 = datetime.now()
print('Processing time: {}'.format(t1 - t0))

Accuracy:  0.38921601638812353
 
kappa =  0.5475281291586523
Processing time: 0:02:02.955716


In [204]:
t0 = datetime.now()

parameters = {
    "kernel": ["rbf"],
    "C": [1,10,10,100,1000],
    "gamma": [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
    }

grid_cv = GridSearchCV(svm.SVR(), parameters, cv=5, verbose=2)
grid_cv.fit(X_train_std, y_train)

print("Best parameters set found on development set:")
print(grid_cv.best_params_)

t1 = datetime.now()
print('Processing time: {}'.format(t1 - t0))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV] C=1, gamma=1e-08, kernel=rbf ....................................
[CV] ..................... C=1, gamma=1e-08, kernel=rbf, total=   2.7s
[CV] C=1, gamma=1e-08, kernel=rbf ....................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.6s remaining:    0.0s


[CV] ..................... C=1, gamma=1e-08, kernel=rbf, total=   2.6s
[CV] C=1, gamma=1e-08, kernel=rbf ....................................
[CV] ..................... C=1, gamma=1e-08, kernel=rbf, total=   2.5s
[CV] C=1, gamma=1e-08, kernel=rbf ....................................
[CV] ..................... C=1, gamma=1e-08, kernel=rbf, total=   2.6s
[CV] C=1, gamma=1e-08, kernel=rbf ....................................
[CV] ..................... C=1, gamma=1e-08, kernel=rbf, total=   2.5s
[CV] C=1, gamma=1e-07, kernel=rbf ....................................
[CV] ..................... C=1, gamma=1e-07, kernel=rbf, total=   2.6s
[CV] C=1, gamma=1e-07, kernel=rbf ....................................
[CV] ..................... C=1, gamma=1e-07, kernel=rbf, total=   2.5s
[CV] C=1, gamma=1e-07, kernel=rbf ....................................
[CV] ..................... C=1, gamma=1e-07, kernel=rbf, total=   2.4s
[CV] C=1, gamma=1e-07, kernel=rbf ....................................
[CV] .

[CV] .................... C=10, gamma=1e-05, kernel=rbf, total=   2.8s
[CV] C=10, gamma=0.0001, kernel=rbf ..................................
[CV] ................... C=10, gamma=0.0001, kernel=rbf, total=   2.9s
[CV] C=10, gamma=0.0001, kernel=rbf ..................................
[CV] ................... C=10, gamma=0.0001, kernel=rbf, total=   2.8s
[CV] C=10, gamma=0.0001, kernel=rbf ..................................
[CV] ................... C=10, gamma=0.0001, kernel=rbf, total=   2.8s
[CV] C=10, gamma=0.0001, kernel=rbf ..................................
[CV] ................... C=10, gamma=0.0001, kernel=rbf, total=   2.7s
[CV] C=10, gamma=0.0001, kernel=rbf ..................................
[CV] ................... C=10, gamma=0.0001, kernel=rbf, total=   2.8s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] .................... C=10, gamma=0.001, kernel=rbf, total=   2.8s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] .

[CV] ...................... C=10, gamma=0.1, kernel=rbf, total=   3.1s
[CV] C=10, gamma=0.1, kernel=rbf .....................................
[CV] ...................... C=10, gamma=0.1, kernel=rbf, total=   3.2s
[CV] C=10, gamma=0.1, kernel=rbf .....................................
[CV] ...................... C=10, gamma=0.1, kernel=rbf, total=   3.1s
[CV] C=100, gamma=1e-08, kernel=rbf ..................................
[CV] ................... C=100, gamma=1e-08, kernel=rbf, total=   2.5s
[CV] C=100, gamma=1e-08, kernel=rbf ..................................
[CV] ................... C=100, gamma=1e-08, kernel=rbf, total=   2.5s
[CV] C=100, gamma=1e-08, kernel=rbf ..................................
[CV] ................... C=100, gamma=1e-08, kernel=rbf, total=   2.6s
[CV] C=100, gamma=1e-08, kernel=rbf ..................................
[CV] ................... C=100, gamma=1e-08, kernel=rbf, total=   2.5s
[CV] C=100, gamma=1e-08, kernel=rbf ..................................
[CV] .

[CV] .................. C=1000, gamma=1e-05, kernel=rbf, total=   2.8s
[CV] C=1000, gamma=1e-05, kernel=rbf .................................
[CV] .................. C=1000, gamma=1e-05, kernel=rbf, total=   2.8s
[CV] C=1000, gamma=1e-05, kernel=rbf .................................
[CV] .................. C=1000, gamma=1e-05, kernel=rbf, total=   2.8s
[CV] C=1000, gamma=1e-05, kernel=rbf .................................
[CV] .................. C=1000, gamma=1e-05, kernel=rbf, total=   2.8s
[CV] C=1000, gamma=1e-05, kernel=rbf .................................
[CV] .................. C=1000, gamma=1e-05, kernel=rbf, total=   2.7s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] ................. C=1000, gamma=0.0001, kernel=rbf, total=   2.8s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] ................. C=1000, gamma=0.0001, kernel=rbf, total=   2.9s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .

[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed: 11.3min finished


Best parameters set found on development set:
{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
Processing time: 0:11:24.824130


In [12]:
from sklearn import svm

t0 = datetime.now()

clf = svm.SVR(C=10, gamma=0.1)
clf.fit(X_train_std, y_train)
dump(clf, 'svr.joblib') 

y_pred = clf.predict(X_test_std)

# print('Accuracy: ', clf.best_score_)
print(' ')

print('kappa = ', kappa(y_pred, y_test, weights='quadratic'))

t1 = datetime.now()
print('Processing time: {}'.format(t1 - t0))

 
kappa =  0.703568914366227
Processing time: 0:00:05.170002


In [13]:
print(clf.score(X_test_std,y_test))

0.5438604025974494


In [14]:
X

Unnamed: 0,word_count,corrections,similarity,token_count,unique_token_count,nostop_count,sent_count,ner_count,comma,question,...,adj,pron,verb,cconj,adv,det,propn,num,part,intj
0,338,11,0.953891,396,181,204,19,3,18,2,...,18,36,51,14,18,32,5,0,16,2
1,419,19,0.954198,456,206,237,23,12,14,1,...,23,35,83,18,26,45,7,5,10,1
2,279,9,0.951935,305,162,153,23,5,9,0,...,19,20,40,16,13,32,1,3,10,0
3,524,35,0.966408,579,266,332,35,15,14,1,...,41,21,80,17,25,55,29,0,24,2
4,465,17,0.955189,516,211,252,30,6,13,0,...,27,30,80,16,40,60,3,3,21,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12971,845,15,0.925195,954,347,449,51,41,50,0,...,41,125,134,55,76,79,32,16,24,5
12972,546,41,0.957361,644,230,329,55,24,22,10,...,37,67,82,28,54,64,12,5,17,5
12973,817,23,0.952395,954,381,513,52,29,47,7,...,38,68,122,39,64,112,36,5,21,5
12974,562,13,0.969016,666,258,355,44,10,40,2,...,44,77,78,22,62,67,11,4,10,1


In [15]:
sample_essay = "In “Let there be dark,” Paul talks about the importance of darkness. Darkness is essential to humans. He states, “Our bodies need darkness to produce the hormone melatonin, which keeps certain cancers from developing, and our bodies need darkness for sleep, sleep. Sleep disorders have been linked to diabetes, obesity, cardiovascular disease and depression and recent research suggests are main cause of “short sleep” is “long light.” Whether we work at night or simply take our tablets, notebooks and smartphones to bed, there isn’t a place for this much artificial light in our lives.” (2). Here, He talks about the importance of darkness to humans. Humans need darkness to sleep in order to be healthy. Animals also need darkness. He states, “The rest of the world depends on darkness as well, including nocturnal and crepuscular species of birds, insects, mammals, fish and reptiles. Some examples are well known—the 400 species of birds that migrate at night in North America, the sea turtles that come ashore to lay their eggs—and some are not, such as the bats that save American farmers billions in pest control and the moths that pollinate 80% of the world’s flora. Ecological light pollution is like the bulldozer of the night, wrecking habitat and disrupting ecosystems several billion years in the making. Simply put, without darkness, Earth’s ecology would collapse...” (2). Here he explains that animals, too, need darkness to survive."
d = {'essay': [sample_essay]}
essay_data = pd.DataFrame(data=d)

essay_data

Unnamed: 0,essay
0,"In “Let there be dark,” Paul talks about the i..."


In [16]:
essay_data['word_count'] = essay_data['essay'].str.strip().str.split().str.len()

In [17]:
tool = language_check.LanguageTool('en-US')

essay_data['matches'] = essay_data['essay'].apply(lambda txt: tool.check(txt))
essay_data['corrections'] = essay_data.apply(lambda l: len(l['matches']), axis=1)
essay_data['corrected'] = essay_data.apply(lambda l: language_check.correct(l['essay'], l['matches']), axis=1)

essay_data[['matches', 'corrections', 'corrected']]

Unnamed: 0,matches,corrections,corrected
0,"[Line 1, column 163, Rule ID: MORFOLOGIK_RULE_...",2,"In “Let there be dark,” Paul talks about the i..."


In [18]:
print(essay_data['essay'][0])
print()
print(essay_data['corrected'][0])

In “Let there be dark,” Paul talks about the importance of darkness. Darkness is essential to humans. He states, “Our bodies need darkness to produce the hormone melatonin, which keeps certain cancers from developing, and our bodies need darkness for sleep, sleep. Sleep disorders have been linked to diabetes, obesity, cardiovascular disease and depression and recent research suggests are main cause of “short sleep” is “long light.” Whether we work at night or simply take our tablets, notebooks and smartphones to bed, there isn’t a place for this much artificial light in our lives.” (2). Here, He talks about the importance of darkness to humans. Humans need darkness to sleep in order to be healthy. Animals also need darkness. He states, “The rest of the world depends on darkness as well, including nocturnal and crepuscular species of birds, insects, mammals, fish and reptiles. Some examples are well known—the 400 species of birds that migrate at night in North America, the sea turtles t

In [19]:
import en_core_web_sm

sents = []
tokens = []
lemma = []
pos = []
ner = []

stop_words = set(STOP_WORDS)
stop_words.update(punctuation) # remove it if you need punctuation 

nlp = en_core_web_sm.load()

t0 = datetime.now()

# suppress numpy warnings
np.warnings.filterwarnings('ignore')

for essay in nlp.pipe(essay_data['corrected']):
    tokens.append([e.text for e in essay])
    sents.append([sent.string.strip() for sent in essay.sents])
    pos.append([e.pos_ for e in essay])
    ner.append([e.text for e in essay.ents])
    lemma.append([n.lemma_ for n in essay])


essay_data['tokens'] = tokens
essay_data['lemma'] = lemma
essay_data['pos'] = pos
essay_data['sents'] = sents
essay_data['ner'] = ner

In [20]:
essay_data[['tokens', 'pos', 'sents', 'ner']]

Unnamed: 0,tokens,pos,sents,ner
0,"[In, “, Let, there, be, dark, ,, ”, Paul, talk...","[ADP, PUNCT, VERB, PRON, AUX, ADJ, PUNCT, PUNC...","[In “Let there be dark,” Paul talks about the ...","[night, 2, 400, North America, American, billi..."


In [21]:
import random

reference_essays = {1: 161, 2: 3022, 3: 5263, 4: 5341, 5: 7209, 6: 8896, 7: 11796, 8: 12340} # topic: essay_id

references = {}

stop_words = set(STOP_WORDS)

# generate nlp object for reference essays:
for topic, index in reference_essays.items():
    references[topic] = nlp(essay_sets.iloc[index]['essay'])


def avg_similarity(essay):
    sim = 0
    
    for ref in references:
        sim += nlp(essay).similarity(references[ref])
    
    return sim / 8
    
    
essay_data['similarity'] = essay_data.apply(lambda row: avg_similarity(row['essay']), axis=1)

In [22]:
essay_data['similarity']

0    0.799273
Name: similarity, dtype: float64

In [23]:
essay_data['token_count'] = essay_data.apply(lambda x: len(x['tokens']), axis=1)
essay_data['unique_token_count'] = essay_data.apply(lambda x: len(set(x['tokens'])), axis=1)
essay_data['nostop_count'] = essay_data \
            .apply(lambda x: len([token for token in x['tokens'] if token not in stop_words]), axis=1)
essay_data['sent_count'] = essay_data.apply(lambda x: len(x['sents']), axis=1)
essay_data['ner_count'] = essay_data.apply(lambda x: len(x['ner']), axis=1)
essay_data['comma'] = essay_data.apply(lambda x: x['corrected'].count(','), axis=1)
essay_data['question'] = essay_data.apply(lambda x: x['corrected'].count('?'), axis=1)
essay_data['exclamation'] = essay_data.apply(lambda x: x['corrected'].count('!'), axis=1)
essay_data['quotation'] = essay_data.apply(lambda x: x['corrected'].count('"') + x['corrected'].count("'"), axis=1)
essay_data['organization'] = essay_data.apply(lambda x: x['corrected'].count(r'@ORGANIZATION'), axis=1)
essay_data['caps'] = essay_data.apply(lambda x: x['corrected'].count(r'@CAPS'), axis=1)
essay_data['person'] = essay_data.apply(lambda x: x['corrected'].count(r'@PERSON'), axis=1)
essay_data['location'] = essay_data.apply(lambda x: x['corrected'].count(r'@LOCATION'), axis=1)
essay_data['money'] = essay_data.apply(lambda x: x['corrected'].count(r'@MONEY'), axis=1)
essay_data['time'] = essay_data.apply(lambda x: x['corrected'].count(r'@TIME'), axis=1)
essay_data['date'] = essay_data.apply(lambda x: x['corrected'].count(r'@DATE'), axis=1)
essay_data['percent'] = essay_data.apply(lambda x: x['corrected'].count(r'@PERCENT'), axis=1)
essay_data['noun'] = essay_data.apply(lambda x: x['pos'].count('NOUN'), axis=1)
essay_data['adj'] = essay_data.apply(lambda x: x['pos'].count('ADJ'), axis=1)
essay_data['pron'] = essay_data.apply(lambda x: x['pos'].count('PRON'), axis=1)
essay_data['verb'] = essay_data.apply(lambda x: x['pos'].count('VERB'), axis=1)
essay_data['noun'] = essay_data.apply(lambda x: x['pos'].count('NOUN'), axis=1)
essay_data['cconj'] = essay_data.apply(lambda x: x['pos'].count('CCONJ'), axis=1)
essay_data['adv'] = essay_data.apply(lambda x: x['pos'].count('ADV'), axis=1)
essay_data['det'] = essay_data.apply(lambda x: x['pos'].count('DET'), axis=1)
essay_data['propn'] = essay_data.apply(lambda x: x['pos'].count('PROPN'), axis=1)
essay_data['num'] = essay_data.apply(lambda x: x['pos'].count('NUM'), axis=1)
essay_data['part'] = essay_data.apply(lambda x: x['pos'].count('PART'), axis=1)
essay_data['intj'] = essay_data.apply(lambda x: x['pos'].count('INTJ'), axis=1)

In [24]:
essay_data

Unnamed: 0,essay,word_count,matches,corrections,corrected,tokens,lemma,pos,sents,ner,...,adj,pron,verb,cconj,adv,det,propn,num,part,intj
0,"In “Let there be dark,” Paul talks about the i...",234,"[Line 1, column 163, Rule ID: MORFOLOGIK_RULE_...",2,"In “Let there be dark,” Paul talks about the i...","[In, “, Let, there, be, dark, ,, ”, Paul, talk...","[in, "", let, there, be, dark, ,, "", Paul, talk...","[ADP, PUNCT, VERB, PRON, AUX, ADJ, PUNCT, PUNC...","[In “Let there be dark,” Paul talks about the ...","[night, 2, 400, North America, American, billi...",...,20,7,31,10,10,27,4,5,9,0


In [None]:
essay_features

In [None]:
score_pred = clf.predict(essay_features)

score_pred

In [33]:
def extract_features(essay_text):
    d = {'essay': [essay_text]}
    essay_data = pd.DataFrame(data=d)
    
    essay_data['word_count'] = essay_data['essay'].str.strip().str.split().str.len()
    
    tool = language_check.LanguageTool('en-US')

    essay_data['matches'] = essay_data['essay'].apply(lambda txt: tool.check(txt))
    essay_data['corrections'] = essay_data.apply(lambda l: len(l['matches']), axis=1)
    essay_data['corrected'] = essay_data.apply(lambda l: language_check.correct(l['essay'], l['matches']), axis=1)

    sents = []
    tokens = []
    lemma = []
    pos = []
    ner = []

    stop_words = set(STOP_WORDS)
    stop_words.update(punctuation)

    nlp = en_core_web_sm.load()

    np.warnings.filterwarnings('ignore')

    for essay in nlp.pipe(essay_data['corrected']):
        tokens.append([e.text for e in essay])
        sents.append([sent.string.strip() for sent in essay.sents])
        pos.append([e.pos_ for e in essay])
        ner.append([e.text for e in essay.ents])
        lemma.append([n.lemma_ for n in essay])

    essay_data['tokens'] = tokens
    essay_data['lemma'] = lemma
    essay_data['pos'] = pos
    essay_data['sents'] = sents
    essay_data['ner'] = ner
    
    essay_data['similarity'] = essay_data.apply(lambda row: avg_similarity(row['essay']), axis=1)
    
    essay_data['token_count'] = essay_data.apply(lambda x: len(x['tokens']), axis=1)
    essay_data['unique_token_count'] = essay_data.apply(lambda x: len(set(x['tokens'])), axis=1)
    essay_data['nostop_count'] = essay_data \
            .apply(lambda x: len([token for token in x['tokens'] if token not in stop_words]), axis=1)
    essay_data['sent_count'] = essay_data.apply(lambda x: len(x['sents']), axis=1)
    essay_data['ner_count'] = essay_data.apply(lambda x: len(x['ner']), axis=1)
    essay_data['comma'] = essay_data.apply(lambda x: x['corrected'].count(','), axis=1)
    essay_data['question'] = essay_data.apply(lambda x: x['corrected'].count('?'), axis=1)
    essay_data['exclamation'] = essay_data.apply(lambda x: x['corrected'].count('!'), axis=1)
    essay_data['quotation'] = essay_data.apply(lambda x: x['corrected'].count('"') + x['corrected'].count("'"), axis=1)
    essay_data['organization'] = essay_data.apply(lambda x: x['corrected'].count(r'@ORGANIZATION'), axis=1)
    essay_data['caps'] = essay_data.apply(lambda x: x['corrected'].count(r'@CAPS'), axis=1)
    essay_data['person'] = essay_data.apply(lambda x: x['corrected'].count(r'@PERSON'), axis=1)
    essay_data['location'] = essay_data.apply(lambda x: x['corrected'].count(r'@LOCATION'), axis=1)
    essay_data['money'] = essay_data.apply(lambda x: x['corrected'].count(r'@MONEY'), axis=1)
    essay_data['time'] = essay_data.apply(lambda x: x['corrected'].count(r'@TIME'), axis=1)
    essay_data['date'] = essay_data.apply(lambda x: x['corrected'].count(r'@DATE'), axis=1)
    essay_data['percent'] = essay_data.apply(lambda x: x['corrected'].count(r'@PERCENT'), axis=1)
    essay_data['noun'] = essay_data.apply(lambda x: x['pos'].count('NOUN'), axis=1)
    essay_data['adj'] = essay_data.apply(lambda x: x['pos'].count('ADJ'), axis=1)
    essay_data['pron'] = essay_data.apply(lambda x: x['pos'].count('PRON'), axis=1)
    essay_data['verb'] = essay_data.apply(lambda x: x['pos'].count('VERB'), axis=1)
    essay_data['noun'] = essay_data.apply(lambda x: x['pos'].count('NOUN'), axis=1)
    essay_data['cconj'] = essay_data.apply(lambda x: x['pos'].count('CCONJ'), axis=1)
    essay_data['adv'] = essay_data.apply(lambda x: x['pos'].count('ADV'), axis=1)
    essay_data['det'] = essay_data.apply(lambda x: x['pos'].count('DET'), axis=1)
    essay_data['propn'] = essay_data.apply(lambda x: x['pos'].count('PROPN'), axis=1)
    essay_data['num'] = essay_data.apply(lambda x: x['pos'].count('NUM'), axis=1)
    essay_data['part'] = essay_data.apply(lambda x: x['pos'].count('PART'), axis=1)
    essay_data['intj'] = essay_data.apply(lambda x: x['pos'].count('INTJ'), axis=1)
    
    return essay_data

In [34]:
sample_essay2 = "My little brother is so irritating. All day long he says, “Eddie, I wonder why people can talk but animals can’t.” Or, “I wonder why the ocean looks blue.” Of course, I don’t know the answers, but I don’t let him know that. I just make up reasonable explanations, and he accepts them as if I’m the smartest person in the world. Before I answer one of his questions, I usually tell him that he’s pretty stupid and asks too many questions. Well, yesterday we both got our report cards. I got B’s and C’s, and he got straight A’s. Under the “Comments” section on my report card, it said, “Eddie would be getting better grades if he asked more questions.” Of course, on my brother’s report card, it said just the opposite. To make things worse, my brother squawked all day about how I was so stupid for not asking questions! I just sighed and told him he was right—I wouldn't make fun of him anymore for asking so many questions. Yes, I learned a lesson from my little brother: Never be afraid to ask questions, and NEVER be afraid to wonder why."

sample_essay2

"My little brother is so irritating. All day long he says, “Eddie, I wonder why people can talk but animals can’t.” Or, “I wonder why the ocean looks blue.” Of course, I don’t know the answers, but I don’t let him know that. I just make up reasonable explanations, and he accepts them as if I’m the smartest person in the world. Before I answer one of his questions, I usually tell him that he’s pretty stupid and asks too many questions. Well, yesterday we both got our report cards. I got B’s and C’s, and he got straight A’s. Under the “Comments” section on my report card, it said, “Eddie would be getting better grades if he asked more questions.” Of course, on my brother’s report card, it said just the opposite. To make things worse, my brother squawked all day about how I was so stupid for not asking questions! I just sighed and told him he was right—I wouldn't make fun of him anymore for asking so many questions. Yes, I learned a lesson from my little brother: Never be afraid to ask qu

In [35]:
sample_essay

'In “Let there be dark,” Paul talks about the importance of darkness. Darkness is essential to humans. He states, “Our bodies need darkness to produce the hormone melatonin, which keeps certain cancers from developing, and our bodies need darkness for sleep, sleep. Sleep disorders have been linked to diabetes, obesity, cardiovascular disease and depression and recent research suggests are main cause of “short sleep” is “long light.” Whether we work at night or simply take our tablets, notebooks and smartphones to bed, there isn’t a place for this much artificial light in our lives.” (2). Here, He talks about the importance of darkness to humans. Humans need darkness to sleep in order to be healthy. Animals also need darkness. He states, “The rest of the world depends on darkness as well, including nocturnal and crepuscular species of birds, insects, mammals, fish and reptiles. Some examples are well known—the 400 species of birds that migrate at night in North America, the sea turtles 

In [36]:
essay1 = extract_features(sample_essay)
essay1_scaled = sc.transform(essay1[all_features])
e_pred1 = clf.predict(essay1_scaled)

e_pred1

array([73.46911379])

In [37]:
essay1[all_features]

Unnamed: 0,word_count,corrections,similarity,token_count,unique_token_count,nostop_count,sent_count,ner_count,comma,question,...,adj,pron,verb,cconj,adv,det,propn,num,part,intj
0,234,2,0.799273,294,161,156,16,11,22,0,...,20,7,31,10,10,27,4,5,9,0


In [38]:
essay2 = extract_features(sample_essay2)
essay2_scaled = sc.transform(essay2[all_features])
e_pred2 = clf.predict(essay2_scaled)

e_pred2

array([67.83724891])

In [39]:
essay2[all_features]

Unnamed: 0,word_count,corrections,similarity,token_count,unique_token_count,nostop_count,sent_count,ner_count,comma,question,...,adj,pron,verb,cconj,adv,det,propn,num,part,intj
0,194,0,0.854395,244,126,119,14,5,16,0,...,15,27,37,9,22,18,6,1,9,2


In [40]:
import cv2
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r'H:\Tesseract\tesseract.exe'

In [41]:
image = cv2.imread("images/essay.jpg")

thresh = cv2.threshold(image, 150, 255, cv2.THRESH_BINARY_INV)[1]

result = cv2.GaussianBlur(thresh, (5,5), 0)
result = 255 - result

data = pytesseract.image_to_string(result, lang='eng',config='--psm 6')
print(data)

E-book readers are changing the way people read, or so e-book developers hope. The main
selling point for these handheld devices, which are sort of the size of a paperback
book, is that they make books easy to access and carry. Electronic versions of printed
books can be downloaded online for a few bucks or directly from your cell phone. These
devices can store hundreds of books in memory and, with text-to-speech features, can
even read the texts. The market for e-books and e-book readers keeps expanding as a lot
of companies enter it. Online and traditional booksellers have been the first to market
e-book readers to the public, but computer companies, especially the ones already
involved in cell phone, online music, and notepad computer technology, will also enter
the market. The problem for consumers, however, is which device to choose.
Incompatibility is the norm. E-books can be read only on the devices they were intended
for. Furthermore, use is restricted by the same kind of DRM s

In [42]:
def predict_score(essay_text):
    essay = extract_features(essay_text)
    essay_scaled = sc.transform(essay[all_features])
    score_pred = clf.predict(essay_scaled)

    return score_pred[0]

In [43]:
predict_score(data)

78.8516039844114

In [44]:
import anvil.server

anvil.server.connect("7P3FVJT7THECP7ODOIFSU3CV-WQQNE63QV6PTSW4S")

Connecting to wss://anvil.works/uplink
Anvil websocket open
Connected to "Default environment (dev)" as SERVER


In [45]:
import anvil.media

@anvil.server.callable
def text_from_image(file):
    with anvil.media.TempFile(file) as filename:
        image = cv2.imread(filename)

    thresh = cv2.threshold(image, 150, 255, cv2.THRESH_BINARY_INV)[1]

    result = cv2.GaussianBlur(thresh, (5,5), 0)
    result = 255 - result

    data = pytesseract.image_to_string(result, lang='eng',config='--psm 6')
    return data


@anvil.server.callable
def get_score(text):
    num_score = predict_score(text)
    letter_score = 'F'
    
    if num_score >= 90:
        letter_score = 'A'
    elif num_score >= 80:
        letter_score = 'B'
    elif num_score >= 70:
        letter_score = 'C'
    elif num_score >= 60:
        letter_score = 'D'
        
    return num_score, letter_score