In [6]:
import spacy
import os
import sklearn
import numpy as np
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 15000000
from spacy.lang.en.stop_words import STOP_WORDS
import readability
import json
from collections import OrderedDict
from sklearn.metrics import classification_report
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
import textacy

In [7]:
# Load the training split: a JSON object keyed by summary file name whose
# values carry the human ratings (the 'grammaticality' entry is used later).
with open("./summary_quality/train_data.json") as fin:
    train_content = json.load(fin)

In [8]:
# Load the held-out test split: a JSON object keyed by summary file name whose
# values carry the human ratings (the 'grammaticality' entry is used later).
with open("./summary_quality/test_data.json") as fin:
    test_content = json.load(fin)

In [9]:
def getFeature1(wordList):
    """Count runs of immediately repeated words.

    A run is a maximal stretch of two or more identical consecutive entries
    of ``wordList``; every run contributes exactly one to the result, no
    matter how long it is.

    Args:
        wordList: List of words in document order.

    Returns:
        int: Number of repeated-word runs (0 when the list has fewer than
        two entries).
    """
    runs = 0
    inside_run = False
    for previous, current in zip(wordList, wordList[1:]):
        if previous != current:
            inside_run = False
        elif not inside_run:
            # First repeated pair of a new run; further pairs of the same
            # run must not be counted again.
            runs += 1
            inside_run = True
    return runs

In [10]:
def getFeature2(wordList):
    """Count immediately repeated word pairs (e.g. ``a b a b``).

    Scanning left to right, a hit is a pair of two *different* adjacent
    words that is directly followed by the same pair again. Back-to-back
    copies of that pair are counted once, and scanning resumes at the
    position where the copying stopped.

    Args:
        wordList: List of words in document order.

    Returns:
        int: Number of repeated-pair stretches found (0 when the list has
        fewer than four entries).
    """
    total = len(wordList)
    repeats = 0
    start = 0
    while start + 3 < total:
        first = wordList[start]
        second = wordList[start + 1]
        cursor = start + 2
        if first == second or wordList[cursor] != first or wordList[cursor + 1] != second:
            # No repeated pair begins here; slide the window by one word.
            start += 1
            continue
        repeats += 1
        # Step over further copies of the pair, two words at a time. The
        # bound keeps cursor + 1 a valid index.
        while cursor < total - 2 and wordList[cursor] == first and wordList[cursor + 1] == second:
            cursor += 2
        start = cursor
    return repeats

In [11]:
def getFeature3(doc, default=None):
    """Return the lowest Flesch Reading Ease score among the sentences of ``doc``.

    Every sentence is scored on its own with ``readability.getmeasures``;
    sentences for which that call raises ``ValueError`` are skipped.

    Args:
        doc: Object exposing ``sents``, an iterable of sentence spans that
            each have a ``text`` attribute (a spaCy ``Doc`` in this notebook).
        default: Value to return when no sentence could be scored. When left
            as ``None`` a ``ValueError`` is raised instead.

    Returns:
        The minimum ``FleschReadingEase`` value over all scored sentences, or
        ``default`` when there are none and a default was supplied.

    Raises:
        ValueError: If no sentence could be scored and ``default`` is ``None``.
    """
    fleschScores = []
    for sent in doc.sents:
        try:
            scores = readability.getmeasures(sent.text, lang='en')
        except ValueError:
            # readability could not score this sentence; leave it out.
            continue
        fleschScores.append(scores['readability grades']['FleschReadingEase'])
    if not fleschScores:
        # min() on an empty list would fail with an unhelpful message.
        if default is None:
            raise ValueError("getFeature3: no sentence in the document could be scored")
        return default
    return min(fleschScores)

In [12]:
def getFeature4(doc):
    """Return the fraction of sentences that yield a subject-verb-object triple.

    Each sentence of ``doc`` is passed to
    ``textacy.extract.subject_verb_object_triples``; a sentence counts as a
    hit when that call produces at least one triple.

    Args:
        doc: Object exposing ``sents``, an iterable of sentence spans
            (a spaCy ``Doc`` in this notebook).

    Returns:
        float: Sentences with at least one triple divided by the total number
        of sentences, or ``0.0`` when ``doc`` has no sentences.
    """
    sentence_count = 0
    svo_sentences = 0
    for sent in doc.sents:
        sentence_count += 1
        triples = textacy.extract.subject_verb_object_triples(sent)
        # Only existence matters, so stop consuming at the first triple.
        if any(True for _ in triples):
            svo_sentences += 1
    if sentence_count == 0:
        # Nothing to divide by; an empty document has no SVO sentences.
        return 0.0
    return svo_sentences / sentence_count

In [13]:
def getFeature5(doc):
    """Return the ``words_per_sentence`` measure for the whole document.

    The full ``doc.text`` is handed to ``readability.getmeasures`` in one
    call and the value is read from the ``'sentence info'`` section of the
    result.

    Args:
        doc: Object with a ``text`` attribute (a spaCy ``Doc`` in this
            notebook).

    Returns:
        The ``words_per_sentence`` entry reported by ``readability``.
    """
    # NOTE(review): readability's documentation describes its input as one
    # sentence per line; the raw text is passed here unchanged. Confirm this
    # yields the intended sentence boundaries.
    sentence_info = readability.getmeasures(doc.text, lang='en')['sentence info']
    return sentence_info['words_per_sentence']
        

Run the next two cells to build the three-feature set (features 1–3) and its train/test split.

In [14]:
# Build the three-feature table: summary file name -> (feature 1, feature 2, feature 3).
summaryDir = "./summary_quality/summaries/"
skippedPos = ("PUNCT", "SPACE", "SYM")
docFeature = OrderedDict()
for file in os.listdir(summaryDir):
    # The context manager closes the file even when reading fails, and
    # releases the handle before the slower parsing and feature work starts.
    with open(os.path.join(summaryDir, file), encoding="ISO-8859-1") as f:
        text = f.read()
    doc = nlp(text)
    # Lower-cased content words: drop stop words (looked up in the vocab by
    # the token's surface text), punctuation, whitespace and symbols.
    wordList = [
        token.text.lower()
        for token in doc
        if not nlp.vocab[token.text].is_stop and token.pos_ not in skippedPos
    ]
    docFeature[file] = (getFeature1(wordList), getFeature2(wordList), getFeature3(doc))

In [16]:
#create train and test data
# Pair every summary's feature tuple with its 'grammaticality' rating,
# ordered by summary name so the row order is reproducible.
feature_X_Y = OrderedDict()
for k, v in sorted(train_content.items(), key=lambda item: item[0]):
    feature_X_Y[k] = (docFeature[k], v['grammaticality'])
test_X_Y = OrderedDict()
for k, v in sorted(test_content.items(), key=lambda item: item[0]):
    test_X_Y[k] = (docFeature[k], v['grammaticality'])

# Flatten into the plain float lists the regressor consumes; only the first
# three features of each tuple are used here.
featureCount = 3
test_x = []
test_y = []
for k, v in test_X_Y.items():
    test_x.append([float(v[0][idx]) for idx in range(featureCount)])
    test_y.append(float(v[1]))

# Initialised once, outside the test loop, so the training lists exist even
# when the test split is empty.
train_x = []
train_y = []
for k, v in feature_X_Y.items():
    train_x.append([float(v[0][idx]) for idx in range(featureCount)])
    train_y.append(float(v[1]))

Run the next two cells instead to build the five-feature set (features 1–5) and its train/test split.

In [18]:
# Build the five-feature table: summary file name -> (feature 1, ..., feature 5).
summaryDir = "./summary_quality/summaries/"
skippedPos = ("PUNCT", "SPACE", "SYM")
docFeature = OrderedDict()
for file in os.listdir(summaryDir):
    # The context manager closes the file even when reading fails, and
    # releases the handle before the slower parsing and feature work starts.
    with open(os.path.join(summaryDir, file), encoding="ISO-8859-1") as f:
        text = f.read()
    doc = nlp(text)
    # Lower-cased content words: drop stop words (looked up in the vocab by
    # the token's surface text), punctuation, whitespace and symbols.
    wordList = [
        token.text.lower()
        for token in doc
        if not nlp.vocab[token.text].is_stop and token.pos_ not in skippedPos
    ]
    docFeature[file] = (
        getFeature1(wordList),
        getFeature2(wordList),
        getFeature3(doc),
        getFeature4(doc),
        getFeature5(doc),
    )

In [19]:
#create train and test data
# Pair every summary's feature tuple with its 'grammaticality' rating,
# ordered by summary name so the row order is reproducible.
feature_X_Y = OrderedDict()
for k, v in sorted(train_content.items(), key=lambda item: item[0]):
    feature_X_Y[k] = (docFeature[k], v['grammaticality'])
test_X_Y = OrderedDict()
for k, v in sorted(test_content.items(), key=lambda item: item[0]):
    test_X_Y[k] = (docFeature[k], v['grammaticality'])

# Flatten into the plain float lists the regressor consumes; all five
# features of each tuple are used here.
featureCount = 5
test_x = []
test_y = []
for k, v in test_X_Y.items():
    test_x.append([float(v[0][idx]) for idx in range(featureCount)])
    test_y.append(float(v[1]))

# Initialised once, outside the test loop, so the training lists exist even
# when the test split is empty.
train_x = []
train_y = []
for k, v in feature_X_Y.items():
    train_x.append([float(v[0][idx]) for idx in range(featureCount)])
    train_y.append(float(v[1]))

Run the cell below to fit the SVR on whichever feature set was built last and print the test-set mean squared error and Pearson correlation.

In [20]:
# Fit an RBF-kernel support vector regressor on the training rows, then
# report the test-set mean squared error followed by the Pearson correlation
# between the true ratings and the predictions.
regressor = SVR(kernel='rbf').fit(train_x, train_y)
y_pred = regressor.predict(test_x)
mse = mean_squared_error(test_y, y_pred)
print(mse)
correlation = pearsonr(test_y, y_pred)
print(correlation)

0.6727410798116299
(0.02749712721445367, 0.7042478697495957)


