In [1]:
import spacy
import os
import sklearn
import numpy as np
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 15000000
from spacy.lang.en.stop_words import STOP_WORDS
import readability
import json
from collections import OrderedDict
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
import textacy
from sklearn.metrics import confusion_matrix
import gensim
from numpy import dot
from numpy.linalg import norm

In [2]:
# Load the training annotations. JSON text is UTF-8 by specification, so the
# encoding is named explicitly instead of relying on the platform default.
with open("./summary_quality/train_data.json", 'r', encoding="utf-8") as fin:
    train_content = json.load(fin)

In [3]:
# Load the test annotations. JSON text is UTF-8 by specification, so the
# encoding is named explicitly instead of relying on the platform default.
with open("./summary_quality/test_data.json", 'r', encoding="utf-8") as fin:
    test_content = json.load(fin)

In [4]:
def getFeature1(wordList):
    """Return the highest occurrence count of any single non-stop word.

    Parameters
    ----------
    wordList : iterable of str
        Tokens of one document. Each token is looked up in ``nlp.vocab`` and
        skipped when its lexeme is flagged as a stop word.

    Returns
    -------
    int
        Count of the most frequent non-stop word, or 0 when ``wordList`` is
        empty or contains only stop words.
    """
    unigram = {}
    for word in wordList:
        if not nlp.vocab[word].is_stop:
            unigram[word] = unigram.get(word, 0) + 1
    # default=0 keeps max() from raising ValueError when nothing was counted.
    return max(unigram.values(), default=0)

In [5]:
def getFeature2(wordList):
    """Return the highest occurrence count of any adjacent word pair (bigram).

    Parameters
    ----------
    wordList : list of str
        Tokens of one document, in order.

    Returns
    -------
    int
        Count of the most frequent bigram, or 0 when ``wordList`` has fewer
        than two tokens and therefore contains no bigram.
    """
    bigrams = {}
    # zip pairs every token with its successor and stops at the shorter slice.
    for bigram in zip(wordList, wordList[1:]):
        bigrams[bigram] = bigrams.get(bigram, 0) + 1
    # default=0 keeps max() from raising ValueError when there are no bigrams.
    return max(bigrams.values(), default=0)

In [10]:
# Load word2vec vectors (binary format) from the gzipped GoogleNews file.
# getFeature3 reads word2VecModel as a module-level global, so this cell has
# to be executed before any cell that calls getFeature3; otherwise the call
# fails with "NameError: name 'word2VecModel' is not defined".
word2VecModel = gensim.models.KeyedVectors.load_word2vec_format('../Q3/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [6]:
def _sentenceVector(sentence, model):
    """Average the embeddings of the space-separated words of ``sentence``.

    Returns ``None`` when no word of the sentence is present in ``model``.
    """
    total = None
    count = 0
    for word in sentence.split(" "):
        if word in model:
            vector = np.asarray(model[word], dtype=float)
            total = vector if total is None else total + vector
            count += 1
    if count == 0:
        return None
    return total / count


def getFeature3(doc, model=None):
    """Return the maximum cosine similarity between adjacent sentences.

    Each sentence is lower-cased, split on single spaces, and represented by
    the mean of the embeddings of the words found in the embedding model.

    Parameters
    ----------
    doc : object exposing ``sents``
        Iterable of sentence spans, each with a ``text`` attribute.
    model : mapping, optional
        Word-to-vector lookup supporting ``in`` and ``[]``. Defaults to the
        module-level ``word2VecModel``.

    Returns
    -------
    float or int
        Largest cosine similarity over all consecutive sentence pairs, or 0
        when the document has fewer than two sentences or no pair has a
        usable vector on both sides.
    """
    sentences = [sent.text.lower() for sent in doc.sents]
    if len(sentences) < 2:
        return 0
    if model is None:
        model = word2VecModel
    # Each sentence vector is built once, although interior sentences take
    # part in two adjacent pairs.
    vectors = [_sentenceVector(sentence, model) for sentence in sentences]
    cosineList = []
    for left, right in zip(vectors, vectors[1:]):
        # A sentence without any in-vocabulary word has no vector; skip the
        # pair instead of dividing by a zero word count.
        if left is None or right is None:
            continue
        denominator = norm(left) * norm(right)
        if denominator == 0:
            continue
        cos_sim = dot(left, right) / denominator
        if not np.isnan(cos_sim):
            cosineList.append(cos_sim)
    # default=0 covers the case where every pair was skipped.
    return max(cosineList, default=0)

In [7]:
def getFeature4(wordList):
    """Return the highest occurrence count of any three-word sequence (trigram).

    Parameters
    ----------
    wordList : list of str
        Tokens of one document, in order.

    Returns
    -------
    int
        Count of the most frequent trigram, or 0 when ``wordList`` has fewer
        than three tokens and therefore contains no trigram.
    """
    trigrams = {}
    # zip stops at the shortest slice, yielding every run of three tokens.
    for trigram in zip(wordList, wordList[1:], wordList[2:]):
        trigrams[trigram] = trigrams.get(trigram, 0) + 1
    # default=0 keeps max() from raising ValueError when there are no trigrams.
    return max(trigrams.values(), default=0)

In [8]:
def getFeature5(doc):
    """Return the 'wordtypes' count divided by the 'words' count for ``doc``.

    Both numbers are read from the 'sentence info' section of the measures
    that ``readability.getmeasures`` reports for ``doc.text``.
    NOTE(review): presumably this is a type-token ratio (distinct words over
    total words) - confirm against the readability package documentation.
    """
    measures = readability.getmeasures(doc.text, lang='en')
    wordTypes = measures['sentence info']['wordtypes']
    wordCount = measures['sentence info']['words']
    return wordTypes / wordCount

Run this cell to build the train/test sets with all 5 features (getFeature1 to getFeature5).

In [9]:
# Compute the five features for every summary file in the summaries directory.
summaryDir = "./summary_quality/summaries/"
docFeature = OrderedDict()
for file in os.listdir(summaryDir):
    path = os.path.join(summaryDir, file)
    # os.listdir also returns sub-directories; open() would fail on those.
    if not os.path.isfile(path):
        continue
    # The context manager closes the file even if reading raises.
    with open(path, encoding="ISO-8859-1") as f:
        text = f.read()
    doc = nlp(text)
    # Lower-cased tokens, leaving out punctuation, whitespace and symbols.
    wordList = [token.text.lower() for token in doc
                if token.pos_ not in ("PUNCT", "SPACE", "SYM")]
    docFeature[file] = (
        getFeature1(wordList),
        getFeature2(wordList),
        getFeature3(doc),
        getFeature4(wordList),
        getFeature5(doc),
    )

# Create train and test data: pair each document's feature tuple with its
# 'nonredundancy' score, visiting documents in sorted key order.
feature_X_Y = OrderedDict()
for k, v in sorted(train_content.items(), key=lambda item: item[0]):
    feature_X_Y[k] = (docFeature[k], v['nonredundancy'])
test_X_Y = OrderedDict()
for k, v in sorted(test_content.items(), key=lambda item: item[0]):
    test_X_Y[k] = (docFeature[k], v['nonredundancy'])

# Every list is built at top level, so train_x exists even when the test set
# is empty (it used to be initialised inside the test loop).
test_x = [[float(value) for value in features] for features, score in test_X_Y.values()]
test_y = [float(score) for features, score in test_X_Y.values()]
train_x = [[float(value) for value in features] for features, score in feature_X_Y.values()]
train_y = [float(score) for features, score in feature_X_Y.values()]

NameError: name 'word2VecModel' is not defined

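Run this cell to use only the first 3 features (getFeature1 to getFeature3); it overwrites docFeature and the train/test lists built by the 5-feature cell.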

In [None]:
# Compute the first three features for every summary file in the summaries
# directory.
summaryDir = "./summary_quality/summaries/"
docFeature = OrderedDict()
for file in os.listdir(summaryDir):
    path = os.path.join(summaryDir, file)
    # os.listdir also returns sub-directories; open() would fail on those.
    if not os.path.isfile(path):
        continue
    # The context manager closes the file even if reading raises.
    with open(path, encoding="ISO-8859-1") as f:
        text = f.read()
    doc = nlp(text)
    # Lower-cased tokens, leaving out punctuation, whitespace and symbols.
    wordList = [token.text.lower() for token in doc
                if token.pos_ not in ("PUNCT", "SPACE", "SYM")]
    docFeature[file] = (
        getFeature1(wordList),
        getFeature2(wordList),
        getFeature3(doc),
    )

# Create train and test data: pair each document's feature tuple with its
# 'nonredundancy' score, visiting documents in sorted key order.
feature_X_Y = OrderedDict()
for k, v in sorted(train_content.items(), key=lambda item: item[0]):
    feature_X_Y[k] = (docFeature[k], v['nonredundancy'])
test_X_Y = OrderedDict()
for k, v in sorted(test_content.items(), key=lambda item: item[0]):
    test_X_Y[k] = (docFeature[k], v['nonredundancy'])

# Every list is built at top level, so train_x exists even when the test set
# is empty (it used to be initialised inside the test loop).
test_x = [[float(value) for value in features] for features, score in test_X_Y.values()]
test_y = [float(score) for features, score in test_X_Y.values()]
train_x = [[float(value) for value in features] for features, score in feature_X_Y.values()]
train_y = [float(score) for features, score in feature_X_Y.values()]

In [None]:
# Fit a support vector regressor with an RBF kernel on the training features.
regressor = SVR(kernel='rbf')
regressor.fit(train_x, train_y)

# Predict scores for the test features, then report the mean squared error
# and the result of pearsonr against the reference scores.
y_pred = regressor.predict(test_x)
svrMse = mean_squared_error(test_y, y_pred)
svrPearson = pearsonr(test_y, y_pred)
print("SVR MSE ", svrMse)
print("SVR Pearson ", svrPearson)