In [1]:
import re
import sys
import nltk
import string
from math import sqrt, log
from collections import defaultdict
from itertools import chain, product
from nltk import word_tokenize as tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer as stemmer
import numpy as np
from itertools import groupby
from collections import Counter
from nltk.collocations import *
from nltk.stem.porter import PorterStemmer
import pandas as pd

In [2]:
def compute_average_word_length(sentence):
    return np.mean([len(words) for words in sentence.split()])

def compute_average_sentence_length(sentence):
    sentence = re.split("(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", sentence)
    return np.mean([len(words) for words in sentence])

def freq_of_words_great_sent_len(sentence):
    result = []
    avg_word_len = compute_average_word_length(sentence)
    # sentence = re.split("(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", sentence)
    sentence = Counter(sentence.split())
    for key, value in sentence.items():
        if len(key) > avg_word_len:
            result.append(value)
#             print (key, value)
    return sum(result)

def tokenize(sentence):
    return re.split(r"[^0-9A-Za-z\-'_]+", sentence)

def compute_yules_k_for_text(sentence):
    tokens = tokenize(sentence)
    counter = Counter(token.upper() for token in tokens)

    #compute number of word forms in a given sentence/text
    m1 = sum(counter.values())
    m2 = sum([frequency ** 2 for frequency in counter.values()])

    #compute yules k measure and return the value
    yules_k = 10000/((m1 * m1) / (m2 - m1))
    return yules_k


def words_in_sentence(sentence):
    w = [words.strip("0123456789!:,.?()[]{}") for words in sentence.split()]
    return filter(lambda x: len(x) > 0, w)

def compute_yules_i_for_text(sentence):
    dictionary = {}
    stemmer = PorterStemmer()

    for word in words_in_sentence(sentence):
        word = stemmer.stem(word).lower()
        try:
            dictionary[word] += 1
        except:
            dictionary[word] = 1

    m1 = float(len(dictionary))
    m2 = sum([len(list(grouped_values)) * (frequency ** 2) for frequency, grouped_values in groupby(sorted(dictionary.values()))])

    # compute yules i and return the value
    try:
        yules_i = (m1 * m1) / (m2 - m1)
        return yules_i
    except ZeroDivisionError:
        return 0

def compute_collocation_score(sentence_one, sentence_two, option):
    if option == "bi":
        tokens_for_one = nltk.wordpunct_tokenize(sentence_one)
        tokens_for_two = nltk.wordpunct_tokenize(sentence_two)
        finder_one = BigramCollocationFinder.from_words(tokens_for_one)
        finder_two = BigramCollocationFinder.from_words(tokens_for_two)
        result_one = finder_one.score_ngrams(nltk.collocations.BigramAssocMeasures().raw_freq)
        result_one = [(tuple(map(str.lower, values)), scores) for values, scores in result_one]
        result_two = finder_two.score_ngrams(nltk.collocations.BigramAssocMeasures().raw_freq)
        result_two = [(tuple(map(str.lower, values)), scores) for values, scores in result_two]
        matches = [keys for keys in set(result_one).intersection(set(result_two))]
        return len(matches)
    elif option == "tri":
        tokens_for_one = nltk.wordpunct_tokenize(sentence_one)
        tokens_for_two = nltk.wordpunct_tokenize(sentence_two)
        finder_one = TrigramCollocationFinder.from_words(tokens_for_one)
        finder_two = TrigramCollocationFinder.from_words(tokens_for_two)
        result_one = finder_one.score_ngrams(nltk.collocations.TrigramAssocMeasures().raw_freq)
        result_one = [(tuple(map(str.lower, values)), scores) for values, scores in result_one]
        result_two = finder_two.score_ngrams(nltk.collocations.TrigramAssocMeasures().raw_freq)
        result_two = [(tuple(map(str.lower, values)), scores) for values, scores in result_two]
        matches = [keys for keys in set(result_one).intersection(set(result_two))]
        return len(matches)
    else:
        return 0

def vectorize(sentence, vocabulary):
    result = [sentence.split().count(i) for i in vocabulary]
    return result

def convert_words_to_vectors(sentence):
    vectorized_sentence = []
    vocabulary = sorted(set(chain(*[words.lower().split() for words in sentence])))
    for words in sentence:
        vectorized_sentence.append((words, vectorize(words, vocabulary)))
    return vectorized_sentence, vocabulary

def dot_product_of_vectors(vector_one, vector_two):
    result = np.dot(vector_one, vector_two) / (sqrt(np.dot(vector_one, vector_one)) * sqrt(np.dot(vector_two, vector_two)))
    return result

def cosine_sim(sentence_one, sentence_two):
    sentences = [sentence_one, sentence_two]
    corpus, vocabulary = convert_words_to_vectors(sentences)
    similarity = [dot_product_of_vectors(a[1], b[1]) for a, b in product(corpus, corpus)]
    return similarity[1]

In [3]:
raw_data = pd.read_csv("reviews_scores.csv")
# raw_data = raw_data.sample(frac=0.01).reset_index(drop=True)
raw_data = raw_data.sample(frac=1).reset_index(drop=True)

def remove_stopwords(text):
    stopwords = nltk.corpus.stopwords.words('english')
    content = [w for w in text if w.lower() not in stopwords]
    return content

In [4]:
raw_data["yules_i_measure_abs"] = np.nan
raw_data["yules_i_measure_abs"] = raw_data.apply(lambda x: compute_yules_i_for_text(x['abstract']), axis=1)
raw_data.head()

Unnamed: 0,abstract,review,similarity_score,yules_i_measure_abs
0,We point out important problems with the commo...,The authors propose a new measure to capture t...,0.614176,37.969388
1,Machine translation has recently achieved impr...,This paper describes an approach to train a ne...,0.581566,26.670034
2,We build on auto-encoding sequential Monte Car...,Update:\nOn further consideration (and reading...,0.505319,22.22449
3,Deep learning networks have achieved state-of-...,The paper aims at improving the accuracy of a ...,0.452746,33.58042
4,"Every second, innumerable text data, including...",* PAPER SUMMARY *\n\nThis paper proposes a sia...,0.424581,51.265823


In [5]:
raw_data["avg_word_len_abs"] = np.nan
raw_data["avg_word_len_abs"] = raw_data.apply(lambda x: compute_average_word_length(x['abstract']), axis=1)
raw_data.head()

Unnamed: 0,abstract,review,similarity_score,yules_i_measure_abs,avg_word_len_abs
0,We point out important problems with the commo...,The authors propose a new measure to capture t...,0.614176,37.969388,5.705882
1,Machine translation has recently achieved impr...,This paper describes an approach to train a ne...,0.581566,26.670034,5.513333
2,We build on auto-encoding sequential Monte Car...,Update:\nOn further consideration (and reading...,0.505319,22.22449,6.084906
3,Deep learning networks have achieved state-of-...,The paper aims at improving the accuracy of a ...,0.452746,33.58042,6.158228
4,"Every second, innumerable text data, including...",* PAPER SUMMARY *\n\nThis paper proposes a sia...,0.424581,51.265823,5.748031


In [6]:
raw_data["avg_sen_len_abs"] = np.nan
raw_data["avg_sen_len_abs"] = raw_data.apply(lambda x: compute_average_sentence_length(x['abstract']), axis=1)
raw_data.head()

Unnamed: 0,abstract,review,similarity_score,yules_i_measure_abs,avg_word_len_abs,avg_sen_len_abs
0,We point out important problems with the commo...,The authors propose a new measure to capture t...,0.614176,37.969388,5.705882,141.5
1,Machine translation has recently achieved impr...,This paper describes an approach to train a ne...,0.581566,26.670034,5.513333,161.833333
2,We build on auto-encoding sequential Monte Car...,Update:\nOn further consideration (and reading...,0.505319,22.22449,6.084906,186.75
3,Deep learning networks have achieved state-of-...,The paper aims at improving the accuracy of a ...,0.452746,33.58042,6.158228,160.571429
4,"Every second, innumerable text data, including...",* PAPER SUMMARY *\n\nThis paper proposes a sia...,0.424581,51.265823,5.748031,213.5


In [7]:
raw_data["freq_words_gt_sen_len_abs"] = np.nan
raw_data["freq_words_gt_sen_len_abs"] = raw_data.apply(lambda x: freq_of_words_great_sent_len(x['abstract']), axis=1)
raw_data.head()

Unnamed: 0,abstract,review,similarity_score,yules_i_measure_abs,avg_word_len_abs,avg_sen_len_abs,freq_words_gt_sen_len_abs
0,We point out important problems with the commo...,The authors propose a new measure to capture t...,0.614176,37.969388,5.705882,141.5,40
1,Machine translation has recently achieved impr...,This paper describes an approach to train a ne...,0.581566,26.670034,5.513333,161.833333,68
2,We build on auto-encoding sequential Monte Car...,Update:\nOn further consideration (and reading...,0.505319,22.22449,6.084906,186.75,45
3,Deep learning networks have achieved state-of-...,The paper aims at improving the accuracy of a ...,0.452746,33.58042,6.158228,160.571429,68
4,"Every second, innumerable text data, including...",* PAPER SUMMARY *\n\nThis paper proposes a sia...,0.424581,51.265823,5.748031,213.5,51


In [8]:
raw_data.to_csv("regression_data.csv", index=False)