In [26]:
# This Source Code Form is subject to the terms of the MPL
# License. If a copy of the same was not distributed with this
# file, You can obtain one at
# https://github.com/AkhilHector/pubundsci/blob/master/LICENSE.

import re
import sys
import nltk
import string
from math import sqrt, log
from collections import defaultdict
from itertools import chain, product
from nltk import word_tokenize as tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer as stemmer
import numpy as np
from itertools import groupby
from collections import Counter
from nltk.collocations import *
from nltk.stem.porter import PorterStemmer
import pandas as pd

"""
m1 - The number of all word forms a text consists
m2 - The sum of the products of each observed frequency to the power of two
     and the number of word types observed with that frequency
"""

def compute_average_word_length(sentence):
    return np.mean([len(words) for words in sentence.split()])

def compute_average_sentence_length(sentence):
    sentence = re.split("(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", sentence)
    return np.mean([len(words) for words in sentence])

def freq_of_words_great_sent_len(sentence):
    result = []
    avg_word_len = compute_average_word_length(sentence)
    # sentence = re.split("(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", sentence)
    sentence = Counter(sentence.split())
    for key, value in sentence.items():
        if len(key) > avg_word_len:
            result.append(value)
#             print (key, value)
    return sum(result)

def tokenize(sentence):
    return re.split(r"[^0-9A-Za-z\-'_]+", sentence)

def compute_yules_k_for_text(sentence):
    tokens = tokenize(sentence)
    counter = Counter(token.upper() for token in tokens)

    #compute number of word forms in a given sentence/text
    m1 = sum(counter.values())
    m2 = sum([frequency ** 2 for frequency in counter.values()])

    #compute yules k measure and return the value
    yules_k = 10000/((m1 * m1) / (m2 - m1))
    return yules_k


def words_in_sentence(sentence):
    w = [words.strip("0123456789!:,.?()[]{}") for words in sentence.split()]
    return filter(lambda x: len(x) > 0, w)

def compute_yules_i_for_text(sentence):
    dictionary = {}
    stemmer = PorterStemmer()

    for word in words_in_sentence(sentence):
        word = stemmer.stem(word).lower()
        try:
            dictionary[word] += 1
        except:
            dictionary[word] = 1

    m1 = float(len(dictionary))
    m2 = sum([len(list(grouped_values)) * (frequency ** 2) for frequency, grouped_values in groupby(sorted(dictionary.values()))])

    # compute yules i and return the value
    try:
        yules_i = (m1 * m1) / (m2 - m1)
        return yules_i
    except ZeroDivisionError:
        return 0

def compute_collocation_score(sentence_one, sentence_two, option):
    if option == "bi":
        tokens_for_one = nltk.wordpunct_tokenize(sentence_one)
        tokens_for_two = nltk.wordpunct_tokenize(sentence_two)
        finder_one = BigramCollocationFinder.from_words(tokens_for_one)
        finder_two = BigramCollocationFinder.from_words(tokens_for_two)
        result_one = finder_one.score_ngrams(nltk.collocations.BigramAssocMeasures().raw_freq)
        result_one = [(tuple(map(str.lower, values)), scores) for values, scores in result_one]
        result_two = finder_two.score_ngrams(nltk.collocations.BigramAssocMeasures().raw_freq)
        result_two = [(tuple(map(str.lower, values)), scores) for values, scores in result_two]
        matches = [keys for keys in set(result_one).intersection(set(result_two))]
        return len(matches)
    elif option == "tri":
        tokens_for_one = nltk.wordpunct_tokenize(sentence_one)
        tokens_for_two = nltk.wordpunct_tokenize(sentence_two)
        finder_one = TrigramCollocationFinder.from_words(tokens_for_one)
        finder_two = TrigramCollocationFinder.from_words(tokens_for_two)
        result_one = finder_one.score_ngrams(nltk.collocations.TrigramAssocMeasures().raw_freq)
        result_one = [(tuple(map(str.lower, values)), scores) for values, scores in result_one]
        result_two = finder_two.score_ngrams(nltk.collocations.TrigramAssocMeasures().raw_freq)
        result_two = [(tuple(map(str.lower, values)), scores) for values, scores in result_two]
        matches = [keys for keys in set(result_one).intersection(set(result_two))]
        return len(matches)
    else:
        return 0

def vectorize(sentence, vocabulary):
    result = [sentence.split().count(i) for i in vocabulary]
    return result

def convert_words_to_vectors(sentence):
    vectorized_sentence = []
    vocabulary = sorted(set(chain(*[words.lower().split() for words in sentence])))
    for words in sentence:
        vectorized_sentence.append((words, vectorize(words, vocabulary)))
    return vectorized_sentence, vocabulary

def dot_product_of_vectors(vector_one, vector_two):
    result = np.dot(vector_one, vector_two) / (sqrt(np.dot(vector_one, vector_one)) * sqrt(np.dot(vector_two, vector_two)))
    return result

def cosine_sim(sentence_one, sentence_two):
    sentences = [sentence_one, sentence_two]
    corpus, vocabulary = convert_words_to_vectors(sentences)
    similarity = [dot_product_of_vectors(a[1], b[1]) for a, b in product(corpus, corpus)]
    return similarity[1]

# if __name__ == "__main__":
s = "The smoothing span is given by f. A larger value for f will result in a smoother curve. The number of robustifying iterations is given by iter. The function will run faster with a smaller number of iterations."
a = "Although population-level genomic sequence data have been gathered extensively for humans similar data from our closest living relatives are just beginning to emerge. Examination of genomic variation within great apes offers many opportunities to increase our understanding of the forces that have differentially shaped the evolutionary history of hominid taxa. Here we expand upon the work of the Great Ape Genome Project by analyzing medium to high coverage whole-genome sequences from 14 western lowland gorillas (Gorilla gorilla gorilla) 2 eastern lowland gorillas (G. beringei graueri) and a single Cross River individual (G. gorilla diehli). We infer that the ancestors of western and eastern lowland gorillas diverged from a common ancestor approximately 261 ka and that the ancestors of the Cross River population diverged from the western lowland gorilla lineage approximately 68 ka. Using a diffusion approximation approach to model the genome-wide site frequency spectrum we infer a history of western lowland gorillas that includes an ancestral population expansion of 1.4-fold around 970 ka and a recent 5.6-fold contraction in population size 23 ka. The latter may correspond to a major reduction in African equatorial forests around the Last Glacial Maximum. We also analyze patterns of variation among western lowland gorillas to identify several genomic regions with strong signatures of recent selective sweeps. We find that processes related to taste pancreatic and saliva secretion sodium ion transmembrane transport and cardiac muscle function are overrepresented in genomic regions predicted to have experienced recent positive selection."
b = "Please note: this list is intended as a resource for those of you who are interested in responding to other CEHG members&rsquo; works for the blog. Let the blog editor (kkanagaw@stanford.edu) know if there are other publications that should be added to the"
print (compute_yules_i_for_text(s))
print (compute_collocation_score(a, b, "bi"))
print (compute_average_word_length(s))
print (compute_average_sentence_length(a))
print (freq_of_words_great_sent_len(a))
# else:
#     sys.exit(0)

13.88888888888889
0
4.52631578947
165.1
130


In [32]:
raw_data = pd.read_csv("combined_dataset_alpha.csv")
raw_data = raw_data.sample(frac=0.01).reset_index(drop=True)
raw_data = raw_data.sample(frac=1).reset_index(drop=True)

def remove_stopwords(text):
    stopwords = nltk.corpus.stopwords.words('english')
    content = [w for w in text if w.lower() not in stopwords]
    return content

In [34]:
s = "my name is akhil and i am trying to compute something wonderful"
remove_stopwords(s)

[' ',
 'n',
 'e',
 ' ',
 ' ',
 'k',
 'h',
 'l',
 ' ',
 'n',
 ' ',
 ' ',
 ' ',
 'r',
 'n',
 'g',
 ' ',
 ' ',
 'c',
 'p',
 'u',
 'e',
 ' ',
 'e',
 'h',
 'n',
 'g',
 ' ',
 'w',
 'n',
 'e',
 'r',
 'f',
 'u',
 'l']

# Yules I Measure on Abstract

In [10]:
raw_data["yules_i_measure_abs"] = raw_data["altmetric_id"]
for each in range(0, len(raw_data["altmetric_id"])):
        raw_data["yules_i_measure_abs"][each] = compute_yules_i_for_text(raw_data["abstract"][each])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


# Yules I Measure on Blog

In [29]:
raw_data["yules_i_measure_blg"] = raw_data["altmetric_id"]
for each in range(0, len(raw_data["altmetric_id"])):
        raw_data["yules_i_measure_blg"][each] = compute_yules_i_for_text(raw_data["blog_post"][each])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


# Average word length on Abstract

In [12]:
raw_data["avg_word_len_abs"] = raw_data["altmetric_id"]
for each in range(0, len(raw_data["altmetric_id"])):
        raw_data["avg_word_len_abs"][each] = compute_average_word_length(raw_data["abstract"][each])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


# Average sentence length on Abstract

In [13]:
raw_data["avg_sen_len_abs"] = raw_data["altmetric_id"]
for each in range(0, len(raw_data["altmetric_id"])):
        raw_data["avg_sen_len_abs"][each] = compute_average_sentence_length(raw_data["abstract"][each])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


# Frequency of words greater than avg word length

In [15]:
raw_data["freq_words_gt_sen_len_abs"] = raw_data["altmetric_id"]
for each in range(0, len(raw_data["altmetric_id"])):
        raw_data["freq_words_gt_sen_len_abs"][each] = freq_of_words_great_sent_len(raw_data["abstract"][each])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


# Collocation Similarity between Abstract and Blog Post Summary Bigram

In [17]:
raw_data["collocation_sim_bi"] = raw_data["altmetric_id"]
for each in range(0, len(raw_data["altmetric_id"])):
        raw_data["collocation_sim_bi"][each] = compute_collocation_score(raw_data["abstract"][each], raw_data["blog_post"][each], "bi")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


# Collocation Similarity between Abstract and Blog Post Summary Trigram

In [18]:
raw_data["collocation_sim_tri"] = raw_data["altmetric_id"]
for each in range(0, len(raw_data["altmetric_id"])):
        raw_data["collocation_sim_tri"][each] = compute_collocation_score(raw_data["abstract"][each], raw_data["blog_post"][each], "tri")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [27]:
raw_data["cosine_sem_sim"] = raw_data["altmetric_id"]
for each in range(0, len(raw_data["altmetric_id"])):
        raw_data["cosine_sem_sim"][each] = cosine_sim(raw_data["abstract"][each], raw_data["blog_post"][each])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [30]:
raw_data.to_csv("regression_sample_beta.csv", sep = ',', encoding="utf-8")