In [1]:
import pandas as pd
import numpy as np
import re
from nltk.stem import PorterStemmer, SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from scipy.sparse import coo_array, csr_array, csc_array, csr_matrix, coo_matrix, csc_matrix
import scipy.sparse

from sklearn.preprocessing import normalize

pd.set_option('display.max_rows', 500)

In [2]:
def clean_junk(txt):
    # html
    txt = txt.str.replace(r'<[^<>]*>', '', regex=True)

    # paper structure
    txt = txt.str.replace(r'Background(?!\ )', '', regex=True)
    txt = txt.str.replace(r'Method(?!\ )', '', regex=True)
    txt = txt.str.replace(r'Title(?!\ )', '', regex=True)
    return txt 

def replace_abvs(abstract, abv_map):
        if not isinstance(abv_map, dict):
            return abstract

        for key in abv_map.keys():
            abstract = abstract.replace(key, abv_map[key])
        return abstract

def replace_species_abbreviations(txt):
    # find all abbreviations of the form ' P. suffix' => 'Prefix Suffix' (note leading whitespace)
    abvs = txt.str.findall(r'\s([A-Z]\.\ [a-z]\w*)')
    abvs = abvs.apply(lambda x: x if x != [] else np.NaN).dropna()

    # make new dataframe of abv and abstract
    abv_df = pd.concat([txt, abvs], axis=1).dropna()
    
    
    abv_df.columns.values[0] = "abstract"
    abv_df.columns.values[1] = "abv"
    
    
    abv_df = abv_df.explode('abv').drop_duplicates()

    # split by prefix and suffix
    split_abv = abv_df.abv.str.split('. ')
    
    abv_df['prefix'] = split_abv.apply(lambda x: x[0])
    abv_df['suffix'] = split_abv.apply(lambda x: x[-1])

    # match by suffix
    # drop all abbreviations without exactly one unique full prefix
    abv_df['matches'] = abv_df.apply(lambda x: set(re.findall(f'(\w+)\s+{x.suffix}', x.abstract)), axis=1)
    abv_df = abv_df[abv_df.apply(lambda x: len(x.matches) == 1, axis=1)]
    abv_df['matches'] = abv_df.matches.apply(lambda x: list(x)[0])

    # filter out any matches that don't have same starting letter as prefix
    abv_df = abv_df[abv_df.matches.str[0] == abv_df.prefix]

    # unabbreviate
    abv_df['unabbv'] = abv_df.matches + ' '+ abv_df.suffix
    abv_df['connected'] = abv_df.matches + '_' + abv_df.suffix
    abv_df = abv_df.drop(columns=['prefix', 'suffix', 'matches'])
    abv_df

    abstract_group = abv_df.groupby(abv_df.index)
    abv_mappings = abstract_group.apply(lambda x: x.set_index('abv').to_dict()['unabbv'])
    connect_mappings = abstract_group.apply(lambda x: x.set_index('unabbv').to_dict()['connected'])
    abstracts = abstract_group.apply(lambda x: x.abstract.iloc[0])
    abv_map = pd.concat([txt, abv_mappings, connect_mappings], axis=1, keys=['abstract', 'abv_map', 'con_map'])

    removed_abvs = abv_map.apply(lambda x: replace_abvs(x.abstract, x.abv_map), axis=1)
    abv_map['abstract'] = removed_abvs
    connected_abvs = abv_map.apply(lambda x: replace_abvs(x.abstract, x.con_map), axis=1)
    
    return connected_abvs

def txt_to_words(txt):
    return txt.str.split('[\W+|-]').explode()

def words_to_txt(words):
    return words.groupby(level=0).apply(' '.join)

def make_words_df(txt):
    txt = txt.dropna()
    txt = clean_junk(txt)
    txt = replace_species_abbreviations(txt)
    stemmer = PorterStemmer()
    words = txt_to_words(txt)
    unique_words = words.dropna().unique()
    df = pd.DataFrame(unique_words, columns=['plain'])
    df['stem'] = df.plain.apply(stemmer.stem)
    stem_map = df.set_index('plain').stem.to_dict()
    sentences = txt.str.split('\.').explode().dropna()
    words_df = sentences.str.split('\ +').reset_index().explode(column=0).reset_index()
    words_df = words_df.rename(columns={'level_0': 'sentence', 'index': 'doc', 0: 'words'})
    words_df['stems'] = words_df['words'].map(stem_map, 'ignore')
    words_df = words_df.dropna()
    words_df = words_df[words_df.stems.str.contains('^[a-zA-Z]+')]
    return words_df

def get_sentences(words_df):
    sentences = words_df.groupby('sentence').stems.apply(' '.join)
    return sentences

def get_docs(words_df):
    docs = words_df.groupby('doc').stems.apply(' '.join)
    return docs

def calc_term_entropy(tf_matrix):
    H = np.zeros(tf_matrix.shape[1])
    
    tf_matrix = coo_array(tf_matrix) # row col access
    tf_wc = tf_matrix.sum(axis=0) # TF(w, C)
    
    for d, w, tf in zip(tf_matrix.row, tf_matrix.col, tf_matrix.data):
        p_dw = tf / tf_wc[w]
        H[w] -= p_dw * np.log2(p_dw)
    return H

def get_stopwords(tf_matrix, vocabulary, random_rounds=10):
    entropy = calc_term_entropy(tf_docs)
    entropy = pd.Series(entropy, vocabulary, name='entropy', dtype='float64')
    
    null_entropy = np.zeros(vocabulary.shape[0])

    for i in range(0, random_rounds):    
        words_df['null'] = words_df.stems.sample(frac=1).to_numpy()
        null_docs = words_df.groupby('doc').null.apply(' '.join) 
        tf_null      = tf_vectorizer.transform(null_docs)
        null_entropy += calc_term_entropy(tf_null)

    null_entropy = null_entropy / random_rounds
    
    stopwords = pd.DataFrame(entropy, columns=['entropy'])
    stopwords['tf'] = words_df.stems.value_counts()
    stopwords = stopwords[['tf', 'entropy']]
    stopwords[f'null'] = null_entropy
    stopwords['infor'] = null_entropy - stopwords.entropy
    
def drop_stopwords(words_df, stopword_list):
    stopwords = set(stopwords_list)
    return words_df[~words_df.stems.isin(stopwords)]

In [3]:
citations = pd.read_csv('all_soybean_citations.csv')
data = citations.copy()
txt = data.abstract.dropna()

In [4]:
txt = clean_junk(txt)
txt = replace_species_abbreviations(txt)

In [5]:
txt = txt.str.replace('(\d+)\.(\d+)', '\\1_\\2', regex=True)
txt = txt.str.replace('([^\w| ]) *', ' \\1 ', regex=True)
txt

0       Heat stress driven by global warming has affec...
1       The combination of apomixis and hybrid product...
2       The zinc deficiency response in Arabidopsis_th...
3       E3 - ubiquitin ligases are known to confer abi...
4       Pod borer , Helicoverpa armigera , a polyphagu...
                              ...                        
5395    Transposable elements are the most abundant co...
5397    The Soybean Consensus Map 4_0 facilitated the ...
5398    Soybean somatic embryos have attracted attenti...
5399    The number and distribution of branches in soy...
5400    The generation of useful mutant alleles of spe...
Length: 5293, dtype: object

In [6]:
stemmer = PorterStemmer()
words = txt_to_words(txt)
unique_words = words.dropna().unique()
df = pd.DataFrame(unique_words, columns=['plain'])
df['stem'] = df.plain.apply(stemmer.stem)
stem_map = df.set_index('plain').stem.to_dict()

In [7]:
sentences = txt.str.split('\.').explode().dropna()
words_df = sentences.str.split('\ +').reset_index().explode(column=0).reset_index()
words_df = words_df.rename(columns={'level_0': 'sentence', 'index': 'doc', 0: 'words'})
words_df['stems'] = words_df['words'].map(stem_map, 'ignore')
words_df = words_df.dropna()
words_df = words_df[words_df.stems.str.contains('^[a-zA-Z]+')]

words_df

Unnamed: 0,sentence,doc,words,stems
0,0,0,Heat,heat
1,0,0,stress,stress
2,0,0,driven,driven
3,0,0,by,by
4,0,0,global,global
...,...,...,...,...
1441576,59293,5400,genetic,genet
1441577,59293,5400,diversity,divers
1441578,59293,5400,of,of
1441579,59293,5400,polyploid,polyploid


In [8]:
sentences = words_df.groupby('sentence').stems.apply(' '.join)
docs = words_df.groupby('doc').stems.apply(' '.join)

In [25]:
tf_vectorizer = CountVectorizer(max_df=1.,
                                min_df=3,
                                max_features=None,
                                ngram_range=(1, 1), 
                                stop_words=None
                                )

tf_vectorizer.fit(docs)

tf_sentences = tf_vectorizer.transform(sentences)
tf_docs      = tf_vectorizer.transform(docs)


vocabulary   = tf_vectorizer.get_feature_names_out()

In [26]:
# sentences_df = pd.DataFrame.sparse.from_spmatrix(tf_sentences, columns=vocabulary)
# docs_df = pd.DataFrame.sparse.from_spmatrix(tf_docs, columns=vocabulary)

tf_sentences = coo_array(tf_sentences)
tf_docs = coo_array(tf_docs)

$$w = \text{term}$$
$$d = \text{document}$$
$$C = \text{corpus}$$

$$k = \text{randomization rounds}$$

Shannon Word Entropy formula:
$$p(d|w) = \frac{TF(w, d)}{TF(w, C)}$$

$$H(w, C) = - \sum_{d}p(d|w)\log(p(d|w))$$
    

My formula (Modified Infor):
$$S(C) = \text{randomize}(C)$$

$$\overline{H}(w, C) = \sum_{k}\frac{H(w, S(C))}{k}$$

$$r(w, C) = \overline{H}(w, C) - H(w, C)$$

$$\hat{r}(w, C) = \text{normalize}(r(w,C))$$

$$I(w, C) = H(w, C)(1 - \hat{r}(w, C))$$

$$d(w, C) = \frac{rank(I(w, C)) - rank(TF(w, C))}{|\{w\}|}$$

$$SE(w, C) = I(w, C) + d(w) * log_2(|\{w\}|)$$

In [27]:
def calc_term_entropy(tf_matrix):
    H = np.zeros(tf_matrix.shape[1])
    tf_matrix = coo_array(tf_matrix) # row col access
    tf_wc = tf_matrix.sum(axis=0) # TF(w, C)
    for d, w, tf in zip(tf_matrix.row, tf_matrix.col, tf_matrix.data):
        p_dw = tf / tf_wc[w]
        H[w] -= p_dw * np.log2(p_dw)
    return H

In [28]:
def neighboring_tf(m):
    m = csr_matrix(m)
    bool_m = m > 0
    sums = (bool_m.transpose() * m)
    m = csr_matrix(m)
    bool_m = m > 0
    sums = (bool_m.transpose() * m)
    inv_totals = csr_matrix(1 / m.sum(axis=0))
    norms = sums.multiply(inv_totals)
    norms.data = (1 / norms.data)
    return norms.sum(axis=0)

In [29]:
entropy = calc_term_entropy(tf_docs)
entropy = pd.Series(entropy, vocabulary, name='entropy', dtype='float64')

In [41]:
null_entropy = np.zeros(vocabulary.shape[0])
random_rounds = 12

for i in range(0, random_rounds):    
    words_df['null'] = words_df.stems.sample(frac=1).to_numpy()
    null_docs = words_df.groupby('doc').null.apply(' '.join) 
    tf_null      = tf_vectorizer.transform(null_docs)
    null_entropy += calc_term_entropy(tf_null)
    
null_entropy = null_entropy / random_rounds

[2.5849625 0.        0.        ... 1.        2.        3.169925 ]
[2.5849625 0.        0.        ... 1.        2.        3.169925 ]
[2.5849625 0.        0.        ... 1.        2.        3.169925 ]
[2.5849625 0.        0.        ... 1.        2.        3.169925 ]
[2.5849625 0.        0.        ... 1.        2.        3.169925 ]
[2.5849625 0.        0.        ... 1.        2.        3.169925 ]
[2.5849625 0.        0.        ... 1.        2.        3.169925 ]
[2.5849625 0.        0.        ... 1.        2.        3.169925 ]
[2.5849625 0.        0.        ... 1.        2.        3.169925 ]
[2.5849625 0.        0.        ... 1.        2.        3.169925 ]
[2.5849625 0.        0.        ... 1.        2.        3.169925 ]
[2.5849625 0.        0.        ... 1.        2.        3.169925 ]


In [31]:
stopwords = pd.DataFrame(entropy, columns=['entropy'])
stopwords['tf'] = words_df.stems.value_counts()
stopwords = stopwords[['tf', 'entropy']]
stopwords[f'null'] = null_entropy
stopwords['infor'] = null_entropy - stopwords.entropy

In [32]:
m = np.array([[4, 0, 0],
              [1, 8, 0],
              [0, 1, 1],
              [0, 2, 3],
              [4, 0, 0]])

m = csr_matrix(m)
bool_m = m > 0
sums = (bool_m.transpose() * m)
totals = csr_matrix(1 / m.sum(axis=0))
norms = sums.multiply(totals)
corr = (norms.sum(axis=1) / bool_m.transpose().sum(axis=1)).A.flatten()

In [33]:
norms.A

array([[1.        , 0.72727273, 0.        ],
       [0.11111111, 1.        , 1.        ],
       [0.        , 0.27272727, 1.        ]])

In [34]:
vals = neighboring_tf(tf_sentences).A.flatten()
stopwords['corr'] = vals

In [35]:
stopwords

Unnamed: 0,tf,entropy,null,infor,corr
a0,6,1.792481,2.584963,0.792481,470.900000
a02,1,0.000000,0.000000,0.000000,24.000000
a03,1,0.000000,0.000000,0.000000,24.000000
a04,1,0.000000,0.000000,0.000000,24.000000
a05,1,0.000000,0.000000,0.000000,24.000000
...,...,...,...,...,...
zyd6,4,0.000000,2.000000,2.000000,165.000000
zyd7,2,0.000000,1.000000,1.000000,40.000000
zygomorph,2,1.000000,1.000000,0.000000,125.000000
zygomorphi,4,1.500000,2.000000,0.500000,372.666667


In [36]:
infor = (stopwords.null - stopwords.entropy)
infor_norm = 1 - (infor - infor.min()) / (infor.max() - infor.min())
stopwords['experimental'] = stopwords.entropy * infor_norm

In [37]:
n = 1000
i_thresh = 1

top_tf = stopwords.tf.sort_values(ascending=False)[:n]
top_entropy = stopwords.entropy.sort_values(ascending=False).astype('float16')[:n]
top_infor = stopwords.assign(intropy= .5 * stopwords.entropy // 1, abs_infor=abs(stopwords.infor)
                             ).sort_values(by=['intropy', 'infor'], ascending=(False,True)
                             ).infor.astype('float16')[:n]
top_infor_sorted = stopwords.infor.sort_values().astype('float16')[:n]
top_post_tf = stopwords[abs(stopwords.infor) > i_thresh].sort_values('tf', ascending=False)[:n].tf
top_corr = stopwords['corr'].sort_values(ascending=False).astype('float16')[:n]
top_null = stopwords.null.sort_values(ascending=False).astype('float16')[:n]
top_experimental = stopwords.experimental.sort_values(ascending=False).astype('float16')[:n * 4]


In [38]:
top_tf.to_csv('stopwords/tf.tsv', sep='\t')
top_entropy.to_csv('stopwords/entropy.tsv', sep='\t')
top_infor.to_csv('stopwords/infor.tsv', sep='\t')
top_infor_sorted.to_csv('stopwords/infor_sorted.tsv', sep='\t')
top_post_tf.to_csv('stopwords/post.tsv', sep='\t')
top_corr.to_csv('stopwords/corr.tsv', sep='\t')
top_null.to_csv('stopwords/null.tsv', sep='\t')
top_experimental.to_csv('stopwords/experimental.tsv', sep='\t')

In [39]:
top_corr = top_corr / top_corr.max()

In [40]:
infor = (stopwords.null - stopwords.entropy)
infor_norm = 1 - (infor - infor.min()) / (infor.max() - infor.min())
# experimental = (stopwords.entropy * 2 // 1) + infor_norm
# experimental = stopwords.null + stopwords.entropy

# experimental = stopwords[stopwords.entropy > (stopwords.entropy.max() * .75)].infor

experimental = 

SyntaxError: invalid syntax (3268598836.py, line 8)

In [None]:
ranks = pd.DataFrame()
ranks['tf'] = stopwords.tf
ranks['entropy_rank'] = stopwords.entropy.rank(method='first', ascending=False).astype('int')
ranks['experimental_rank'] = stopwords.experimental.rank(method='first', ascending=False).astype('int')
ranks['tf_rank'] = stopwords.tf.rank(method='first', ascending=False).astype('int')
ranks = ranks.sort_values('tf', ascending=False)

In [None]:
ranks.to_html('ranks.html')

In [None]:
ranks['downshift'] = (ranks.experimental_rank - ranks.entropy_rank)

In [None]:
ranks[ranks.entropy_rank < ranks.shape[0]].sort_values('downshift', ascending=False)

In [None]:
downshift

In [None]:
stopwords.experimental

In [None]:
ranks[ranks.tf_rank < ranks.shape[0] * .2].sort_values('downshift', ascending=True)[:100]

In [None]:
ds = ranks.downshift

ds_norm = 2 * (ds - ds.min()) / (ds.max() - ds.min()) - 1

In [None]:
experimental - ds_norm