In [1]:
import os
import pandas as pd
pd.set_option('display.max_rows', 500)
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.cluster as cl


from time import time
import numpy as np


import warnings
warnings.filterwarnings('ignore')

from scipy.sparse import coo_array, csr_array, csc_array, csr_matrix, coo_matrix, csc_matrix
import scipy.sparse

In [2]:
def clean_junk(txt):
    """Strip markup and section-heading residue from a Series of abstracts.

    The replacements run in a fixed order: first anything shaped like an
    HTML/XML tag, then the words 'Background', 'Method' and 'Title' wherever
    they are NOT immediately followed by a space (i.e. when they are glued to
    the following text or punctuation).

    Parameters
    ----------
    txt : pandas.Series of str

    Returns
    -------
    pandas.Series
        A new Series with the same index and the junk removed.
    """
    junk_patterns = (
        r'<[^<>]*>',           # html
        r'Background(?!\ )',   # paper structure
        r'Method(?!\ )',
        r'Title(?!\ )',
    )

    cleaned = txt
    for pattern in junk_patterns:
        cleaned = cleaned.str.replace(pattern, '', regex=True)
    return cleaned

def replace_abvs(abstract, abv_map):
    """Apply every ``old -> new`` substitution in ``abv_map`` to ``abstract``.

    Substitutions are plain (non-regex) ``str.replace`` calls applied in the
    dict's iteration order. If ``abv_map`` is not a dict (for example a NaN
    placeholder for "no mapping"), ``abstract`` is returned untouched.
    """
    if isinstance(abv_map, dict):
        for abbreviation, replacement in abv_map.items():
            abstract = abstract.replace(abbreviation, replacement)
    return abstract

def replace_species_abbreviations(txt):
    """Expand abbreviated species names and join genus and epithet with '_'.

    Each abstract is processed on its own:

    1. Find abbreviations of the form ' G. max' (one capital letter, a dot, a
       space and a lower-case word; note the required leading whitespace).
    2. For each abbreviation, collect the distinct words that directly precede
       the epithet as a whole word elsewhere in the same abstract
       (e.g. 'Glycine max'). The abbreviation is resolved only when there is
       exactly one such word and it starts with the abbreviated letter.
    3. Replace every resolved 'G. max' with 'Glycine max', then every
       'Glycine max' with 'Glycine_max' so the name survives tokenisation as
       a single token.

    Abbreviations that cannot be resolved are left as they are.

    Parameters
    ----------
    txt : pandas.Series
        Abstract texts. Non-string entries (e.g. NaN) are passed through.

    Returns
    -------
    pandas.Series
        Same index as ``txt``, with ``name`` set to ``None``.
    """
    abbreviation_re = re.compile(r'\s([A-Z]\.\ [a-z]\w*)')

    def _expand(abstract):
        # Rewrite one abstract; all lookups use the text as it was on entry.
        if not isinstance(abstract, str):
            return abstract

        full_names = {}    # 'G. max'      -> 'Glycine max'
        joined_names = {}  # 'Glycine max' -> 'Glycine_max'

        # dict.fromkeys de-duplicates while keeping first-occurrence order.
        for abv in dict.fromkeys(abbreviation_re.findall(abstract)):
            # The pattern guarantees the layout '<letter>. <epithet>'.
            prefix, suffix = abv[0], abv[3:]

            # \b keeps 'max' from matching inside 'maximum'.
            candidates = set(
                re.findall(rf'(\w+)\s+{re.escape(suffix)}\b', abstract)
            )
            if len(candidates) != 1:
                continue
            (genus,) = candidates
            if genus[0] != prefix:
                continue

            full_name = f'{genus} {suffix}'
            full_names[abv] = full_name
            joined_names[full_name] = f'{genus}_{suffix}'

        # Expand every abbreviation first, then connect the full names.
        for mapping in (full_names, joined_names):
            for old, new in mapping.items():
                abstract = abstract.replace(old, new)
        return abstract

    result = txt.map(_expand)
    result.name = None
    return result

def txt_to_words(txt):
    """Split each text on single non-word characters and explode to one word per row.

    The pattern is a character class, so every individual separator splits:
    two adjacent separators (e.g. ', ') yield an empty-string token between
    them. The returned Series repeats the original index label for every
    token that came from the same text.
    """
    # Raw string: '\W' inside a plain literal is an invalid escape sequence
    # (a SyntaxWarning on recent Python versions). The pattern itself is unchanged.
    return txt.str.split(r'[\W+|-]').explode()

def words_to_txt(words):
    """Join exploded words back into one space-separated string per index label.

    Inverse-style helper for ``txt_to_words``: rows sharing the same value in
    the first index level are concatenated with single spaces.
    """
    words_by_label = words.groupby(level=0)
    return words_by_label.apply(lambda group: ' '.join(group))

def make_words_df(txt):
    """Turn a Series of abstracts into a long table with one row per kept word.

    Pipeline: drop missing texts, remove junk (``clean_junk``), expand and
    connect species names (``replace_species_abbreviations``), split into
    sentences on '.', split sentences into words on runs of spaces, and attach
    a Porter stem to every word.

    Parameters
    ----------
    txt : pandas.Series of str
        One abstract per row; the index label identifies the document.

    Returns
    -------
    pandas.DataFrame
        Columns ``sentence`` (running sentence number across all documents),
        ``doc`` (index label of the source abstract), ``words`` (token as it
        appears in the text) and ``stems``. Only rows whose stem starts with
        an ASCII letter are kept.

    Notes
    -----
    Stems are looked up from the tokens produced by ``txt_to_words`` (split on
    non-word characters), while ``words`` come from splitting on spaces. A
    space-delimited token that still carries punctuation (e.g. 'gene,') has no
    entry in the lookup and is therefore dropped.
    """
    txt = txt.dropna()
    txt = clean_junk(txt)
    txt = replace_species_abbreviations(txt)

    # Stem each distinct token once and reuse the result through a lookup table.
    stemmer = PorterStemmer()
    unique_words = txt_to_words(txt).dropna().unique()
    stem_map = {word: stemmer.stem(word) for word in unique_words}

    # One row per sentence. Naming the values and the index explicitly avoids
    # depending on the implicit 'index' / 'level_0' / 0 column names.
    sentences = (
        txt.str.split(r'\.')
        .explode()
        .dropna()
        .rename('words')
        .rename_axis('doc')
    )

    words_df = (
        sentences.str.split(r'\ +')
        .reset_index()             # fresh 0..n-1 index doubles as the sentence number
        .explode('words')
        .rename_axis('sentence')
        .reset_index()
    )

    words_df['stems'] = words_df['words'].map(stem_map, 'ignore')
    words_df = words_df.dropna()
    words_df = words_df[words_df.stems.str.contains(r'^[a-zA-Z]+')]
    return words_df

def get_sentences(words_df):
    """Return one space-joined string of stems per value of the ``sentence`` column."""
    stems_by_sentence = words_df['stems'].groupby(words_df['sentence'])
    return stems_by_sentence.apply(' '.join)

def get_docs(words_df):
    """Return one space-joined string of stems per value of the ``doc`` column."""
    stems_by_doc = words_df['stems'].groupby(words_df['doc'])
    return stems_by_doc.apply(' '.join)

def calc_term_entropy(tf_matrix):
    """Entropy (in bits) of each term's distribution over documents.

    For a term ``w`` with count ``TF(d, w)`` in document ``d``::

        p(d | w) = TF(d, w) / sum_d TF(d, w)
        H(w)     = -sum_d p(d | w) * log2 p(d | w)

    Parameters
    ----------
    tf_matrix : sparse matrix/array or 2-D array-like, shape (n_docs, n_terms)
        Term counts; anything ``scipy.sparse.coo_array`` accepts.

    Returns
    -------
    numpy.ndarray, shape (n_terms,)
        ``H(w)`` per column. Terms that never occur get 0.
    """
    counts = coo_array(tf_matrix)  # COO exposes parallel row / col / data arrays
    n_terms = counts.shape[1]

    # TF(w, C): total count of each term over the whole collection.
    term_totals = np.asarray(counts.sum(axis=0)).ravel()

    # Explicitly stored zeros contribute nothing (0 * log 0 is taken as 0);
    # without the mask they would turn the whole column into NaN.
    stored = counts.data != 0
    term_idx = counts.col[stored]
    p_dw = counts.data[stored] / term_totals[term_idx]

    # Sum -p * log2(p) per term in one vectorized pass instead of a Python loop.
    return np.bincount(term_idx, weights=-(p_dw * np.log2(p_dw)), minlength=n_terms)

def get_stopwords(tf_matrix, vocabulary, random_rounds=10,
                  tokens_df=None, vectorizer=None, random_state=None):
    """Compare each term's document entropy with a shuffled-corpus baseline.

    The observed entropy comes from ``tf_matrix``. The baseline ("null")
    entropy is the mean over ``random_rounds`` rounds in which the stems are
    shuffled across all token positions (document lengths stay fixed), the
    shuffled documents are re-counted with ``vectorizer`` and the entropy is
    recomputed.

    Parameters
    ----------
    tf_matrix : sparse matrix, shape (n_docs, n_terms)
        Observed document-term counts.
    vocabulary : sequence of str, length n_terms
        Term for each column of ``tf_matrix``.
    random_rounds : int, default 10
        Number of shuffles averaged into the baseline.
    tokens_df : pandas.DataFrame, optional
        Long token table with ``doc`` and ``stems`` columns. Defaults to the
        notebook-level ``words_df``.
    vectorizer : fitted vectorizer, optional
        Object whose ``transform`` maps documents to counts over
        ``vocabulary``. Defaults to the notebook-level ``tf_vectorizer``.
    random_state : int or None, default None
        Seed for the shuffles. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        Indexed by term with columns ``tf`` (occurrences of the stem in
        ``tokens_df``), ``entropy`` (observed), ``null`` (baseline mean) and
        ``infor`` (``null - entropy``).
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if tokens_df is None:
        tokens_df = words_df
    if vectorizer is None:
        vectorizer = tf_vectorizer

    # Use the matrix that was passed in, not whatever `tf_docs` is in the kernel.
    entropy = calc_term_entropy(tf_matrix)
    entropy = pd.Series(entropy, vocabulary, name='entropy', dtype='float64')

    # One generator shared by all rounds, so a seed gives different but
    # reproducible permutations per round.
    rng = None if random_state is None else np.random.default_rng(random_state)
    doc_keys = tokens_df['doc'].to_numpy()

    null_entropy = np.zeros(len(vocabulary))
    for _ in range(random_rounds):
        # Shuffle into a local Series; the caller's token table is not modified.
        shuffled = tokens_df['stems'].sample(frac=1, random_state=rng).to_numpy()
        null_docs = pd.Series(shuffled).groupby(doc_keys).apply(' '.join)
        null_entropy += calc_term_entropy(vectorizer.transform(null_docs))

    null_entropy = null_entropy / random_rounds

    stats = pd.DataFrame(entropy, columns=['entropy'])
    stats['tf'] = tokens_df['stems'].value_counts()
    stats = stats[['tf', 'entropy']]
    stats['null'] = null_entropy
    stats['infor'] = null_entropy - stats.entropy
    return stats
    
def drop_stopwords(words_df, stopword_list):
    """Return the rows of ``words_df`` whose ``stems`` value is not a stopword.

    ``stopword_list`` may be any iterable of stems; it is converted to a set
    for the membership test. The original index labels are kept.
    """
    banned = set(stopword_list)
    is_stopword = words_df['stems'].isin(banned)
    return words_df.loc[~is_stopword]

In [3]:
# Read the citation table and keep only rows that have both an abstract and a title.
citations = pd.read_csv('all_soybean_citations.csv')
citations = citations.dropna(subset=['abstract', 'title'])

# Work on a copy; `txt` is the Series of abstract texts fed to the pipeline.
data = citations.copy()
txt = data['abstract'].dropna()

In [4]:
# Tokenise every abstract into a long table with one row per word and its stem.
words_df = make_words_df(txt)

In [5]:
# Re-assemble the stems into one string per document and one per sentence.
docs = get_docs(words_df)
sentences = get_sentences(words_df)
docs

doc
0       heat stress driven by global warm ha affect fo...
1       the combin of apomixi and hybrid product is ha...
2       the zinc defici respons in arabidopsis_thalian...
3       ligas are known to confer abiot stress respons...
4       pod helicoverpa a polyphagu herbivor caus exte...
                              ...                        
5395    transpos element are the most abund compon of ...
5397    the soybean consensu map facilit the anchor of...
5398    soybean somat embryo have attract attent both ...
5399    the number and distribut of branch in soybean ...
5400    the gener of use mutant allel of specif gene w...
Name: stems, Length: 5289, dtype: object

In [6]:
# Unigram term counts. The vocabulary is learned from whole documents and then
# reused to count the same terms per document and per sentence.
tf_vectorizer = CountVectorizer(
    max_df=1.,
    min_df=3,
    max_features=None,
    ngram_range=(1, 1),
    stop_words=None,
)
tf_vectorizer.fit(docs)

vocabulary = tf_vectorizer.get_feature_names_out()

tf_docs = tf_vectorizer.transform(docs)
tf_sentences = tf_vectorizer.transform(sentences)

In [7]:
# Per-term statistics table (tf, entropy, null, infor) for the document-term matrix.
# NOTE(review): this rebinds `stopwords`, shadowing the `nltk.corpus.stopwords`
# import from the first cell for the rest of the notebook.
stopwords = get_stopwords(tf_docs, vocabulary)

In [8]:
# Inspect the 200 terms with the highest entropy, largest first.
stopwords.entropy.sort_values(ascending=False)[:200]

of               12.200256
and              12.198789
the              12.177180
in               12.145572
to               12.065804
that             11.758386
for              11.702673
with             11.699265
is               11.695638
gene             11.569620
thi              11.568888
by               11.440028
we               11.370369
were             11.346004
plant            11.333840
soybean          11.311824
as               11.278272
are              11.252206
on               11.250976
from             11.240391
these            11.207005
use              11.203264
wa               11.186198
identifi         11.153365
an               11.142512
be               11.125165
studi            11.068710
which            11.031241
express          10.975331
result           10.941949
have             10.926000
analysi          10.906385
import           10.844907
function         10.785224
develop          10.784750
provid           10.746665
genom            10.734361
s

In [9]:
# NOTE(review): within the visible cells, `i_thresh` and `infor` are only
# referenced by the commented-out selection below.
i_thresh = .4

infor = stopwords.infor


# Disabled alternative: select terms whose |infor| falls below `i_thresh`.
# stopwords_set = set(infor[abs(infor) < i_thresh].index)
# len(stopwords_set)

# Active selection: the 200 highest-entropy terms. Despite the name this is a
# Series (term -> entropy), not a set; later cells use its `.index`.
stopwords_set = stopwords.entropy.sort_values(ascending=False)[:200]

In [10]:
# Plain list of the selected stopword terms, shown alphabetically for review.
stops = list(stopwords_set.index)
sorted(stops)

['abiot',
 'acid',
 'activ',
 'affect',
 'all',
 'also',
 'among',
 'an',
 'analys',
 'analysi',
 'analyz',
 'and',
 'approach',
 'arabidopsi',
 'are',
 'as',
 'associ',
 'at',
 'avail',
 'base',
 'be',
 'been',
 'between',
 'biolog',
 'both',
 'breed',
 'but',
 'by',
 'can',
 'candid',
 'chang',
 'character',
 'chromosom',
 'compar',
 'condit',
 'conserv',
 'contain',
 'contribut',
 'control',
 'could',
 'crop',
 'cultivar',
 'data',
 'demonstr',
 'detect',
 'develop',
 'differ',
 'differenti',
 'divers',
 'dure',
 'effect',
 'encod',
 'enhanc',
 'evolut',
 'express',
 'factor',
 'famili',
 'for',
 'found',
 'four',
 'from',
 'function',
 'further',
 'gene',
 'gener',
 'genet',
 'genom',
 'genotyp',
 'group',
 'growth',
 'ha',
 'have',
 'high',
 'higher',
 'highli',
 'identif',
 'identifi',
 'import',
 'improv',
 'in',
 'includ',
 'increas',
 'indic',
 'induc',
 'inform',
 'interact',
 'into',
 'investig',
 'involv',
 'is',
 'it',
 'key',
 'known',
 'larg',
 'legum',
 'level',
 'line'

In [11]:
# Spot check: is the term 'drought' among the selected stopwords?
'drought' in stops

False

In [12]:
# Rebuild the per-document strings after removing the selected stopword stems.
filtered_docs = get_docs(drop_stopwords(words_df, stopwords_set.index))
filtered_docs

doc
0       heat driven global warm forest surviv a larger...
1       combin apomixi hybrid hail holi grail agricult...
2       zinc defici arabidopsis_thaliana basic there e...
3       ligas confer a isol possess ligas aba treatmen...
4       pod helicoverpa a polyphagu herbivor caus exte...
                              ...                        
5395    transpos element abund compon eukaryot documen...
5397    consensu facilit anchor whole joint depart den...
5398    somat embryo attract attent a zygot embryo exp...
5399    distribut branch influenc effici light util lo...
5400    mutant allel would acceler convent program com...
Name: stems, Length: 5289, dtype: object

In [43]:
def drop_stopwords_by_entropy(tf_matrix, n_stopwords):
    """Placeholder: this function was declared without a body.

    A ``def`` with no body is a syntax error and stops the notebook from
    running top to bottom. The intended behaviour is not recoverable from the
    notebook, so the function fails loudly until it is implemented.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError('drop_stopwords_by_entropy has not been implemented')

In [44]:
# Cluster the documents into 3 groups with a fixed seed.
# NOTE(review): `tf_filtered` is not defined in any visible cell, so this fails
# on a fresh kernel. Presumably it is `tf_vectorizer.transform(filtered_docs)`
# -- confirm and add the defining cell above.
kmeans = cl.KMeans(n_clusters=3, random_state=0).fit(tf_filtered)

In [45]:
# Number of documents assigned to each cluster.
pd.Series(kmeans.labels_).value_counts()

1    5008
0     280
2       1
dtype: int64

In [46]:
# Pair each document string with its cluster label by position.
# NOTE(review): `docs` is the unfiltered text while the labels come from the
# model fit on `tf_filtered`; this assumes both share the same row order -- confirm.
pd.DataFrame(data={'abstract': docs.reset_index(drop=True), 'label': kmeans.labels_})

Unnamed: 0,abstract,label
0,heat stress driven by global warm ha affect fo...,1
1,the combin of apomixi and hybrid product is ha...,1
2,the zinc defici respons in arabidopsis_thalian...,1
3,ligas are known to confer abiot stress respons...,1
4,pod helicoverpa a polyphagu herbivor caus exte...,1
...,...,...
5284,transpos element are the most abund compon of ...,1
5285,the soybean consensu map facilit the anchor of...,1
5286,soybean somat embryo have attract attent both ...,1
5287,the number and distribut of branch in soybean ...,1


In [53]:
# Per-term entropy of the filtered matrix (`tf_filtered` is not defined in the visible cells).
results = calc_term_entropy(tf_filtered)

In [58]:
# Label each entropy value with its term; assumes the columns of `tf_filtered`
# follow `vocabulary` -- TODO confirm.
pd.Series(results, vocabulary)

a1          2.584963
a17         2.521641
a2          4.297079
a5          1.405639
aa          3.027169
              ...   
zone        4.729438
zp          2.000000
zucc        5.129283
zyd00006    2.807355
zygot       2.500000
Length: 5996, dtype: float64