In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.externals import joblib

from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords

import matplotlib.pyplot as plt
%matplotlib inline


In [2]:
# Base name (no extension) of the pickle read from data/EPMC/; the same name is
# reused as the prefix of the derived output files written by this notebook.
file_name = 'eye_in_new_keywords'

In [3]:
def _load_wordnet_lemmatizer():
    """Return a WordNetLemmatizer, downloading the WordNet corpus if it is missing."""
    lemma = WordNetLemmatizer()
    try:
        # Constructing the lemmatizer does not read the corpus; a missing corpus
        # only surfaces on first use, so force that lookup here.
        lemma.lemmatize('test')
    except LookupError:
        import nltk
        nltk.download('wordnet')
        lemma = WordNetLemmatizer()
    return lemma


def _load_english_stopwords():
    """Return NLTK's English stop-word list, downloading it if it is missing."""
    try:
        return stopwords.words('english')
    except LookupError:
        import nltk
        nltk.download('stopwords')
        return stopwords.words('english')


def lemmatize_abstracts(abstracts, lemmatizer=None, stop_words=None):
    """Clean, stop-word filter and lemmatize a Series of abstract texts.

    Steps, in order:
      1. drop missing abstracts and those that split into 10 or fewer pieces
         on runs of non-letters;
      2. lower-case, replace every run of non-letters with one space, strip;
      3. remove stop words, lemmatize the remaining tokens, discard lemmas of
         length 1 and re-join with single spaces.

    Parameters
    ----------
    abstracts : pandas.Series of str
        Raw abstract texts; the index is preserved for the rows that are kept.
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        Defaults to NLTK's ``WordNetLemmatizer`` (the corpus is downloaded on
        demand).
    stop_words : iterable of str, optional
        Tokens to drop before lemmatizing. Defaults to NLTK's English list
        (downloaded on demand).

    Returns
    -------
    pandas.Series of str
        One space-separated string of lemmas per retained abstract.

    Notes
    -----
    ``lemmatize`` is called without a part-of-speech tag, so every token is
    handled with the lemmatizer's default POS. NOTE(review): the sample output
    in this notebook starts with "ass", presumably a mangled "assess" --
    consider POS-aware lemmatization if that matters downstream.
    """
    # remove missing/very short abstracts
    print('Removing missing abstracts')
    abstracts = abstracts.dropna()
    abstracts = abstracts[abstracts.str.split(r'[^a-zA-Z]+').str.len()>10]

    # convert to lower case and remove digits and punctuation
    print('Initial preprocessing: case, punctuation, whitespace')
    abstracts = abstracts.str.lower()
    abstracts = abstracts.str.replace(r'[^a-zA-Z]+',' ',regex=True)
    abstracts = abstracts.str.strip()
    abstracts = abstracts.str.split(' ')

    print('Lemmatizing')
    if lemmatizer is None:
        lemmatizer = _load_wordnet_lemmatizer()
    if stop_words is None:
        stop_words = _load_english_stopwords()
    # set membership instead of scanning a list for every single token
    stop_words = frozenset(stop_words)

    def _clean(tokens):
        lemmas = (lemmatizer.lemmatize(tok) for tok in tokens if tok not in stop_words)
        # the length filter is applied to the lemma, not to the raw token
        return ' '.join(lem for lem in lemmas if len(lem) > 1)

    return abstracts.apply(_clean)

In [4]:
# NOTE(security): unpickling can execute arbitrary code -- only load pickles
# that were produced by this project.
papers = (
    pd.read_pickle('data/EPMC/'+file_name+'.pkl')
    .drop_duplicates(subset='pmid')
    .set_index('pmid')
)
print(len(papers),'rows')

# keep only the abstract text of rows that actually have one
abstracts = papers.loc[papers['abstractText'].notnull(),'abstractText']

# drop the reference to the full frame before lemmatizing (presumably to free memory)
del papers

abstracts = lemmatize_abstracts(abstracts)

# pmids of the abstracts that survived the length filter
pmids = abstracts.index
abstracts.head()

409266 rows
Removing missing abstracts
Initial preprocessing: case, punctuation, whitespace
Lemmatizing


pmid
30209082    ass prevalence cause vision impairment north a...
29781739    developed country genetically inherited eye di...
30092731    systemic autoimmune disease associated ocular ...
30096011    ass prevalence ocular manifestation related di...
30270476    optical coherence tomography oct provides non ...
Name: abstractText, dtype: object

In [5]:
# Persist the lemmatized abstracts (Series indexed by pmid) in data/EPMC/,
# under the input file's base name plus an _ABSTRACTS_LEMMA suffix.
abstracts.to_pickle('data/EPMC/'+file_name+'_ABSTRACTS_LEMMA.pkl')


In [6]:
print('fitting countvec')
# unigrams + bigrams; terms are dropped when they occur in more than 20% of
# the abstracts (max_df) or in fewer than 0.05% of them (min_df)
countvec = CountVectorizer(
    strip_accents='unicode', lowercase=True, stop_words='english',
    ngram_range=(1, 2), max_df=0.2, min_df=0.0005, max_features=None,
)

# 'fit' the vectorizer to the corpus
# this step automatically determines the vocabulary
countvec.fit(abstracts)

print('size vocab:',len(countvec.vocabulary_ ))
print('no. stop words:',len(countvec.stop_words_))

# stop_words_ exists for introspection only and is not needed by transform();
# with millions of entries it would dominate the size of the saved vectorizer,
# so release it before pickling.
countvec.stop_words_ = None

print('saving vectoriser')
joblib.dump(countvec, 'data/CountVec.joblib')

# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); switch when the environment is upgraded.
features = countvec.get_feature_names()

# then 'transform' the corpus
# this computes the term frequency vectors
print('transforming abstracts')
countvec_vectors = countvec.transform(abstracts)

# the fitted vectorizer is on disk; only `features` and `countvec_vectors`
# are kept in memory for the cells below
del countvec

#print('saving transformed abstract vectors')
#countvec_vectors = pd.DataFrame(countvec_vectors, index=pmids,columns=features)
#countvec_vectors.to_pickle('data/EPMC/'+file_name+'_COUNT_VECTORS.pkl')

print('done')

fitting countvec
size vocab: 27509
no. stop words: 10227181
saving vectoriser
transforming abstracts
done


In [None]:
# Document frequency of every term = number of abstracts containing it.
# The test is `> 0`, not `> 1`: a term used once in an abstract still counts.
df = np.asarray((countvec_vectors > 0).sum(axis=0)).ravel()
df = pd.Series(df,index=features)
df.describe()

In [None]:
# the 20 largest values of df, in descending order
df.sort_values(ascending=False).iloc[:20]

In [None]:
# number of terms whose df value exceeds 1
int((df > 1).sum())

In [None]:
# df expressed as a fraction of the number of abstracts
df_frac = df.div(len(abstracts))

In [None]:
# `countvec` was deleted at the end of the vectorizing cell, and a
# CountVectorizer has no `idf_` attribute anyway, so both values are rebuilt
# from the objects that are still in memory.

# term -> column index, i.e. the same mapping as the vectorizer's vocabulary_
vocabulary = {term: col for col, term in enumerate(features)}

# smoothed idf as defined by scikit-learn: ln((1 + n) / (1 + df)) + 1
n_docs = len(abstracts)
doc_freq = np.asarray((countvec_vectors > 0).sum(axis=0)).ravel()
idf = np.log((1 + n_docs) / (1 + doc_freq)) + 1
pd.Series(idf).describe()

In [None]:
# Solve idf = ln((1 + n) / (1 + df)) + 1 for df, with n = number of abstracts
df = pd.Series(
    (1 + len(abstracts)) / np.exp(idf - 1) - 1,
    index=features,
)
df.describe()

In [None]:
# the 40 largest df values, shown as a fraction of the number of abstracts
display(
    df.sort_values(ascending=False)
      .iloc[:40]
      .div(len(abstracts))
)

In [None]:
# df as a fraction of the number of abstracts, followed by its summary statistics
df_frac = df.div(len(abstracts))
df_frac.describe()

In [None]:
# Decade-wide bins; the edges sit on x.5 values, presumably so that
# (near-)integer counts never fall exactly on a bin boundary.
df_bins = pd.cut(df,[0,1.5,10.5,100.5,1000.5,10000.5,100000.5,1000000.5])

# value_counts() orders by count, which would shuffle the bins on the x axis;
# sort_index() puts the bars back in ascending bin order.
ax = df_bins.value_counts().sort_index().plot.bar()
ax.set(
    xlabel='document frequency (number of abstracts)',
    ylabel='number of terms',
    title='Distribution of term document frequency',
);

In [None]:
# terms shorter than three characters, largest df first
df.loc[df.index.str.len() < 3].sort_values(ascending=False)

In [None]:
# number of terms whose df fraction lies strictly between 0.5% and 20%
int(((df_frac > 0.005) & (df_frac < 0.2)).sum())