# Introduction
This notebook generates frequency counts for each term, based on the training data.
Each term gets a count of the documents in which it is present and absent, for each class.
We need to set the corpus we are working from and, at the end, write a dataframe to file for use in another notebook. The next step will be to look at the enrichment of each term by class.


### Imports
Import libraries and write settings here.

In [1]:
# Data manipulation
import pandas as pd
import numpy as np
import re
import string
import nltk
import scipy.stats as stats
from Bio import Entrez
import time
from IPython.display import clear_output
import pickle
import snowballstemmer
from collections import Counter
from sklearn import metrics
import math


import pickle
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 
from IPython.display import clear_output

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.metrics import classification_report as report
from sklearn.model_selection import train_test_split

# Display limits for pandas output in this notebook
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 30)


In [34]:
# Load the chosen corpus into a dataframe.
# The context manager closes the file handle as soon as the pickle is read.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open('../master_df_2020_07_16.p', 'rb') as corpus_file:
    df = pickle.load(corpus_file)

# Keep only the records that have a value in 'reclass'
# (alternative left by the author: fill missing 'reclass' values with 0 instead of dropping)
df.dropna(subset=['reclass'], axis=0, inplace=True)
len(df)

4217

In [18]:

# Split the corpus into a training set and a held-out test set.
# TODO: it would be good to build K-fold cross validation into this step.
from sklearn.model_selection import train_test_split

# An integer test_size holds out that many records; random_state fixes the shuffle.
train_df, test_df = train_test_split(df, test_size=500, random_state=23)

# Persist both splits. The context managers close each file, so the pickles
# are completely written before anything else tries to read them.
with open('train_df.p', 'wb') as train_file:
    pickle.dump(train_df, train_file)
with open('test_df.p', 'wb') as test_file:
    pickle.dump(test_df, test_file)

print(f'Length of the training set = {len(train_df)}')
print(f'Length of the test set = {len(test_df)}')

Length of the training set = 2484
Length of the test set = 500


In [35]:
# Remove the held-out test rows (matched by index label) from df in place.
df.drop(index=test_df.index, inplace=True)
len(df)

3717

In [42]:
# Use df as the training set. This binds train_df to the same DataFrame
# object as df (no copy is made), so changes to one are visible through the other.
train_df = df

In [20]:
# # lets import the chosen corpus into a dataframe 
# df = pickle.load(open(f'../class_df.p', 'rb'))
# df['reclass'].fillna(0, inplace = True, axis = 0)
# print(f'There are {len(df)} records in the manually classified corpus')

In [21]:
# # i want to split the corpus into a train and test set 
# # ********* NB would be good to build in K fold cross validation to this step *****************

# from sklearn.model_selection import train_test_split
# train_df, test_df = train_test_split(df, test_size=0.2, random_state=23)

# pickle.dump(train_df, open(f'train_df.p', 'wb'))
# pickle.dump(test_df, open(f'test_df.p', 'wb'))

# print(f'Length of the training set = {len(train_df)}')
# print(f'Length of the test set = {len(test_df)}')

In [43]:
# Name of the text column we are looking at.
# Could do this for 'mh_simple_split' or 'clean_TIAB' instead.
col = 'working_text'

In [44]:
# train_df = pickle.load(open('./train_df.p', 'rb'))

In [45]:
 # import the nltk stopwords
from nltk.corpus import stopwords

# Extra tokens to ignore on top of the NLTK English stop words
new_stops = [
    'http', 'doi', 'org', 'medrxiv', 'manuscript', 'preprint', 'license',
    'creativecommons', 'et', 'al', 'https', 'ti', 'kw', 'ab', 'nc', 'nd',
    'cc', 'yes', 'no', 'www',
]

# Combined stop list: NLTK English words first, then the extras above
stop_words = [*stopwords.words('english'), *new_stops]

# WordNet lemmatizer instance shared by the tokenizer
lemmatizer = WordNetLemmatizer()


# define our preprocessing to use in the Count vectoriser
def preprocessing(text):
    """Clean a raw document string before tokenisation.

    Removes ``<...>`` tags that sit on a single line, replaces every non-word
    character and every digit with a space, trims leading/trailing
    whitespace and lower-cases the result.
    """
    without_markup = re.sub(r"(<.*?>)", "", text)
    # one space per replaced character, so runs of punctuation/digits leave runs of spaces
    letters_only = re.sub(r"(\W|\d)", " ", without_markup)
    return letters_only.strip().lower()


def tokenizer(text):
    """Split cleaned text into lemmatised tokens.

    Splits on runs of whitespace, skips tokens of one character or fewer and
    tokens found in ``stop_words``, and lemmatises the rest with
    ``lemmatizer``. Returns the lemmas as a list, in order of appearance.
    """
    lemmas = []
    for word in re.split(r"\s+", text):
        if len(word) <= 1 or word in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas


In [46]:
# Configure the CountVectoriser that builds the term-presence vectors
from sklearn.feature_extraction.text import CountVectorizer

vectoriser_options = {
    'preprocessor': preprocessing,
    'tokenizer': tokenizer,
    'lowercase': True,
    'binary': True,          # record presence/absence rather than raw counts
    'ngram_range': (1, 2),   # single terms and two-term phrases
    'min_df': 1,
    'max_df': 0.8,
    'max_features': 50000,
}
cv = CountVectorizer(**vectoriser_options)

In [47]:
# Fit the vectoriser on the text column selected by `col` (rather than a
# hard-coded column name, so changing `col` actually changes the input) and
# expand the result to a dense array with one row per document.
count_vec = cv.fit_transform(train_df[col]).toarray()

In [48]:
# Store each document's vector (one row of count_vec) in a 'cv' column
train_df['cv'] = list(count_vec)

In [49]:
# We want to count the number of included documents with each term present,
# and likewise the number of excluded documents with the term present.
# The simplest route is to split the training dataframe by class:
# rows with reclass == 2 form the included set, every other row the excluded set.
is_included = train_df['reclass'] == 2
inc_df = train_df[is_included]
ex_df = train_df.drop(index=inc_df.index)


In [50]:
# For each class, count the documents in which every vocabulary term is
# present, then derive the number of documents in which it is absent.
# Each document's 'cv' entry is a vector with one position per vocabulary
# term; summing the vectors position-wise gives the per-term document count.


def _presence_counts(vectors, n_terms):
    """Return the position-wise sum of `vectors` as a list of ints.

    Adds one whole document vector per step with numpy instead of summing
    term by term in Python. Returns `n_terms` zeros when `vectors` is empty
    (a class with no documents), so the result always has one entry per term.
    """
    totals = np.zeros(n_terms, dtype=np.int64)
    for vec in vectors:
        totals += vec
    return totals.tolist()


# every document vector has the same length: one slot per vocabulary term
vocab_size = len(train_df['cv'].iloc[0])

inc_term_counts = _presence_counts(inc_df['cv'], vocab_size)
inc_absent_counts = [len(inc_df) - count for count in inc_term_counts]
ex_term_counts = _presence_counts(ex_df['cv'], vocab_size)
ex_absent_counts = [len(ex_df) - count for count in ex_term_counts]


In [51]:
# Map the term counts onto a dataframe to visualise the count for each class,
# indexed by the terms in the fitted vectoriser's vocabulary.

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); prefer the new method and fall back on old versions.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

token_df = pd.DataFrame(data={'pres_inc': inc_term_counts,
                              'abs_inc': inc_absent_counts,
                              'pres_ex': ex_term_counts,
                              'abs_ex': ex_absent_counts},
                        index=feature_names)
token_df.head()

Unnamed: 0,pres_inc,abs_inc,pres_ex,abs_ex
aa,13,306,183,3215
aa wilder,2,317,11,3387
aaa,0,319,13,3385
aac,1,318,14,3384
aak,0,319,21,3377


In [52]:
# Show the 20 terms with the highest 'pres_inc' count
token_df.sort_values('pres_inc', ascending=False).iloc[:20]

Unnamed: 0,pres_inc,abs_inc,pres_ex,abs_ex
clinical,284,35,2633,765
respiratory,281,38,2519,879
severe,280,39,2465,933
symptom,275,44,2129,1269
child,274,45,753,2645
reported,266,53,2633,765
sars,262,57,2467,931
cov,259,60,2319,1079
study,257,62,2611,787
sars cov,251,68,2180,1218


In [53]:
# Write the term-count dataframe to file. The context manager closes the
# file so the pickle is fully flushed to disk.
with open('./improved_token_count_df.p', 'wb') as out_file:
    pickle.dump(token_df, out_file)