# <font color='red'>Cleaning text parts</font>

In [1]:
from IPython.display import display
import warnings
# Suppress all Python warnings from this point on.
# NOTE(review): this presumably also hides pandas warnings such as
# SettingWithCopyWarning — confirm that is intended.
warnings.filterwarnings('ignore')

Firstly we need to import the corpus we've already created:

In [2]:
import pickle
# Load the previously built corpus from disk.
# NOTE(review): pickle.load can execute arbitrary code; only open pickle
# files that come from a trusted source.
with open('whole_corpus.pickle', 'rb') as corpus_file:
    df_corpus = pickle.load(corpus_file)

Let's take a look into the corpus and check if there are duplicated rows:

In [3]:
# Summary statistics (count / unique / top / freq) for every column.
df_corpus.describe()

Unnamed: 0,tag,abstract
count,26797,26797
unique,10,24564
top,DUE,Not Available
freq,3860,1436


In [4]:
# Keep only the rows whose abstract is not the 'Not Available' placeholder.
has_abstract = df_corpus['abstract'].ne('Not Available')
df_corpus = df_corpus.loc[has_abstract]

In [5]:
from collections import Counter
# Keep the first occurrence of every abstract and drop the later repeats.
# drop_duplicates does this in a single pass, instead of re-scanning the whole
# column once per repeated text, and it returns a new frame rather than
# mutating the filtered one in place.
df_corpus = df_corpus.drop_duplicates(subset='abstract', keep='first')

# Get a summary of data again
display(df_corpus.describe())

Unnamed: 0,tag,abstract
count,24563,24563
unique,10,24563
top,DUE,Using SEEP II water column data and sedimentar...
freq,3816,1


A brief preprocessing step to make the text ready for the calculation. Here uninformative characters, e.g. punctuation, digits and other non-alphabetical characters, are removed.

In [6]:
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_sm

def clean_text(col):
    """Normalise a pandas Series of raw text before tokenisation.

    Steps, applied element-wise:
      1. lower-case the text;
      2. turn newlines, tabs and underscores into a single space each;
      3. turn every remaining character that is neither a word character
         nor whitespace (i.e. punctuation/symbols) into a space;
      4. delete the ASCII digits 0-9.

    Runs of spaces are NOT collapsed; callers tokenise on whitespace later.

    Parameters
    ----------
    col : pandas.Series
        Series of strings. Missing values are propagated unchanged.

    Returns
    -------
    pandas.Series
        Cleaned series with the same index as ``col``.
    """
    col = col.str.lower()
    # regex=True is passed explicitly: the default of Series.str.replace
    # changed to regex=False in pandas 2.0, which would turn these patterns
    # into literal (never matching) strings.
    col = col.str.replace(r'[\n\t_]', ' ', regex=True)
    col = col.str.replace(r'[^\w\s]', ' ', regex=True)
    # \w still contains digits, so strip them in a dedicated step.
    col = col.str.replace(r'[0-9]', '', regex=True)

    return col

Here three topics are selected, *'CHE'* stands for Chemistry, *'DMS'* for Mathematics and *'IBN'* for Biology:

In [7]:
# Restrict the corpus to the three selected topics.
tags = ['CHE','DMS','IBN']
# .copy() yields an independent frame, so the column assignments in the
# following cells do not write into a slice of the unfiltered corpus.
df_corpus = df_corpus.loc[df_corpus['tag'].isin(tags)].copy()

In [8]:
# Overwrite the raw abstracts with their normalised form.
df_corpus['abstract'] = df_corpus['abstract'].pipe(clean_text)

Next, the abstracts are tokenized and lemmatized:

In [9]:
import nltk
from nltk.corpus import stopwords

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    """Split ``text`` on whitespace and return the list of lemmatized tokens."""
    lemmas = []
    for token in w_tokenizer.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# One list of lemmas per abstract.
df_corpus['abstract_lemmatized'] = df_corpus['abstract'].apply(lemmatize_text)

Finally removing stopwords:

In [10]:
# Build the stop-word set once. Calling stopwords.words('english') inside the
# lambda rebuilt the list for every single token, and membership tests on a
# list are linear; a set gives the same result with constant-time lookups.
english_stopwords = set(stopwords.words('english'))
df_corpus['cleaned_abstract'] = df_corpus['abstract_lemmatized'].apply(
    lambda tokens: [tok for tok in tokens if tok not in english_stopwords]
)

Here we take a look at the dataset, in order to check later whether there are common frequent words across all categories:

In [11]:
from collections import Counter
def _tokens_and_top_words(tag, top_n=50):
    """Return ``(tokens, frequent)`` for the abstracts labelled ``tag``.

    ``tokens`` is the flat list of all tokens in the ``cleaned_abstract``
    column of the matching rows; ``frequent`` holds the ``top_n`` most common
    tokens, most frequent first.
    """
    rows = df_corpus.loc[df_corpus['tag'] == tag, 'cleaned_abstract'].tolist()
    tokens = [item for sublist in rows for item in sublist]
    frequent = [word for word, _count in Counter(tokens).most_common(top_n)]
    return tokens, frequent

# Same computation for each selected category, via one helper instead of
# three copy-pasted stanzas.
tokens_CHE, frequent_CHE = _tokens_and_top_words('CHE')
tokens_DMS, frequent_DMS = _tokens_and_top_words('DMS')
tokens_IBN, frequent_IBN = _tokens_and_top_words('IBN')

Finding and removing all words of length less than 3:

In [12]:
# Distinct tokens shorter than three characters.
{tok for row in df_corpus['cleaned_abstract'] for tok in row if len(tok) < 3}

{'aa',
 'ab',
 'ac',
 'ad',
 'af',
 'ag',
 'ah',
 'ai',
 'ak',
 'al',
 'ap',
 'ar',
 'au',
 'ax',
 'az',
 'b',
 'ba',
 'bc',
 'bf',
 'bi',
 'bl',
 'bm',
 'bn',
 'bo',
 'bp',
 'br',
 'bv',
 'bx',
 'bz',
 'c',
 'ca',
 'cc',
 'cd',
 'ce',
 'cf',
 'cg',
 'ch',
 'ci',
 'cl',
 'cm',
 'cn',
 'co',
 'cp',
 'cr',
 'ct',
 'cu',
 'cv',
 'cw',
 'cx',
 'cy',
 'da',
 'db',
 'dc',
 'dd',
 'de',
 'df',
 'dh',
 'di',
 'dl',
 'dm',
 'dp',
 'dr',
 'dt',
 'du',
 'dy',
 'dz',
 'e',
 'ea',
 'ec',
 'ed',
 'ee',
 'eg',
 'eh',
 'ei',
 'el',
 'em',
 'en',
 'eq',
 'er',
 'es',
 'et',
 'eu',
 'ev',
 'ew',
 'ex',
 'ez',
 'eõ',
 'f',
 'fa',
 'fc',
 'fe',
 'fh',
 'fj',
 'fk',
 'fl',
 'fm',
 'fn',
 'fo',
 'ft',
 'fu',
 'fv',
 'fx',
 'fy',
 'g',
 'ga',
 'gb',
 'gc',
 'gd',
 'ge',
 'gf',
 'gh',
 'gi',
 'gl',
 'gm',
 'gn',
 'go',
 'gp',
 'gr',
 'gt',
 'gu',
 'gx',
 'h',
 'ha',
 'hb',
 'hd',
 'hf',
 'hg',
 'hh',
 'hi',
 'ho',
 'hp',
 'hr',
 'ht',
 'hu',
 'hy',
 'hz',
 'ia',
 'ic',
 'id',
 'ii',
 'il',
 'im',
 'io',
 'ip'

In [13]:
# Keep only tokens that are at least three characters long.
df_corpus['cleaned_abstract'] = df_corpus['cleaned_abstract'].apply(
    lambda tokens: [tok for tok in tokens if len(tok) >= 3]
)

Joining the cleaned tokens back into a single text string per abstract (this overwrites the `cleaned_abstract` column):

In [15]:
# Collapse each token list into one space-separated string.
df_corpus['cleaned_abstract'] = df_corpus['cleaned_abstract'].apply(
    lambda tokens: ' '.join(tokens)
)

In [16]:
# Preview the first rows of the frame.
df_corpus.head()

Unnamed: 0,tag,abstract,abstract_lemmatized,cleaned_abstract
a9000038.txt,DMS,this research is part of an on going program b...,"[this, research, is, part, of, an, on, going, ...",research part going program principal investig...
a9000053.txt,DMS,the mathematical theories of multivariate poly...,"[the, mathematical, theory, of, multivariate, ...",mathematical theory multivariate polynomial in...
a9000054.txt,DMS,work to be done during the period of this awar...,"[work, to, be, done, during, the, period, of, ...",work done period award focus higher dimensiona...
a9000075.txt,IBN,in collaboration with costa rican graduate stu...,"[in, collaboration, with, costa, rican, gradua...",collaboration costa rican graduate student sci...
a9000094.txt,IBN,the continued destruction of the coastal and t...,"[the, continued, destruction, of, the, coastal...",continued destruction coastal tropical forest ...


Finding common frequent words between all categories:

In [17]:
# Words that appear in the frequent-word lists of all three categories,
# kept in the order in which they occur in frequent_CHE.
common_words = [
    word for word in frequent_CHE
    if word in frequent_DMS and word in frequent_IBN
]
for word in common_words:
    print(word)

research
study
new
project
system
important
work


As those common words may affect the model negatively, we are going to remove them from the abstracts:

In [18]:
# Remove the shared words token by token. A plain str.replace() works on
# substrings, so it also mangled longer words ('network' -> 'net',
# 'renewal' -> 'real') and left double spaces behind where a word was cut out.
words_to_drop = set(common_words)
df_corpus['cleaned_abstract'] = df_corpus['cleaned_abstract'].apply(
    lambda text: ' '.join(tok for tok in text.split() if tok not in words_to_drop)
)

In [19]:
from IPython.display import display 
# Drop the intermediate token-list column. Returning a new frame (instead of
# inplace=True) together with errors='ignore' keeps this cell safe to run more
# than once: a second run no longer raises KeyError for the missing column.
df_corpus = df_corpus.drop(columns='abstract_lemmatized', errors='ignore')
display(df_corpus.head())

Unnamed: 0,tag,abstract,cleaned_abstract
a9000038.txt,DMS,this research is part of an on going program b...,part going program principal investigator ass...
a9000053.txt,DMS,the mathematical theories of multivariate poly...,mathematical theory multivariate polynomial in...
a9000054.txt,DMS,work to be done during the period of this awar...,done period award focus higher dimensional in...
a9000075.txt,IBN,in collaboration with costa rican graduate stu...,collaboration costa rican graduate student sci...
a9000094.txt,IBN,the continued destruction of the coastal and t...,continued destruction coastal tropical forest ...


In [20]:
# Summary statistics after cleaning; 'unique' below 'count' indicates repeated values.
df_corpus.describe()

Unnamed: 0,tag,abstract,cleaned_abstract
count,7896,7896,7896
unique,3,7895,7888
top,DMS,this award will fund a mathematical sciences r...,award support development computer software e...
freq,3672,2,2


As shown above, there are still some duplicated rows in the dataset, which must be removed:

In [21]:
# Keep the first row for every cleaned abstract and drop the later repeats.
# drop_duplicates does this in one pass and returns a new frame, instead of
# scanning the column once per repeated text and dropping rows in place.
df_corpus = df_corpus.drop_duplicates(subset='cleaned_abstract', keep='first')

In [22]:
# Summary statistics again, to check the result of the duplicate removal.
df_corpus.describe()

Unnamed: 0,tag,abstract,cleaned_abstract
count,7888,7888,7888
unique,3,7888,7888
top,DMS,in this project in the physical chemistry prog...,grand challenge application group competition ...
freq,3666,1,1


To avoid having to redo the cleaning step, the data is saved as a pickle file:

In [23]:
import pickle

# Persist the cleaned corpus so the cleaning does not have to be redone.
with open('corpus_lemmatized.pickle', 'wb') as out_file:
    pickle.dump(df_corpus, out_file, protocol=pickle.HIGHEST_PROTOCOL)