# Document normalization

In [1]:
import sys
import os

sys.path.append(os.path.abspath('..//../techminer/'))
from docs_normalizer import DocNormalizer
import pandas as pd
from techminer import RecordsDataFrame

In [2]:
# Read the step-07 records (one JSON object per line) and wrap the resulting
# frame in a RecordsDataFrame.
rdf = RecordsDataFrame(
    pd.read_json('step-07.json', orient='records', lines=True)
)

## Title

In [3]:
# Fresh normalizer with default arguments, used for the titles.
doc_normalizer = DocNormalizer()

In [4]:
# Fit on the raw titles; the recorded output shows fit() loading the stop-word
# list. The call's return value is displayed as the cell output.
doc_normalizer.fit(rdf['Title'])

Loaded 326 stopwords


DocNormalizer(words_keep=None)

#### Stop words loaded to clean text

In [5]:
# Show the first ten entries of the fitted normalizer's stop-word list.
doc_normalizer.stopwords_[:10]

['itself',
 "'re",
 'does',
 'full',
 'eight',
 'although',
 'as',
 'whether',
 'could',
 'hereby']

#### Document cleansing - Title

In [6]:
# Apply the fitted normalizer to every title.
docs_normalized = doc_normalizer.transform(rdf['Title'])

Normalizing documents


In [7]:
# Show the raw first title for comparison with its normalized form.
# (Heading spelling corrected: "Fisrt" -> "First".)
print('First document before the process of normalization\n')
print(rdf.loc[0,'Title'], '\n')

Fisrt document before the process of normalization

Improving DWT-RNN model via B-spline wavelet multiresolution to forecast a high-frequency time series 



In [8]:
# Show the same title after normalization.
# (Heading spelling corrected: "Fisrt" -> "First".)
print('First document after the process of normalization\n')
print(docs_normalized[0], '\n')

Fisrt document after the process of normalization

improve dwtrnn model via bspline wavelet multiresolution to forecast a highfrequency time series 



#### For the whole corpus of titles

In [9]:
# Raw titles as a plain list, then every whitespace-separated token across all
# titles. Duplicates are kept, so `vocabulary` is the token stream; the
# distinct vocabulary is `set(vocabulary)`.
corpus = rdf['Title'].tolist()
vocabulary = [token for title in corpus for token in title.split()]

In [10]:
# Total tokens vs. distinct tokens in the raw titles.
print(
    f'Number of words in the corpus: {len(vocabulary)}',
    f'Number of words in the vocabulary of the corpus: {len(set(vocabulary))}',
    sep='\n',
)

Number of words in the corpus: 1654
Number of words in the vocabulary of the corpus: 579


In [11]:
# Token stream of the normalized titles (duplicates kept).
vocabulary_cleaned = [
    token
    for normalized_title in docs_normalized
    for token in normalized_title.split()
]

In [12]:
# Total tokens vs. distinct tokens in the normalized titles.
print(
    f'Number of words in the corpus: {len(vocabulary_cleaned)}',
    f'Number of words in the vocabulary of the corpus: {len(set(vocabulary_cleaned))}',
    sep='\n',
)

Number of words in the corpus: 1652
Number of words in the vocabulary of the corpus: 443


In [13]:
# Relative shrinkage after normalization: total tokens first, then distinct
# tokens. The `%` format type multiplies by 100 and rounds while formatting,
# so the printed figure cannot pick up float noise the way
# `round(x, n) * 100` can (e.g. 0.29 * 100 == 28.999999999999996).
# The `%` sign now comes from the format spec, so it is no longer typed
# literally in the message.
print(f'Reduction of {1 - len(vocabulary_cleaned) / len(vocabulary):.2%} from the total number of words in the corpus')
print(f'Reduction of {1 - len(set(vocabulary_cleaned)) / len(set(vocabulary)):.1%} from the uncleaned vocabulary')

Reduction of 0.12% from the total number of words in the corpus
Reduction of 23.5% from the uncleaned vocabulary


## Abstract

In [14]:
# Fresh normalizer with default arguments, used for the abstracts.
# This rebinds the name previously used for the title normalizer.
doc_normalizer = DocNormalizer()

In [15]:
# Fit on the raw abstracts; the call's return value is displayed as the cell
# output.
doc_normalizer.fit(rdf['Abstract'])

Loaded 326 stopwords


DocNormalizer(words_keep=None)

#### Document cleansing - Abstract

In [16]:
# Apply the fitted normalizer to every abstract.
# This rebinds `docs_normalized`, which previously held the normalized titles.
docs_normalized = doc_normalizer.transform(rdf['Abstract'])

Normalizing documents


In [17]:
# Show the raw first abstract for comparison with its normalized form.
# The column is selected by label rather than by position, so this still
# prints the abstract if the column order of the input file changes.
# (Heading spelling corrected: "Fisrt" -> "First".)
print('First document before the process of normalization\n')
print(rdf.loc[0, 'Abstract'], '\n')

Fisrt document before the process of normalization

The importance of an interference-less machine learning scheme in time series prediction is crucial, as an oversight can have a negative cumulative effect, especially when predicting many steps ahead of the currently available data. The on-going research on noise elimination in time series forecasting has led to a successful approach of decomposing the data sequence into component trends to identify noise-inducing information. The empirical mode decomposition method separates the time series/signal into a set of intrinsic mode functions ranging from high to low frequencies, which can be summed up to reconstruct the original data. The usual assumption that random noises are only contained in the high-frequency component has been shown not to be the case, as observed in our previous findings. The results from that experiment reveal that noise can be present in a low frequency component, and this motivates the newly-proposed algorithm. A

In [18]:
# Show the same abstract after normalization.
# (Heading spelling corrected: "Fisrt" -> "First".)
print('First document after the process of normalization\n')
print(docs_normalized[0], '\n')

Fisrt document after the process of normalization

the importance of an interferenceless machine learn scheme in time series prediction be crucial as an oversight can have a negative cumulative effect especially when predict many step ahead of the currently available datum the ongoing research on noise elimination in time series forecasting have lead to a successful approach of decompose the data sequence into component trend to identify noiseinduce information the empirical mode decomposition method separate the time series signal into a set of intrinsic mode function range from high to low frequency which can be sum up to reconstruct the original datum the usual assumption that random noise be only contain in the highfrequency component have be show not to be the case as observe in previous finding the result from that experiment reveal that noise can be present in a low frequency component and this motivate the newlyproposed algorithm additionally to prevent the erosion of periodic 

#### For the whole corpus of documents

In [19]:
# Raw abstracts as a plain list, selected by column label instead of column
# position 0 so the cell does not depend on the input file's column order.
# `vocabulary` is every whitespace-separated token across all abstracts,
# duplicates kept; the distinct vocabulary is `set(vocabulary)`.
corpus = list(rdf.loc[:, 'Abstract'].values)
vocabulary = [word for doc in corpus for word in doc.split()]

In [20]:
# Total tokens vs. distinct tokens in the raw abstracts.
print(
    f'Number of words in the corpus: {len(vocabulary)}',
    f'Number of words in the vocabulary of the corpus: {len(set(vocabulary))}',
    sep='\n',
)

Number of words in the corpus: 25473
Number of words in the vocabulary of the corpus: 4920


In [21]:
# Token stream of the normalized abstracts (duplicates kept).
vocabulary_cleaned = [
    token
    for normalized_abstract in docs_normalized
    for token in normalized_abstract.split()
]

In [22]:
# Total tokens vs. distinct tokens in the normalized abstracts.
print(
    f'Number of words in the corpus: {len(vocabulary_cleaned)}',
    f'Number of words in the vocabulary of the corpus: {len(set(vocabulary_cleaned))}',
    sep='\n',
)

Number of words in the corpus: 24900
Number of words in the vocabulary of the corpus: 2746


In [23]:
# Relative shrinkage after normalization: total tokens first, then distinct
# tokens. The `%` format type multiplies by 100 and rounds while formatting,
# so the printed figure cannot pick up float noise the way
# `round(x, n) * 100` can (e.g. 0.29 * 100 == 28.999999999999996).
# The `%` sign now comes from the format spec, so it is no longer typed
# literally in the message.
print(f'Reduction of {1 - len(vocabulary_cleaned) / len(vocabulary):.2%} from the total number of words in the corpus')
print(f'Reduction of {1 - len(set(vocabulary_cleaned)) / len(set(vocabulary)):.1%} from the uncleaned vocabulary')

Reduction of 2.25% from the total number of words in the corpus
Reduction of 44.2% from the uncleaned vocabulary
