In [1]:
import numpy as np
import pandas as pd
import json

In [2]:
def _load_json(path):
    '''
    path: path of a JSON file
    -------
    return: the parsed JSON content
    '''
    # Be explicit about the encoding instead of relying on the platform
    # default, so the notebook reads the files the same way on every OS.
    with open(path, 'r', encoding="utf-8") as f:
        return json.load(f)


# Load the three Cranfield files (docs, qrels, queries); keep both the raw
# parsed JSON and a DataFrame view of each.
docs = _load_json("cranfield/cran_docs.json")
docs_df = pd.DataFrame(docs)

rels = _load_json("cranfield/cran_qrels.json")
rels_df = pd.DataFrame(rels)

queries = _load_json("cranfield/cran_queries.json")
queries_df = pd.DataFrame(queries)

In [6]:
# Preview the first rows of the documents frame to check which columns were loaded.
docs_df.head()

Unnamed: 0,id,author,bibliography,body,title
0,1,"brenckman,m.","j. ae. scs. 25, 1958, 324.",experimental investigation of the aerodynamics...,experimental investigation of the aerodynamics...
1,2,ting-yili,"department of aeronautical engineering, rensse...",simple shear flow past a flat plate in an inco...,simple shear flow past a flat plate in an inco...
2,3,m. b. glauert,"department of mathematics, university of manch...",the boundary layer in simple shear flow past a...,the boundary layer in simple shear flow past a...
3,4,"yen,k.t.","j. ae. scs. 22, 1955, 728.",approximate solutions of the incompressible la...,approximate solutions of the incompressible la...
4,5,"wasserman,b.","j. ae. scs. 24, 1957, 924.",one-dimensional transient heat conduction into...,one-dimensional transient heat conduction into...


In [10]:
# preprocess corpus
from nltk.tokenize import sent_tokenize, TreebankWordTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [34]:
def sent_tokenizer(doc_body):
    '''
    Split a document into sentences using NLTK's sent_tokenize.

    doc_body: a string
    -------
    return: a list of sentences
    '''
    sentences = sent_tokenize(doc_body)
    return sentences

def word_tokenizer(sent_list):
    '''
    Tokenize every sentence with NLTK's TreebankWordTokenizer.

    sent_list: list of sentences
    -------
    return: a list of list of tokens (one inner list per sentence)
    '''
    treebank = TreebankWordTokenizer()
    return [treebank.tokenize(sentence) for sentence in sent_list]


def stopword_removal(token_list):
    '''
    Drop English stopwords from every tokenized sentence.

    token_list: list of list of tokens
    -------
    return: list of list of tokens without stopwords
    '''
    english_stopwords = set(stopwords.words("english"))

    def is_content_word(token):
        # Compare case-insensitively: the token is lowercased before lookup.
        return token.lower() not in english_stopwords

    return [list(filter(is_content_word, sentence)) for sentence in token_list]

def inflection_reduction(token_list):
    '''
    Stem every token with NLTK's PorterStemmer.

    token_list: list of list of tokens
    -------
    return: list of list of stemmed tokens
    '''
    porter = PorterStemmer()
    return [list(map(porter.stem, sentence)) for sentence in token_list]

def rejoin(sent_list):
    '''
    Flatten tokenized sentences back into one space-separated string.

    sent_list: list of list of tokens
    -------
    return: a str
    '''
    # Sentences must be separated by a space too; plain concatenation would
    # fuse the last token of one sentence with the first token of the next.
    # Empty sentences are skipped so they do not produce double spaces.
    return " ".join(" ".join(sent) for sent in sent_list if sent)

In [36]:
# doc = docs_df['body'][0]
# sent = sent_tokenizer(doc)
# tokenized = word_tokenizer(sent)
# stemmed = inflection_reduction(tokenized)
# stopword_removed = stopword_removal(stemmed)
# preprocessed_doc = rejoin(stopword_removed)


In [37]:
# Build the preprocessed corpus: one cleaned string per document body.
corpus = []
for doc in docs_df['body']:
    sent = sent_tokenizer(doc)
    tokenized = word_tokenizer(sent)
    # Remove stopwords BEFORE stemming: the Porter stemmer rewrites some
    # stopwords (e.g. "this" -> "thi", "was" -> "wa"), after which they no
    # longer match the stopword list and would leak into the corpus.
    stopword_removed = stopword_removal(tokenized)
    stemmed = inflection_reduction(stopword_removed)
    preprocessed_doc = rejoin(stemmed)
    corpus.append(preprocessed_doc)

In [38]:
from sentence_transformer import sentence_encoder

ModuleNotFoundError: No module named 'sentence_transformer'