In [27]:
import pickle
import string
from collections import Counter

import contractions
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Single WordNet lemmatiser instance, reused by normalise() for every token.
lemmatizer = WordNetLemmatizer()
# English stop-word list held as a set so the per-token membership test in
# normalise() is a hash lookup rather than a list scan.
stop_words = set(stopwords.words('english'))

In [28]:
# Load the tab-separated candidate file; it has no header row, so the four
# column names are supplied explicitly.
# NOTE(review): default quote handling is in effect here. If passages can
# contain unbalanced double quotes, rows may be merged — confirm the row count.
df = pd.read_csv(
    'candidate-passages-top1000.tsv',
    sep='\t',
    header=None,
    names=['qid', 'pid', 'query', 'passage'],
)

In [29]:
# Keep one row per distinct (pid, passage) pair and renumber the index from 0.
passages = (
    df[['pid', 'passage']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [30]:
# Translation table that maps every ASCII punctuation character to a space.
# Built once here instead of on every call to normalise().
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def normalise(text):
    '''
    Function that normalises text and returns tokens.

    Steps, in order: lower-case, expand contractions, replace punctuation
    with spaces, split on whitespace, drop stop words, lemmatise.

    Input: text --> text string we want to tokenise
    Output: tokens --> list of tokens taken from the text string
    '''

    text = text.lower()  # convert all to lower case
    text = contractions.fix(text)  # expand contractions
    # Punctuation becomes a space (rather than being deleted) so that words
    # joined by punctuation are split into separate tokens below.
    text = text.translate(_PUNCT_TO_SPACE)
    tokens = text.split()  # whitespace tokenisation
    kept_tokens = [w for w in tokens if w not in stop_words]  # remove stop words

    # lemmatize() is called with its default part of speech only.
    return [lemmatizer.lemmatize(w) for w in kept_tokens]

In [31]:
# Inverted index: token -> {pid: raw count of the token in that passage}.
inv_index = {}


def inverted_index_func(row, index=None, tokeniser=None):
    '''
    Add one passage's token counts to an inverted index.

    Input: row --> mapping / Series with a 'pid' and a 'passage' entry
           index --> dict to update in place; defaults to the global inv_index
           tokeniser --> callable turning the passage into a list of tokens;
                         defaults to normalise
    Output: None (the index is modified in place)
    '''
    if index is None:
        index = inv_index
    if tokeniser is None:
        tokeniser = normalise

    pid = row['pid']
    # One pass over the tokens gives every count. Counter also keeps
    # first-occurrence order, so the key order of the index does not depend
    # on set iteration order.
    term_counts = Counter(tokeniser(row['passage']))

    for term, count in term_counts.items():
        # Frequency of the word in this passage (not normalised)
        index.setdefault(term, {})[pid] = count

# apply() is used only for its side effect of filling inv_index; the returned
# Series is discarded.
_ = passages.apply(inverted_index_func, axis=1)

In [32]:
# Persist the finished index to disk so it can be reloaded without rebuilding.
with open('inverted_index.pkl', 'wb') as index_file:
    pickle.dump(inv_index, index_file)

In [34]:
# Show the vocabulary size and a small sample of terms instead of dumping
# every key of the index into the cell output.
len(inv_index), list(inv_index)[:20]

