# Inverted-index representation 
Author : ERRAMI Fatimezzahra<br/>
Link to the original repo : https://github.com/F-Errami/IR-algorithms

In [126]:
# AUTHOR : ERRAMI Fatimezahra 
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

from nltk import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

[nltk_data] Error loading punkt: <urlopen error [WinError 10060] Une
[nltk_data]     tentative de connexion a échoué car le parti connecté
[nltk_data]     n’a pas répondu convenablement au-delà d’une certaine
[nltk_data]     durée ou une connexion établie a échoué car l’hôte de
[nltk_data]     connexion n’a pas répondu>
[nltk_data] Error loading stopwords: <urlopen error [WinError 10060]
[nltk_data]     Une tentative de connexion a échoué car le parti
[nltk_data]     connecté n’a pas répondu convenablement au-delà d’une
[nltk_data]     certaine durée ou une connexion établie a échoué car
[nltk_data]     l’hôte de connexion n’a pas répondu>
[nltk_data] Error loading wordnet: <urlopen error [WinError 10060] Une
[nltk_data]     tentative de connexion a échoué car le parti connecté
[nltk_data]     n’a pas répondu convenablement au-delà d’une certaine
[nltk_data]     durée ou une connexion établie a échoué car l’hôte de
[nltk_data]     connexion n’a pas répondu>
[nltk_data] Downloading pack

In [127]:
# Collection of documents (corpus)
doc1 = "A professional business male, late 40s, 6 feet tall, slim build, well groomed, great personality, home owner, interests include the arts travel and all things good, Ringwood area, is seeking a genuine female of similar age or older, in same area or surrounds, for a meaningful long term rship. Looking forward to hearing from you all."
doc2 = "MALE LATE 50''s AUST Single, tall, prof. Interests: Music, theatre, dining, art, the beach and the environment. Seeking female with similar interests to share concerts, dining etc."
doc3 = "GENUINE AND HONEST Hi Im 44 with a good sense of humour, am romantic and love drives, fishing, camping and music. Love my 2 kids. Am looking for a lady with similar interests, aged between 38-45 for friendship/ possible relationship."

In [128]:
docs = [doc1, doc2, doc3]
docs

['A professional business male, late 40s, 6 feet tall, slim build, well groomed, great personality, home owner, interests include the arts travel and all things good, Ringwood area, is seeking a genuine female of similar age or older, in same area or surrounds, for a meaningful long term rship. Looking forward to hearing from you all.',
 "MALE LATE 50''s AUST Single, tall, prof. Interests: Music, theatre, dining, art, the beach and the environment. Seeking female with similar interests to share concerts, dining etc.",
 'GENUINE AND HONEST Hi Im 44 with a good sense of humour, am romantic and love drives, fishing, camping and music. Love my 2 kids. Am looking for a lady with similar interests, aged between 38-45 for friendship/ possible relationship.']

### Text processing 

In [129]:
def text_preprocessing(corpus) :
    print(corpus)
    # Tokenize the given document
    tokenized = word_tokenize(corpus)
    
    # Lower case all words
    tokenized = [word.lower() for word in tokenized]
    
    # Remove stopwords
    stopwords_en = set(stopwords.words("english"))
    stopwords_en = stopwords_en.union(set(punctuation))
    tokenized_without_sw = [word for word in tokenized if not word in stopwords_en]
    
    # Stemming 
    #porter = PorterStemmer()
    #tokenized_without_sw = [porter.stem(word) for word in tokenized_without_sw]
    
    # POS tagging 
    doc_tagged = pos_tag(tokenized_without_sw)
    
    # Lemmatizer
    wnl = WordNetLemmatizer()
    result = [wnl.lemmatize(word, pos=penn2morphy(tag[:2])) for word, tag in doc_tagged]
    
    return result

In [130]:
def penn2morphy(tag) : 
    morphy_tag = {'NN':'n', 'JJ':'a',
                  'VB':'v', 'RB':'r'}
    try:
        return morphy_tag[tag]
    except:
        return 'n' # if mapping isn't found, fall back to Noun.

In [131]:
def and_postings(posting1, posting2):
    p1 = 0
    p2 = 0
    result = list()
    while p1 < len(posting1) and p2 < len(posting2):
        if posting1[p1] == posting2[p2]:
            result.append(posting1[p1])
            p1 += 1
            p2 += 1
        elif posting1[p1] > posting2[p2]:
            p2 += 1
        else:
            p1 += 1
    return result

In [132]:
def or_postings(posting1, posting2):
    p1 = 0
    p2 = 0
    result = list()
    while p1 < len(posting1) and p2 < len(posting2):
        if posting1[p1] == posting2[p2]:
            result.append(posting1[p1])
            p1 += 1
            p2 += 1
        elif posting1[p1] > posting2[p2]:
            result.append(posting2[p2])
            p2 += 1
        else:
            result.append(posting1[p1])
            p1 += 1
    while p1 < len(posting1):
        result.append(posting1[p1])
        p1 += 1
    while p2 < len(posting2):
        result.append(posting2[p2])
        p2 += 1
    return result

In [133]:
docs_preprocessed = []
for doc in docs : 
    docs_preprocessed.append(text_preprocessing(doc))
    
docs_preprocessed

A professional business male, late 40s, 6 feet tall, slim build, well groomed, great personality, home owner, interests include the arts travel and all things good, Ringwood area, is seeking a genuine female of similar age or older, in same area or surrounds, for a meaningful long term rship. Looking forward to hearing from you all.
MALE LATE 50''s AUST Single, tall, prof. Interests: Music, theatre, dining, art, the beach and the environment. Seeking female with similar interests to share concerts, dining etc.
GENUINE AND HONEST Hi Im 44 with a good sense of humour, am romantic and love drives, fishing, camping and music. Love my 2 kids. Am looking for a lady with similar interests, aged between 38-45 for friendship/ possible relationship.


[['professional',
  'business',
  'male',
  'late',
  '40',
  '6',
  'foot',
  'tall',
  'slim',
  'build',
  'well',
  'groom',
  'great',
  'personality',
  'home',
  'owner',
  'interest',
  'include',
  'art',
  'travel',
  'thing',
  'good',
  'ringwood',
  'area',
  'seek',
  'genuine',
  'female',
  'similar',
  'age',
  'old',
  'area',
  'surround',
  'meaningful',
  'long',
  'term',
  'rship',
  'look',
  'forward',
  'hear'],
 ['male',
  'late',
  '50',
  "''",
  'aust',
  'single',
  'tall',
  'prof',
  'interest',
  'music',
  'theatre',
  'din',
  'art',
  'beach',
  'environment',
  'seek',
  'female',
  'similar',
  'interest',
  'share',
  'concert',
  'din',
  'etc'],
 ['genuine',
  'honest',
  'hi',
  'im',
  '44',
  'good',
  'sense',
  'humour',
  'romantic',
  'love',
  'drive',
  'fish',
  'camp',
  'music',
  'love',
  '2',
  'kid',
  'look',
  'lady',
  'similar',
  'interest',
  'age',
  '38-45',
  'friendship/',
  'possible',
  'relationship']]

In [134]:
# Gather the set of all unique terms
unique_terms = {term for doc in docs_preprocessed for term in doc}
unique_terms

{"''",
 '2',
 '38-45',
 '40',
 '44',
 '50',
 '6',
 'age',
 'area',
 'art',
 'aust',
 'beach',
 'build',
 'business',
 'camp',
 'concert',
 'din',
 'drive',
 'environment',
 'etc',
 'female',
 'fish',
 'foot',
 'forward',
 'friendship/',
 'genuine',
 'good',
 'great',
 'groom',
 'hear',
 'hi',
 'home',
 'honest',
 'humour',
 'im',
 'include',
 'interest',
 'kid',
 'lady',
 'late',
 'long',
 'look',
 'love',
 'male',
 'meaningful',
 'music',
 'old',
 'owner',
 'personality',
 'possible',
 'prof',
 'professional',
 'relationship',
 'ringwood',
 'romantic',
 'rship',
 'seek',
 'sense',
 'share',
 'similar',
 'single',
 'slim',
 'surround',
 'tall',
 'term',
 'theatre',
 'thing',
 'travel',
 'well'}

In [135]:
# Construct an inverted index
# here as a Python dictionary for ease of interpretability
import collections 

inverted_index = {}

for i, doc in enumerate(docs_preprocessed):
    for term in doc:
        if not term in inverted_index :
            inverted_index[term] = {'df': 1, 'postings_list': set()}
            inverted_index[term]['postings_list'].add(i)
        else : 
            inverted_index[term]['postings_list'].add(i)
            inverted_index[term]['df'] += 1

inverted_index

{'professional': {'df': 1, 'postings_list': {0}},
 'business': {'df': 1, 'postings_list': {0}},
 'male': {'df': 2, 'postings_list': {0, 1}},
 'late': {'df': 2, 'postings_list': {0, 1}},
 '40': {'df': 1, 'postings_list': {0}},
 '6': {'df': 1, 'postings_list': {0}},
 'foot': {'df': 1, 'postings_list': {0}},
 'tall': {'df': 2, 'postings_list': {0, 1}},
 'slim': {'df': 1, 'postings_list': {0}},
 'build': {'df': 1, 'postings_list': {0}},
 'well': {'df': 1, 'postings_list': {0}},
 'groom': {'df': 1, 'postings_list': {0}},
 'great': {'df': 1, 'postings_list': {0}},
 'personality': {'df': 1, 'postings_list': {0}},
 'home': {'df': 1, 'postings_list': {0}},
 'owner': {'df': 1, 'postings_list': {0}},
 'interest': {'df': 4, 'postings_list': {0, 1, 2}},
 'include': {'df': 1, 'postings_list': {0}},
 'art': {'df': 2, 'postings_list': {0, 1}},
 'travel': {'df': 1, 'postings_list': {0}},
 'thing': {'df': 1, 'postings_list': {0}},
 'good': {'df': 2, 'postings_list': {0, 2}},
 'ringwood': {'df': 1, 'post

In [136]:
# Now we can get posting lists for any term

In [137]:
posting_list = inverted_index['male']['postings_list']
posting_list

{0, 1}

In [138]:
query = "interest OR male"
query = query.split(" OR ")

pl_1 = list(inverted_index[query[0]]['postings_list'])
pl_2 = list(inverted_index["".join(text_preprocessing(query[1]))]['postings_list'])
results = or_postings(pl_1, pl_2)
for rs in results:
    print(docs[rs]+"\n")

male
A professional business male, late 40s, 6 feet tall, slim build, well groomed, great personality, home owner, interests include the arts travel and all things good, Ringwood area, is seeking a genuine female of similar age or older, in same area or surrounds, for a meaningful long term rship. Looking forward to hearing from you all.

MALE LATE 50''s AUST Single, tall, prof. Interests: Music, theatre, dining, art, the beach and the environment. Seeking female with similar interests to share concerts, dining etc.

GENUINE AND HONEST Hi Im 44 with a good sense of humour, am romantic and love drives, fishing, camping and music. Love my 2 kids. Am looking for a lady with similar interests, aged between 38-45 for friendship/ possible relationship.



In [139]:
query = "seek AND build"
query = query.split(" AND ")

pl_1 = list(inverted_index[query[0]]['postings_list'])
pl_2 = list(inverted_index[query[1]]['postings_list'])
results = and_postings(pl_1, pl_2)
for rs in results:
    print(docs[rs]+"\n")

A professional business male, late 40s, 6 feet tall, slim build, well groomed, great personality, home owner, interests include the arts travel and all things good, Ringwood area, is seeking a genuine female of similar age or older, in same area or surrounds, for a meaningful long term rship. Looking forward to hearing from you all.



In [140]:
query = "the romantic and build as home"
normalized_query = text_preprocessing(query)

normalized_query

the romantic and build as home


['romantic', 'build', 'home']

In [142]:
results = or_postings(or_postings(list(inverted_index[normalized_query[0]]['postings_list']), list(inverted_index[normalized_query[1]]['postings_list'])), list(inverted_index[normalized_query[2]]['postings_list']))

for rs in results:
    print(docs[rs]+"\n")


A professional business male, late 40s, 6 feet tall, slim build, well groomed, great personality, home owner, interests include the arts travel and all things good, Ringwood area, is seeking a genuine female of similar age or older, in same area or surrounds, for a meaningful long term rship. Looking forward to hearing from you all.

GENUINE AND HONEST Hi Im 44 with a good sense of humour, am romantic and love drives, fishing, camping and music. Love my 2 kids. Am looking for a lady with similar interests, aged between 38-45 for friendship/ possible relationship.

