In [11]:
from typing import List, Tuple, Dict
import re

In [12]:
class InvertedIndex:
    def __init__(self):
        self.dictionary = {}
        self.postings_lists = {}
        self.postings_list_id_counter = 0
        self.tweet_texts = {}

    def normalize_term(self, term: str) -> str:
        """
        Normalize the term by converting it to lowercase and removing any non-alphanumeric characters.
        """
        return re.sub(r"\W+", "", term.lower())
    
    def get_tweet_texts(self, tweet_ids: List[str]) -> List[str]:
        '''
        Get the text content of tweets given their IDs.
        '''
        return [self.tweet_texts[tweet_id] for tweet_id in tweet_ids if tweet_id in self.tweet_texts]

    def index(self, filename: str):
        """
        Index the documents in the given file.
        """
        with open(filename, "r", encoding="utf-8") as file:
            for line in file:
                columns = line.strip().split("\t")
                if len(columns) < 5:
                    continue
                tweet_id = columns[1]
                tweet_text = columns[4]
                self.tweet_texts[tweet_id] = tweet_text
                terms = tweet_text.split()
                unique_terms = set()
                for term in terms:
                    normalized_term = self.normalize_term(term)
                    if normalized_term and normalized_term not in unique_terms:
                        unique_terms.add(normalized_term)
                        if normalized_term not in self.dictionary:
                            postings_list_id = self.postings_list_id_counter
                            self.postings_lists[postings_list_id] = []
                            self.dictionary[normalized_term] = (0, postings_list_id)
                            self.postings_list_id_counter += 1
                        size, postings_list_id = self.dictionary[normalized_term]
                        postings_list = self.postings_lists[postings_list_id]
                        if not postings_list or postings_list[-1][0] != tweet_id:
                            postings_list.append((tweet_id, None))
                            self.dictionary[normalized_term] = (
                                size + 1,
                                postings_list_id,
                            )
                            if len(postings_list) > 1:
                                postings_list[-2] = (
                                    postings_list[-2][0],
                                    len(postings_list) - 1,
                                )

    def query(self, term: str) -> List[Tuple[str, int]]:
        """
        Query the index for a single term and return the postings list.
        """
        normalized_term = self.normalize_term(term)
        if normalized_term in self.dictionary:
            size, postings_list_id = self.dictionary[normalized_term]
            return self.postings_lists[postings_list_id]
        return []

In [13]:
class InvertedIndex(InvertedIndex):
    def intersect_postings_lists(
        self,
        postings_list1: List[Tuple[str, int]],
        postings_list2: List[Tuple[str, int]],
    ) -> List[str]:
        """
        Intersect two postings lists and return the common document IDs.
        """
        result = []
        iter1 = iter(postings_list1)
        iter2 = iter(postings_list2)
        posting1 = next(iter1, None)
        posting2 = next(iter2, None)
        while posting1 is not None and posting2 is not None:
            doc_id1, next_posting1 = posting1
            doc_id2, next_posting2 = posting2
            if doc_id1 == doc_id2:
                result.append(doc_id1)
                posting1 = next(iter1, None) if next_posting1 is not None else None
                posting2 = next(iter2, None) if next_posting2 is not None else None
            elif doc_id1 < doc_id2:
                posting1 = next(iter1, None) if next_posting1 is not None else None
            else:
                posting2 = next(iter2, None) if next_posting2 is not None else None
        return result

    def query(self, term1: str, term2: str = None) -> List[str]:
        """
        Query the index for one or two terms and return the document IDs.
        """
        postings_list1 = super().query(term1)
        if term2 is not None:
            postings_list2 = super().query(term2)
            return self.intersect_postings_lists(postings_list1, postings_list2)
        return [doc_id for doc_id, _ in postings_list1]


In [14]:
# Test the query method
index = InvertedIndex()
index.index("tweets.csv")

In [15]:
index.dictionary

{'knakatani': (11, 0),
 'chikonjugular': (1, 1),
 'joofford': (1, 2),
 'steveblogs11': (1, 3),
 'httpstcowhtarygnsy': (1, 4),
 'says': (203, 5),
 'lifetime': (24, 6),
 'risk': (293, 7),
 'of': (11572, 8),
 'cervical': (38, 9),
 'cancer': (685, 10),
 'in': (20707, 11),
 'japan': (114, 12),
 'is': (9369, 13),
 '1': (1448, 14),
 '100': (470, 15),
 'that': (8408, 16),
 'means': (207, 17),
 'hpv': (54, 18),
 'endemic': (1, 19),
 'and': (15759, 20),
 'screening': (32, 21),
 'not': (5413, 22),
 'working': (222, 23),
 'well': (1125, 24),
 'fischerkurt': (1, 25),
 'lady': (124, 26),
 'whats': (235, 27),
 'a': (13011, 28),
 'tumor': (734, 29),
 'kippcharts': (1, 30),
 'kings_of_metal': (1, 31),
 'ohne': (1044, 32),
 'diagnoseverdacht': (1, 33),
 'ist': (14585, 34),
 'es': (10964, 35),
 'nunmal': (27, 36),
 'schwer': (250, 37),
 'gerade': (1221, 38),
 'für': (7771, 39),
 'einen': (2888, 40),
 'hausarzt': (20, 41),
 'am': (4928, 42),
 'blutbild': (2, 43),
 'kann': (3715, 44),
 'man': (5693, 45),
 

In [32]:
resp = index.query("cancer", "vaccine")

print(resp)

index.get_tweet_texts(resp)

['968924030798581762', '971150719981113344', '979336440797638656']


['Cancer ‘vaccine’ eradicates tumors in mice, holds promise in humans - https://t.co/hLSVBUrN8w via @Shareaholic',
 'Australia could become first country to eradicate cervical cancer. Free HPV vaccine program in schools has led to a dramatic decline in future cervical cancer rates.  HPV (human papillomavirus) is a sexually transmitted infection that causes 99.9% of cervical cancers.#TheGuardian',
 "A Cancer 'Vaccine' Cured 97% of Tumors in Mice. What's That Mean for People? https://t.co/W08njxJXuD"]

In [29]:
# (side effects and malaria vaccines) or (side effects and covid vaccines)

resp = index.query("Corona")

print(resp)


['1017787802946293761']
