##### We have a small data set in the form of tweets. Each line in the file begins with a document ID, followed by the text of the tweet. Implement a function that creates an inverted index of these documents.

In [None]:
# Mounting Google Drive so the tweet corpus stored there can be read.
# This cell only works inside Google Colab (it relies on `google.colab`).


from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Importing required libraries


import re
import nltk
import pandas as pd
# Resources needed by the preprocessing cell: tokenizer models for
# `word_tokenize` and the English stop-word list.
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of the
# pickled 'punkt' models; older releases simply report the package as unknown
# and carry on, so requesting both keeps the notebook runnable on either.
nltk.download('punkt_tab')
nltk.download('stopwords')

In [None]:
# Reading the Twitter corpus from the text file


# The context manager closes the file even if reading fails part-way.
with open('/content/drive/My Drive/Colab Notebooks/NLP_Assignment_4/tweets_corpus.txt', 'r', encoding = "utf-8") as f:
    tweet_corpus = f.read()

In [None]:
tweet_corpus[:500]  # Previewing the start of the corpus (the full text would flood the output)

In [None]:
len(str(81499877556760576))  # Sanity check: this sample document ID is 17 digits long

In [None]:
# Document ID's from the Twitter corpus is taken out
# with regex and saved in a list


# Every line starts with its document ID, so the pattern is anchored to the
# start of a line. An unanchored search would also pick up 17-digit runs that
# occur inside a tweet's text and shift the IDs relative to the tweets.
# A leading byte-order mark, if present, is stripped so the first line matches.
doc_ids = re.findall(r'^\d{17}', tweet_corpus.lstrip('\ufeff'), flags=re.MULTILINE)
doc_ids[:3]

In [None]:
# Tweets from the corpus is taken out
# and saved in a list


# The text of a tweet runs from the first tab on its line to the end of that
# line. `\Z` also accepts the end of the file, so the last tweet is kept even
# when the file does not finish with a newline.
tweets = re.findall(r'\t(.*?)(?:\n|\Z)', tweet_corpus)
tweets[:3]

In [None]:
# Preprocessing text


# Importing libraries
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer #Stemming


# Removing punctuation: every punctuation character is replaced by a space.
# Note: `,-.` inside the character class is a range; it covers exactly
# the three characters `,`, `-` and `.`.
no_punc = [re.sub(r'[!"#$%&\'()*+,-./:;<=>?@\\^_`{|}~]', " ", tweet) for tweet in tweets]


# Tokenizing
tweet_token = [word_tokenize(tweet) for tweet in no_punc]


# English stop words
stop_words = set(stopwords.words('english'))


# Lower-casing each token, then dropping stop words and tokens shorter than
# 3 characters. The stop-word test must see the lower-cased token: testing the
# original casing let capitalised stop words such as "The" slip through.
filtered_tweets = []
for tokens in tweet_token:
    filtered_sent = []
    for w in tokens:
        lowered = w.lower()
        if len(w) >= 3 and lowered not in stop_words:
            filtered_sent.append(lowered)
    filtered_tweets.append(filtered_sent)


filtered_tweets[:2]

In [None]:
# Term frequency calculation

# tf = []
# for tweet in filtered_tweets:
#   tf.append([(x, tweet.count(x)/len(tweet)) for x in set(tweet)])

# tf[:2]

In [None]:
# Inverse Document Frequency

# docs_count = len(doc_ids)


# number of documents containing the word 
# idf = []
# for tweet in filtered_tweets:
#   count_doc = []
#   for w in tweet:
#     count_doc.append(len(inverted_index[w])/docs_count)
#   idf.append(count_doc[1:])

# idf[:2]

In [None]:
# Stemming tokens

# stem_tweet = []
# ps = PorterStemmer()
# for tweet in tweet_token:
#   stems = []
#   for w in tweet:
#     stems.append(ps.stem(w))
#   stem_tweet.append(stems)

In [None]:
# Showing the data

# One row per tweet: its document ID next to its preprocessed tokens.
df = pd.DataFrame({"Doc_ids": doc_ids, "Tweets": filtered_tweets})
df.head()

In [None]:
from collections import defaultdict

# Collect the document IDs of each term in a set first, so a term that occurs
# several times in one tweet contributes that tweet only once.
doc_ids_by_term = defaultdict(set)
for doc_id, tweet in zip(doc_ids, filtered_tweets):
    for w in tweet:
        doc_ids_by_term[w].add(doc_id)

# term -> sorted list of document IDs (the posting list).
# The merge-based AND/OR functions walk two lists in step and are only correct
# when both posting lists are sorted and free of duplicates.
inverted_index = defaultdict(list)
for term, ids in doc_ids_by_term.items():
    inverted_index[term] = sorted(ids)

inverted_index

<br>

##### Write a function that implements the merge algorithm. The code should allow intersecting the postings of two terms, as well as processing simple Boolean queries. When there are multiple query terms, make sure the algorithm uses the optimization described in the Manning book: perform the most restrictive intersection first.

In [None]:
def AND(posting1, posting2):
    """Intersect two posting lists with a single linear merge pass.

    Both lists are walked in step, so they are expected to be sorted in
    ascending order.

    Args:
        posting1: First posting list.
        posting2: Second posting list.

    Returns:
        A new list with the entries present in both inputs.
    """
    common = []
    i, j = 0, 0
    size1, size2 = len(posting1), len(posting2)
    while i < size1 and j < size2:
        left, right = posting1[i], posting2[j]
        if left == right:
            common.append(left)
            i += 1
            j += 1
            continue
        # Advance whichever side currently holds the smaller entry.
        if left > right:
            j += 1
        else:
            i += 1
    return common

In [None]:
# Documents whose postings contain both "cheddar" and "cheese".
and_ans = AND(inverted_index["cheddar"], inverted_index["cheese"])

for line in (
    "The documents with \"cheese\" and \"cheddar\" in them are:",
    and_ans,
    "\n",
    "Document(s) Contents:",
):
    print(line)
# Map every matching document ID back to its position to show the raw tweet.
for ans in and_ans:
    position = doc_ids.index(ans)
    print(tweets[position])

In [None]:
def OR(posting1, posting2):
    """Union two posting lists with a single linear merge pass.

    Both lists are walked in step, so they are expected to be sorted in
    ascending order; an entry present in both inputs is emitted once.

    Args:
        posting1: First posting list.
        posting2: Second posting list.

    Returns:
        A new list with the entries of either input.
    """
    merged = []
    i = j = 0
    while i < len(posting1) and j < len(posting2):
        left, right = posting1[i], posting2[j]
        if left == right:
            merged.append(left)
            i += 1
            j += 1
        elif left > right:
            merged.append(right)
            j += 1
        else:
            merged.append(left)
            i += 1
    # At most one of the inputs still has entries left; copy them over.
    merged.extend(posting1[i:])
    merged.extend(posting2[j:])
    return merged

In [None]:
# Documents whose postings contain "cheddar" or "cookies".
or_ans = OR(inverted_index["cheddar"], inverted_index["cookies"])

for line in (
    "The documents with \"cookies\" or \"cheddar\" in them are:",
    or_ans,
    "\n",
    "Document(s) Contents:",
):
    print(line)
# Map every matching document ID back to its position to show the raw tweet.
for ans in or_ans:
    position = doc_ids.index(ans)
    print(tweets[position])

In [None]:
def querying(query):
    """Evaluate a simple Boolean query against ``inverted_index``.

    Supported syntax (case-insensitive): terms joined by ``and`` / ``or``,
    optionally grouped with parentheses, e.g. ``"egg and cheese"``,
    ``"egg and cheese and cheesecake"`` or
    ``"(egg and cheese) or (cookies and cream)"``.

    Evaluation rules:
      * Operators at the same level are applied left to right; there is no
        precedence between ``and`` and ``or``, so use parentheses to group.
      * When every operator at a level is ``and``, the operands are
        intersected starting with the shortest posting lists (the most
        restrictive intersection first).
      * Two operands with no operator between them are combined with ``or``.

    Args:
        query: The query string.

    Returns:
        A sorted list of matching document IDs; ``[]`` when nothing matches
        or the query contains no terms.
    """
    tokens = re.findall(r'\(|\)|[^\s()]+', query.lower())
    result, _ = _evaluate_tokens(tokens, 0)
    return result


def _postings_for(term):
    """Return a sorted, duplicate-free copy of the postings of ``term``."""
    # .get() avoids adding an empty entry to the defaultdict for unknown terms;
    # sorting and de-duplicating guarantees what the AND/OR merges rely on.
    return sorted(set(inverted_index.get(term, [])))


def _evaluate_tokens(tokens, pos):
    """Evaluate tokens from ``pos`` up to the closing ``)`` of this level.

    Returns the resulting postings and the position of the next unread token.
    """
    operands, operators = [], []
    pending_op = None
    while pos < len(tokens):
        token = tokens[pos]
        pos += 1
        if token == ")":
            break
        if token in ("and", "or"):
            pending_op = token
            continue
        if token == "(":
            operand, pos = _evaluate_tokens(tokens, pos)
        else:
            operand = _postings_for(token)
        if operands:
            # Adjacent operands without an explicit operator are OR-ed.
            operators.append(pending_op or "or")
        operands.append(operand)
        pending_op = None
    return _combine_postings(operands, operators), pos


def _combine_postings(operands, operators):
    """Combine posting lists; ``operators[k]`` joins operands k and k+1."""
    if not operands:
        return []
    if all(op == "and" for op in operators):
        # Pure conjunction: intersect the shortest lists first so that the
        # intermediate results stay as small as possible.
        ordered = sorted(operands, key=len)
        result = ordered[0]
        for other in ordered[1:]:
            if not result:
                break  # an empty intersection cannot grow again
            result = AND(result, other)
        return result
    result = operands[0]
    for op, other in zip(operators, operands[1:]):
        result = AND(result, other) if op == "and" else OR(result, other)
    return result

In [None]:
querying("egg and cheese")  # two terms joined by AND

In [None]:
querying("egg and cheese and cheesecake")  # three terms, all joined by AND

In [None]:
querying("egg or cheese")  # two terms joined by OR

In [None]:
querying("egg and cheese or cookies")  # mixed AND/OR query without parentheses

In [None]:
querying("(egg and cheese) or (cookies and cream)")  # two parenthesised AND groups joined by OR

In [None]:
# Code for optimization

# posting_len = []
# for n in terms:
#   posting_len.append(len(inverted_index[n]))

# sorted_terms_list = [x for _,x in sorted(zip(posting_len,terms))]
# print(sorted_terms_list)

##### Extend the system from Problem 2 to perform simple TF-IDF scoring of the retrieved results. There is no need to worry about any weight normalizations.

In [None]:
# Document frequency calculation
# Reference: https://towardsdatascience.com/tf-idf-for-document-ranking-from-scratch-in-python-on-real-world-dataset-796d339a4089
#
# NOTE: despite its name, `tf` maps each term to the number of tweets that
# contain it (its document frequency). The name is kept because the
# TF-IDF cell reads `tf[t]`.

# First collect, per term, the positions of the tweets it occurs in.
tweets_containing = {}
for i, words in enumerate(filtered_tweets):
    for w in words:
        # setdefault replaces the former bare `except:`, which would also
        # have hidden unrelated errors.
        tweets_containing.setdefault(w, set()).add(i)

tf = {t: len(positions) for t, positions in tweets_containing.items()}

tf

In [None]:
# Tf-idf

from collections import Counter
import numpy as np

# (document ID, term) -> tf-idf weight
tf_idf = {}
docs_total = len(tweets)

for doc_id, words in zip(doc_ids, filtered_tweets):
    counter = Counter(words)
    words_count = len(words)
    for t, occurrences in counter.items():
        # Local names are used on purpose: assigning to `tf` here would replace
        # the document-frequency dict read on the next line (the original
        # failed on `tf[t]` for that reason), and assigning to `df` would
        # overwrite the DataFrame shown earlier.
        term_freq = occurrences / words_count
        doc_freq = tf[t]  # number of tweets containing the term
        inverse_doc_freq = np.log(docs_total / (doc_freq + 1))
        tf_idf[doc_id, t] = term_freq * inverse_doc_freq

tf_idf