In [1]:
# Colab-only step: mount Google Drive so that files under /content/gdrive
# can be read by the later cells.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import pandas as pd
import base64
import numpy as np
from math import log
import os
import scipy
import gensim
import re
from copy import deepcopy
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
import gensim.corpora as corpora
import itertools
import nltk
# Fetch the NLTK data packages used in this notebook: the stop-word corpus
# (read just below), the punkt tokenizer models (used by word_tokenize) and
# WordNet (used by WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
# English stop words as a list; query tokens found in it are dropped later.
stops = stopwords.words('english')

def removeLineBreaks(tweet):
    ''' Replace every line break in the tweet with a single space.

        Two-character breaks ("\\n\\r" and "\\r\\n") are tried first, so each of
        them collapses to one space rather than two.
    '''
    line_break = re.compile("\n\r|\r\n|\n|\r")
    return line_break.sub(" ", tweet)

def removeURLs(tweet):
    ''' Replace every http/https/ftp URL in the tweet with a single space.

        The scheme is matched case-sensitively, and a host with at least one
        dot is required. The pattern is a raw string so that regex escapes
        such as \\w and \\. are not parsed as (invalid) Python string escapes.
    '''
    url_pattern = r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"
    return re.sub(url_pattern, " ", tweet)

def removeEmojis(tweet):
    ''' Strip emojis by dropping every non-ASCII character from the tweet.

        Note that this also removes accented letters and any other character
        outside the ASCII range, not only emojis.
    '''
    ascii_only = tweet.encode('ascii', errors='ignore')
    return ascii_only.decode('ascii')

def isRetweet(tweet):
    ''' Return True when the tweet contains a retweet tag of the form "RT @handle:".

        The tag may appear anywhere in the text. Handles may contain letters,
        digits and underscores, so "RT @john_doe:" is recognised as well.
    '''
    return bool(re.search(r"RT @[A-Za-z0-9_]*:", tweet))

def removeRTtag(tweet):
    ''' Replace each retweet tag ("RT @handle: ", including the trailing space)
        with a single space.

        Handles may contain letters, digits and underscores.
    '''
    return re.sub(r"RT @[A-Za-z0-9_]*: ", " ", tweet)

def removeMentions(tweet):
    ''' Replace every @mention in the tweet with a single space.

        A mention is "@" followed by any run of letters, digits or underscores.
        The run may be empty, so a lone "@" is replaced as well.
    '''
    return re.sub(r"@[A-Za-z0-9_]*", " ", tweet)

def removeMultipleSpaces(tweet):
    ''' Collapse every run of space characters into one space.

        Only the space character is affected; tabs and line breaks are kept.
    '''
    space_run = re.compile(" +")
    return space_run.sub(" ", tweet)

def lowercasetweet(tweet):
    ''' Return a lowercase copy of the tweet. '''
    lowered = tweet.lower()
    return lowered

def removePunctuations(tweet):
    ''' Replace each run of punctuation marks with a single space.

        Covered marks: . , ! ' " ; : ? and the ellipsis character.
    '''
    punctuation_run = re.compile("[.,!'\";:?…]+")
    return punctuation_run.sub(" ", tweet)

def removeSpecialCharacters(tweet):
    ''' Replace each run of special characters with a single space.

        Covered characters: @ # $ % ^ * ( ) { } \\ < > [ ] ~ / | = + - & _ and
        the two non-ASCII characters listed at the end of the class. The
        pattern is a raw string so that regex escapes such as \\[ and \\- are
        not parsed as (invalid) Python string escapes.
    '''
    special_run = r"[@#$%^*(){}\\<>\[\]~/|=\+\-&_¿ߒ]+"
    return re.sub(special_run, " ", tweet)

def removeAlphaNumeric(tweet):
    ''' Delete every run of digits from the tweet.

        Despite the name, letters are left untouched: only the characters 0-9
        are removed, and nothing is inserted in their place.
    '''
    digit_run = re.compile("[0-9]+")
    return digit_run.sub("", tweet)

def lemmatizeTweet(tweet):
    ''' Tokenize the tweet with nltk and return the list of lemmatized tokens.

        Each token is lemmatized with WordNetLemmatizer using its default
        part of speech.
    '''
    # Build the lemmatizer once and share it across tokens, instead of
    # constructing a new instance for every token.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in word_tokenize(tweet)]

def cleanData(text, lowercase = False, remove_stops = False, stemming = False, lemmatization = False):
    ''' Expand contractions, map a few emoticons to words and shorten
        stretched character runs.

        text is converted with str() first. The lowercase, remove_stops,
        stemming and lemmatization flags are accepted but currently ignored:
        nothing in the body reads them.

        Contraction matching is case-sensitive ("Didn't" is not expanded), and
        the generic suffix rules rewrite every "'s" as " is", possessives
        included.
    '''
    # Order matters: the whole-word negations must be expanded before the
    # generic suffix rules at the end of the table.
    contractions = (
        ("isn't", "is not"),
        ("aren't", "are not"),
        ("ain't", "am not"),
        ("won't", "will not"),
        ("didn't", "did not"),
        ("shan't", "shall not"),
        ("haven't", "have not"),
        ("hadn't", "had not"),
        ("hasn't", "has not"),
        ("don't", "do not"),
        ("wasn't", "was not"),
        ("weren't", "were not"),
        ("doesn't", "does not"),
        ("'s", " is"),
        ("'re", " are"),
        ("'m", " am"),
        ("'d", " would"),
        ("'ll", " will"),
    )
    # Emoticon pattern -> replacement word (padded with spaces).
    emoticons = (
        (r':\)', r' happy '),
        (r':D', r' happy '),
        (r':P', r' happy '),
        (r':\(', r' sad '),
    )

    cleaned = str(text)
    for contraction, expansion in contractions:
        cleaned = cleaned.replace(contraction, expansion)
    for pattern, word in emoticons:
        cleaned = re.sub(pattern, word, cleaned)

    # Cap every run of one repeated character at two, e.g. "sooooooo" -> "soo".
    # This applies to all characters, spaces included.
    pieces = []
    for _, run in itertools.groupby(cleaned):
        pieces.append(''.join(run)[:2])
    return ''.join(pieces)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [3]:
import pickle

# Folder on the mounted Google Drive that holds the precomputed artifacts.
DATA_DIR = "/content/gdrive/My Drive/IR Assignment"


def _load_pickle(filename):
    ''' Unpickle DATA_DIR/filename and close the file handle afterwards. '''
    # SECURITY NOTE: pickle.load can execute arbitrary code embedded in the
    # file. Only load artifacts that you produced yourself.
    with open(DATA_DIR + "/" + filename, "rb") as pickle_in:
        return pickle.load(pickle_in)


# How the scoring cells use these objects (schemas are not visible in this
# notebook, so treat the descriptions as assumptions to confirm):
#   ii[term]      -> iterable of "doc_id:position" strings
#   cw[term]      -> number used as the denominator of the term weight
#   cd[doc][term] -> number used as the numerator of the term weight
#   ss[word]      -> iterable of (similar_term, similarity) pairs
ii = _load_pickle("inverted_index.pickle")
cw = _load_pickle("count_word.pickle")
cd = _load_pickle("count_per_document.pickle")
ss = _load_pickle("similarity_score.pickle")
# allow_pickle=True has the same trust requirement as pickle.load above.
documents = np.load(DATA_DIR + '/IR_assignment.npy',allow_pickle=True)

In [4]:
search = input("Enter search query:")

# Normalisation steps, applied top to bottom. URLs and mentions are removed
# BEFORE punctuation and special characters: those later steps delete ':',
# '/', '.' and '@', after which the URL and mention patterns can never match.
# Lowercasing stays first so that cleanData's case-sensitive contraction rules
# apply; as a consequence its ":D" / ":P" emoticon rules cannot fire here.
query_cleaners = (
    lowercasetweet,
    removeLineBreaks,
    removeURLs,
    removeMentions,
    cleanData,
    removeAlphaNumeric,
    removePunctuations,
    removeSpecialCharacters,
    removeEmojis,
    removeMultipleSpaces,
)
cleaned_search = search
for cleaner in query_cleaners:
    cleaned_search = cleaner(cleaned_search)

# Lemmatized tokens of the cleaned query, stop words still included.
ll = lemmatizeTweet(cleaned_search)

# `stops` is a list; a set gives constant-time membership tests.
stop_words = set(stops)
query = [word.lower() for word in ll if word not in stop_words]

Enter search query:I didn't really feel I was worth anyones time here today too


In [6]:
# Start every document (keyed by the first field of its row) with an empty
# hit list. Re-running this cell resets all previously collected hits.
doc_score = {line[0]: [] for line in documents}

In [7]:
# For each query word, look up its similar terms and record one weighted hit
# per posting of each similar term:
#   doc_score[doc_id] gets [weight, query_word, position]
#   weight = -similarity * log(cd[doc_id][term] / cw[term])
# NOTE: this cell appends to doc_score, so re-running it without first
# re-running the doc_score initialisation duplicates every hit.
for word in query:
    # A query word with no similarity entry contributes nothing; skip it
    # instead of failing with KeyError on out-of-vocabulary input.
    if word not in ss:
        continue
    for similar_word in ss[word]:
        term, similarity = similar_word[0], similar_word[1]
        # Likewise skip similar terms that have no postings in the index.
        if term not in ii:
            continue
        for posting in ii[term]:
            # Postings are "doc_id:position" strings.
            fields = posting.split(':')
            doc_id, position = fields[0], fields[1]
            weight = -1 * similarity * log(cd[doc_id][term] / cw[term])
            doc_score[doc_id].append([weight, word, position])

In [8]:
# Spot check: show the [weight, query_word, position] hits collected for
# document id '1'.
print(doc_score['1'])

[[7.7488913372555315, 'feel', '22'], [5.16696298892839, 'feel', '106'], [3.7596282297817676, 'feel', '59'], [3.7596282297817676, 'feel', '69'], [3.7596282297817676, 'feel', '77'], [3.7596282297817676, 'feel', '82'], [5.8805329864007, 'worth', '90'], [6.904750769961838, 'time', '102'], [6.904750769961838, 'time', '121']]


In [9]:
# Ad-hoc inspection: print the value stored for 'happy' in document '1'
# (presumably an occurrence count — confirm against how cd was built), then
# display that many leading hits of document '1'.
# NOTE(review): 'happy' is not one of the query words in the run shown, so
# this slice looks like leftover debugging rather than part of the scoring.
print(cd['1']['happy'])
doc_score['1'][0:cd['1']['happy']]

2


[[7.7488913372555315, 'feel', '22'], [5.16696298892839, 'feel', '106']]

In [10]:
# Final score per document:
#   * the sum of all hit weights, plus
#   * a proximity bonus for every pair of hits that come from DIFFERENT query
#     words and lie at most 9 positions apart:
#     (1 - distance/10) * weight_a * weight_b
# Documents without hits keep a score of 0.
score = {}
for key, hits in doc_score.items():
    total = 0
    for first_idx, first in enumerate(hits):
        total += (first[0])
        # Pair this hit with each later one exactly once.
        for second in hits[first_idx + 1:]:
            if first[1] == second[1]:
                continue
            d = abs(int(first[-1]) - int(second[-1]))
            if d <= 9:
                total += (1-d/10)*(first[0])*(second[0])
    score[key] = total

In [11]:
# Show the 11 highest-scoring (doc_id, score) pairs, best first.
print(sorted(score.items(),key=lambda x:x[1],reverse = True)[:11])

[('1317', 801.8298714840205), ('1350', 750.1965114437847), ('1014', 632.8548861741922), ('376', 627.6751856655794), ('1522', 620.7945943764591), ('5282', 586.3396966862915), ('3786', 582.8185049118176), ('5152', 579.9301423300436), ('315', 555.7873558675733), ('5569', 554.4699753262745), ('5143', 550.0932483008615)]
