In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
import pandas as pd
import base64
import numpy as np
# import imageio
import os
import scipy
import gensim
import re
from copy import deepcopy
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
import gensim.corpora as corpora
import itertools
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
stops = stopwords.words('english')

''' This method removes all kinds of line breaks. '''
def removeLineBreaks(tweet):
    return re.sub("\n\r|\r\n|\n|\r"," ", tweet)

''' This method removes all the url's in the tweet'''
def removeURLs(tweet):
    return re.sub("(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?", " ", tweet)

''' This method removes all emojis from the tweet'''
def removeEmojis(tweet):
    tweet = tweet.encode('ascii', 'ignore').decode('ascii')
    return tweet

''' This method checks if the tweet is a retweet or not.
    a retweet contains RT @***** '''
def isRetweet(tweet):
    retweet = re.compile("RT @[A-Za-z0-9]*:")
    retweet.match(tweet)

    return bool(re.search("RT @[A-Za-z0-9]*:", tweet))

''' This method removes the retweet tag from tweets'''
def removeRTtag(tweet):
    return re.sub("RT @[A-Za-z0-9]*: ", " ", tweet)

''' This method removes all the mentions.
    mentions are usually with @'''
def removeMentions(tweet):
    return re.sub("@[A-Za-z0-9]*", " ", tweet)

''' This method removes multiple spaces.'''
def removeMultipleSpaces(tweet):
    return re.sub(" +", " ", tweet)

''' This method turns the tweets into lowercase. '''
def lowercasetweet(tweet):
    return tweet.lower()

''' This method removes all the punctuations from the tweet.'''
def removePunctuations(tweet):
    return re.sub("[.,!'\";:?…]+", " ", tweet)

''' This method removes special characters from tweets.'''
def removeSpecialCharacters(tweet):
    return re.sub("[@#$%^*(){}\\\<>\[\]~/|=\+\-&_¿ߒ]+"," ", tweet)

''' This method removes alpha-numeric charcters from the tweet.'''
def removeAlphaNumeric(tweet):
    # return re.sub("[A-Za-z]+[0-9]+", "", tweet)
    return re.sub("[0-9]+", "", tweet)

''' Lemmatization using nltk. '''
def lemmatizeTweet(tweet):
    return [WordNetLemmatizer().lemmatize(token) for token in word_tokenize(tweet)]

def cleanData(text, lowercase = False, remove_stops = False, stemming = False, lemmatization = False):
    txt = str(text)

    # Replace apostrophes with standard lexicons
    txt = txt.replace("isn't", "is not")
    txt = txt.replace("aren't", "are not")
    txt = txt.replace("ain't", "am not")
    txt = txt.replace("won't", "will not")
    txt = txt.replace("didn't", "did not")
    txt = txt.replace("shan't", "shall not")
    txt = txt.replace("haven't", "have not")
    txt = txt.replace("hadn't", "had not")
    txt = txt.replace("hasn't", "has not")
    txt = txt.replace("don't", "do not")
    txt = txt.replace("wasn't", "was not")
    txt = txt.replace("weren't", "were not")
    txt = txt.replace("doesn't", "does not")
    txt = txt.replace("'s", " is")
    txt = txt.replace("'re", " are")
    txt = txt.replace("'m", " am")
    txt = txt.replace("'d", " would")
    txt = txt.replace("'ll", " will")

    # Emoji replacement
    txt = re.sub(r':\)',r' happy ',txt)
    txt = re.sub(r':D',r' happy ',txt)
    txt = re.sub(r':P',r' happy ',txt)
    txt = re.sub(r':\(',r' sad ',txt)

    # Replace words like sooooooo with so
    txt = ''.join(''.join(s)[:2] for _, s in itertools.groupby(txt))
    return txt

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
final = np.load('/content/gdrive/My Drive/IR Assignment/IR_assignment.npy',allow_pickle=True)
uniq_words=[]
count={}
import os
import pickle
cd={}
index={}
for line in final:
    count_document={}
    cnt=0
    ll = lemmatizeTweet(removeMultipleSpaces(removeURLs(removeMentions(removeEmojis(removeSpecialCharacters(removePunctuations(removeAlphaNumeric(cleanData(removeLineBreaks(line[1].lower()))))))))))
    for word in ll:
        if word not in stops:
          cnt+=1
          try:
            count_document[word]+=1
          except:
            count_document[word]=1
          if word not in uniq_words:
            uniq_words.append(word)
            count[word]=1
            index[word]=[]
          else:
            count[word]+=1
          if str(line[0]+':'+str(cnt)) not in index[word]:
             index[word].append(str(line[0]+':'+str(cnt)))
    cd[line[0]]=deepcopy(count_document)

In [5]:
print(count['happy'])
print(index['party'])
print(cd['1']['sorry'])
print(len(uniq_words))

838
['1:14', '27:12', '126:161', '225:121', '344:39', '381:9', '436:86', '475:106', '475:108', '475:130', '475:143', '513:187', '513:191', '516:62', '561:33', '737:71', '752:13', '837:23', '986:38', '1164:20', '1200:9', '1497:23', '1863:73', '1863:434', '2202:128', '2317:7', '2317:14', '2492:22', '2673:5', '2773:57', '2920:77', '2990:8', '3026:70', '3039:57', '3203:7', '3223:305', '3909:15', '3909:54', '3942:30', '3960:46', '3996:30', '4065:71', '4065:160', '4065:183', '4278:30', '4278:36', '4506:25', '4527:16', '5020:48', '5159:13', '5210:146', '5268:78', '5371:199', '5638:566', '5638:793', '5638:797', '6158:93', '6158:142', '6169:36']
1
14337


In [6]:
print(final[0])

['1'
 'I\'m sorry to hear about the state you\'re in. Thank you for opening up.\n\nA few personal thoughts:\n\nre: college - if you have this perception of college, you\'ll do well in life :) While the party atmosphere is the most "visible" part of any campus, I can guarantee that there\'s people around you who feel the same way about it as you do. Try to connect with them somehow. You\'re not alone, I guarantee it. Life gets better once you\'re done college, however enjoy it while you\'re there. If your courseload is too heavy, try to even it out, and consider taking summer courses - you might not finish as fast as you\'d like, however you\'ll still finish faster than normal. Take electives you\'re interested in and enjoy.\n\nre: relationships - I know it isn\'t easy, however I usually try to tell people that a good number of people that I know that are in relationships wish they were single, and a good number of single people that I know wish they were in relationships - no matter wh

In [7]:

pickle_out = open("/content/gdrive/My Drive/IR Assignment/inverted_index.pickle","wb")
pickle.dump(index, pickle_out)
pickle_out.close()
pickle_out = open("/content/gdrive/My Drive/IR Assignment/count_word.pickle","wb")
pickle.dump(count, pickle_out)
pickle_out.close()
np.save("/content/gdrive/My Drive/IR Assignment/uniq_words.npy",uniq_words)
pickle_out = open("/content/gdrive/My Drive/IR Assignment/count_per_document.pickle","wb")
pickle.dump(cd, pickle_out)
pickle_out.close()