In [13]:
import pandas as pd
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer 
import nltk
import bisect
import pygtrie
import os

# NLTK data used by the preprocessing functions below:
#   stopwords - stop-word list read by remove_stopword
#   punkt     - tokenizer models behind word_tokenize
#   wordnet   - corpus that WordNetLemmatizer looks words up in (my_lemmatize)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

In [17]:
#dataset
# Folder holding the TelevisionNews CSV files. Set the TELEVISION_NEWS_DIR
# environment variable to use a different location; when it is unset the
# original local path is used.
folder_path=os.environ.get(
    "TELEVISION_NEWS_DIR",
    "C:\\Users\\Archana\\Documents\\SEM_7\\3_AIR\\TelevisionNews",
)
# os.listdir returns names in arbitrary order; sorting makes files[0] and the
# order of the per-file indexes the same on every run.
files=sorted(os.listdir(folder_path))
print(len(files))

418


In [3]:
# Read the first file of the dataset and show which columns it provides.
file0="\\".join([folder_path,files[0]])
df=pd.read_csv(file0)
print(df.columns)

Index(['URL', 'MatchDateTime', 'Station', 'Show', 'IAShowID', 'IAPreviewThumb',
       'Snippet'],
      dtype='object')


In [18]:
# function to remove stopwords
def remove_stopword(text):
    """Remove English stop words from ``text``.

    The text is tokenized with ``word_tokenize``; tokens found in NLTK's
    English stop-word list are dropped and the remaining tokens are joined
    with single spaces. Matching is case-sensitive, exactly as the list is
    stored. A few stop words (negations, directions, ordering words) are
    deliberately kept.

    Parameters:
        text: string to filter.

    Returns:
        The filtered text as one space-separated string.
    """
    # The stop-word set is identical for every call, so build it once and
    # keep it on the function object instead of re-reading the corpus for
    # every row of every file.
    stop_words=getattr(remove_stopword,"_stop_words",None)
    if stop_words is None:
        # retaining some stopwords
        retained={"not","no","up","down","under","above","below","own","on",
                  "off","out","through","won","against","now","before","after"}
        # Set difference, unlike set.remove, does not raise KeyError if the
        # installed stop-word list happens to lack one of the retained words.
        stop_words=frozenset(stopwords.words('english'))-retained
        remove_stopword._stop_words=stop_words

    return " ".join(w for w in word_tokenize(text) if w not in stop_words)


In [19]:
# function to remove special characters and punctuations
def clean_text(text):
    """Strip punctuation and line breaks from ``text``.

    Newlines, carriage returns and punctuation characters are each replaced
    by one space (runs of spaces are not collapsed). Apostrophes and
    backticks are deleted instead, so a contraction stays a single token.

    Parameters:
        text: string to clean.

    Returns:
        The cleaned string.
    """
    # NOTE: "." is in the list below, so a decimal such as 3.5 becomes "3 5".
    # The backslash is written as "\\" because "\^" is not a valid escape.
    to_space='\n\r!"#$%&()*+,-./:;<=>?@[]\\^_{}|~'
    # One table does both jobs: map every listed character to a space and
    # delete quote marks (third argument of str.maketrans).
    table=str.maketrans(to_space," "*len(to_space),"'`")

    return text.translate(table)
    

In [20]:
# lemmatization
def my_lemmatize(text):
    """Lemmatize each whitespace-separated word of ``text``.

    A WordNetLemmatizer is created for the call and applied to every word
    with its default arguments; the results are joined with single spaces.
    """
    wnl=WordNetLemmatizer()
    return " ".join(wnl.lemmatize(token) for token in text.split())




In [21]:
# wrapper function, called for each file
# returns a dataframe with new preprocessed column "Text" 
def pre_process(file_path,col="Snippet"):
    """Read a CSV file and add a preprocessed ``Text`` column.

    Every value of column ``col`` is passed through ``clean_text``,
    ``remove_stopword`` and ``my_lemmatize``, in that order.

    Parameters:
        file_path: path of the CSV file to read.
        col: name of the column holding the raw text.

    Returns:
        The DataFrame read from ``file_path`` with the extra ``Text`` column.

    Raises:
        Whatever ``pd.read_csv`` raises for an unreadable or empty file, and
        ``KeyError`` if ``col`` is not a column of the file.
    """
    df=pd.read_csv(file_path)
    # A missing cell is read as NaN (a float), which clean_text cannot handle;
    # treat it as empty text so one blank snippet does not fail the whole file.
    df["Text"]=[
        my_lemmatize(remove_stopword(clean_text("" if pd.isna(row) else str(row))))
        for row in df[col]
    ]

    return df
    
        

In [22]:
# updating positional index
def update_trie(term,docid,pos,trie):
    """Record that ``term`` occurs at position ``pos`` of document ``docid``.

    ``trie`` maps each term to a dict of ``{docid: [positions]}`` and is
    modified in place; nothing is returned.
    """
    # First time this term is seen: start its postings dict.
    if term not in trie:
        trie[term]={docid:[pos]}
        return

    postings=trie[term]
    # Term is known but not yet in this document.
    if docid not in postings:
        postings[docid]=[pos]
        return

    # Insert so that the position list stays sorted.
    bisect.insort(postings[docid],pos)

In [23]:
# function to create index for each file
# returns (normal index, reverse index)
# trie node: key,value pairs
#            key - <term>, value- {docId1: [pos1, pos2, pos3...], docId2: [pos1,pos2...]}
def index_construction(file_path):
    """Build the positional indexes for one CSV file.

    The file is preprocessed with ``pre_process``; each row of its ``Text``
    column is a document whose id is the row number. Every token is stored
    with its position in a forward trie, and its reversed spelling is stored
    with the same position in a second trie.

    Returns:
        A tuple ``(index_trie, rev_trie)`` of ``pygtrie.CharTrie`` objects.
    """
    texts=pre_process(file_path)["Text"]

    forward=pygtrie.CharTrie()
    backward=pygtrie.CharTrie()

    for doc_id,text in enumerate(texts):
        for position,token in enumerate(word_tokenize(text)):
            update_trie(token,doc_id,position,forward)
            # reversed spelling goes into the reverse index
            update_trie(token[::-1],doc_id,position,backward)

    return (forward,backward)

In [25]:
# Build the index for the first file only.
# index_construction returns a pair, so index_trie[0] is the normal index
# and index_trie[1] is the reverse index.
index_trie=index_construction("\\".join([folder_path,files[0]]))

In [15]:
# creating index for each file
# all_indexes collects one (normal index, reverse index) pair per file that
# could be indexed; files that fail are reported and skipped.
all_indexes=[]
for i in files:
    try:
        all_indexes.append(index_construction(folder_path+"\\"+i))
    except pd.errors.EmptyDataError:
        print(i) # file CNN.200910.csv is empty in the dataset
    except Exception as err:
        # Still skip the file, but print the reason so that a real failure
        # is not mistaken for an empty file. Unlike a bare except, this lets
        # KeyboardInterrupt stop the loop.
        print(i,repr(err))
print("Done")

CNN.200910.csv


In [16]:
# Number of files indexed successfully; a file that raised during
# index_construction was skipped and is not in all_indexes.
print(len(all_indexes))

417
