<b>Loading Libraries</b>

In [1]:
import time
import pprint
import glob
import nltk
from nltk.corpus import stopwords
import string
import pandas as pd
import string
import pickle
import gensim
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import stem_text
from nltk.tokenize import word_tokenize

<b>Preprocessing and tokenizing function</b>

In [2]:
# Bracket placeholders: the raw text spells parentheses as -LRB- / -RRB-, which
# become the bare tokens 'lrb' / 'rrb' after lower-casing and punctuation
# stripping. They carry no content, so they are filtered out.
_BRACKET_TOKENS = frozenset(['lrb', 'rrb'])


def preprocess(text):
    """Turn raw document (or query) text into a list of stemmed tokens.

    Steps: lower-case, replace newlines with spaces, strip punctuation,
    stem every word, tokenize, and drop the bracket placeholder tokens.

    Parameters
    ----------
    text : str
        Raw text; may span several newline-separated lines.

    Returns
    -------
    list of str
        Stemmed, lower-cased tokens in their original order.
    """
    # Newlines become spaces, not empty strings: deleting them would fuse the
    # last word of one line with the first word of the next whenever the line
    # does not end in punctuation (e.g. "1536 in philosophy\n").
    normalized = text.lower().replace("\n", " ")
    stemmed = stem_text(strip_punctuation(normalized))
    return [word for word in word_tokenize(stemmed) if word not in _BRACKET_TOKENS]

<b>Initializing global variables </b>

In [7]:
# Pretty-printer configured with a two-space indent.
pp = pprint.PrettyPrinter(indent=2)

# Input files to index. Only one wiki shard is listed; the commented-out lines
# are the earlier "glob the whole directory" variants.
#files = glob.glob('C:/Users/vallabh/Desktop/Web text/web search project/wiki-pages-text/*.txt')
#files = [file for file in files if file != '/home/rohan/Documents/Homework/WSTA/wiki-pages-text/to_test.txt']
files = ['/home/vallabh/Documents/wsta/wiki-pages-text/wiki-001.txt']

# Accumulators filled by the file-reading cell (doc id -> concatenated text).
file_dict = {}
doc_dict = {}

# Empty placeholder; replaced once the documents have been read.
df = pd.DataFrame()

# Text-normalisation helpers (not referenced by the cells visible in this notebook).
punc_table = str.maketrans('', '', string.punctuation)
stemmer = nltk.stem.PorterStemmer()

<b> Reading files into a dictionary </b>

In [8]:
start = time.time()
# Start from an empty mapping on every run. Without this reset, re-executing
# the cell would find every key already present and append each document's
# text to itself a second time.
file_dict = {}
for file in files:
    with open(file, encoding = 'utf-8') as f:
        print(file)
        for line in f:
            # First field is the document id and the third is the text; the
            # middle field is ignored (presumably a sentence index — TODO confirm).
            fields = line.split(maxsplit=2)
            try:
                key, value = fields[0], fields[2]
            except IndexError as e:
                # Fewer than three fields (e.g. a blank line): report and skip it.
                print("ERROR",e, fields)
                continue
            # Lines that share a document id are concatenated in file order.
            file_dict[key] = file_dict.get(key, "") + value
# Copy into doc_dict once, after every file is read, instead of re-copying the
# whole accumulated mapping after each individual file.
doc_dict.update(file_dict)
print("Time taken: ", time.time() - start)

/home/vallabh/Documents/wsta/wiki-pages-text/wiki-001.txt
Time taken:  0.6422502994537354


<b> Loading into Dataframe </b>

In [9]:
# One row per document: the dictionary key becomes 'doc_id' and the
# concatenated text becomes 'doc'.
df = pd.DataFrame(
    list(doc_dict.items()),
    columns=['doc_id', 'doc'],
)

<b> Pickle dataframe </b>

In [10]:
# Base name (no directory, nothing after the first dot) of the last file
# processed; `file` is the loop variable left over from the file-reading cell.
pkl_name = file.rsplit('/', 1)[-1].partition(".")[0]
# The output name is fixed; pkl_name is not used to build it.
df.to_pickle("all_files" + ".pickle")

<b> Load pickled dataframe </b>

In [11]:
# Reload the dataframe from 'all_files.pickle', replacing the in-memory `df`.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# that were produced locally by this notebook.
df = pd.read_pickle('all_files.pickle')

<b> Print dataframe and preprocess document</b>

In [12]:
# Preview the raw frame, then add a 'doc_processed' column holding the token
# list produced by preprocess() for each document, and time the step.
print(df.head())
start = time.time()
df['doc_processed'] = df['doc'].apply(preprocess)
print("Number of documents:", len(df))
print("Time taken:", time.time() - start)

                                     doc_id  \
0              1928_in_association_football   
1                           1986_NBA_Finals   
2     1901_Villanova_Wildcats_football_team   
3  1992_Northwestern_Wildcats_football_team   
4       1897_Princeton_Tigers_football_team   

                                                 doc  
0  The following are the football -LRB- soccer -R...  
1  The 1986 NBA Finals was the championship round...  
2  The 1901 Villanova Wildcats football team repr...  
3  The 1992 Northwestern Wildcats team represente...  
4  The 1897 Princeton Tigers football team repres...  
Number of documents: 48228
Time taken: 38.382105112075806


<b> Inspecting the dataframe with preprocessed docs </b>

In [13]:
# Display the dataframe, which now includes the 'doc_processed' token column.
df

Unnamed: 0,doc_id,doc,doc_processed
0,1928_in_association_football,The following are the football -LRB- soccer -R...,"[the, follow, ar, the, footbal, soccer, event,..."
1,1986_NBA_Finals,The 1986 NBA Finals was the championship round...,"[the, 1986, nba, final, wa, the, championship,..."
2,1901_Villanova_Wildcats_football_team,The 1901 Villanova Wildcats football team repr...,"[the, 1901, villanova, wildcat, footbal, team,..."
3,1992_Northwestern_Wildcats_football_team,The 1992 Northwestern Wildcats team represente...,"[the, 1992, northwestern, wildcat, team, repre..."
4,1897_Princeton_Tigers_football_team,The 1897 Princeton Tigers football team repres...,"[the, 1897, princeton, tiger, footbal, team, r..."
5,1536_in_philosophy,1536 in philosophy\n,"[1536, in, philosophi]"
6,...Di_terra,... di terra is the eighth studio album by Ita...,"[di, terra, is, the, eighth, studio, album, by..."
7,1967–68_MJHL_season,"On March 14 , 1968 , at home in St. James , th...","[on, march, 14, 1968, at, home, in, st, jame, ..."
8,1998_All-Ireland_Senior_Hurling_Championship,The All-Ireland Senior Hurling Championship of...,"[the, all, ireland, senior, hurl, championship..."
9,1942_Pittsburgh_Steelers_season,The 1942 Pittsburgh Steelers season marked the...,"[the, 1942, pittsburgh, steeler, season, mark,..."


<b> Create Gensim dictionary </b>

In [14]:
start = time.time()
# Build the token <-> integer-id vocabulary from every document's token list.
dictionary = gensim.corpora.Dictionary(
    documents=df['doc_processed'].tolist(),
)
print(dictionary)
print("Time taken:", time.time() - start)

Dictionary(60085 unique tokens: ['1928', 'ar', 'event', 'follow', 'footbal']...)
Time taken: 2.8859429359436035


<b> Creating BOW representation of documents </b>

In [15]:
start = time.time()
# Convert each token list into its bag-of-words form: a list of
# (token_id, count) pairs, using the ids assigned by `dictionary`.
corpus = list(map(dictionary.doc2bow, df['doc_processed'].tolist()))
print(corpus[:5])
print("Time taken:", time.time() - start)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 4), (8, 1), (9, 1), (10, 1)], [(3, 1), (5, 4), (7, 32), (9, 1), (10, 2), (11, 1), (12, 2), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 3), (20, 1), (21, 1), (22, 1), (23, 2), (24, 6), (25, 1), (26, 2), (27, 3), (28, 3), (29, 1), (30, 1), (31, 1), (32, 1), (33, 2), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 4), (40, 2), (41, 4), (42, 1), (43, 1), (44, 2), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 11), (52, 1), (53, 1), (54, 1), (55, 1), (56, 5), (57, 3), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 4), (64, 3), (65, 3), (66, 1), (67, 4), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 12), (81, 1), (82, 1), (83, 1), (84, 3), (85, 1), (86, 1), (87, 1), (88, 4), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 4), (96, 1), (97, 1), (98, 1), (99, 4), (100, 3), (101, 2), (102, 1), (103, 2), (104, 1), (105, 1), (1

<b> Creating tf-idf model from this corpus </b>

In [16]:
start = time.time()
# Fit the tf-idf weighting on the bag-of-words corpus.
tf_idf = gensim.models.TfidfModel(corpus)
# Total number of (token_id, count) entries summed over all documents.
s = sum(map(len, corpus))
print(s)
print("Time taken:", time.time() - start)

2017570
Time taken: 0.6155540943145752


<b> Creating Similarity index </b>

In [18]:
start = time.time()
# Index the tf-idf-weighted corpus for similarity queries. The first argument
# is the path prefix gensim is given for its index files; the feature count is
# the vocabulary size.
sims = gensim.similarities.Similarity(
    '/home/vallabh/Documents/wsta/work-dir',
    tf_idf[corpus],
    num_features=len(dictionary),
)
print("Time taken: ", time.time() - start)

Time taken:  30.320961952209473


<b> Getting query and showing documents related to the query </b>

In [48]:
query = input("Enter statement: ")
start = time.time()

# Push the query through the same pipeline as the documents:
# tokens -> bag-of-words -> tf-idf weights.
query_doc = preprocess(query)
query_doc_bow = dictionary.doc2bow(query_doc)
query_doc_tf_idf = tf_idf[query_doc_bow]
print(query_doc_tf_idf)

# Score the query against every indexed document (one score per dataframe row)
# and store the scores next to the documents.
sim_list = sims[query_doc_tf_idf]
df['similarity_score'] = sim_list
print(type(sim_list), len(sim_list))

# Show the five best-scoring documents.
print(df.nlargest(5, 'similarity_score'))
print("Time taken:", time.time() - start)

Enter statement: R. Kelly created an audio work
[(196, 0.45636315809861067), (208, 0.13994694389163598), (241, 0.34749265728627055), (398, 0.3526760228562367), (3892, 0.38618578910750584), (16340, 0.6147166283349593)]
<class 'numpy.ndarray'> 48228
                                      doc_id  \
6603                  12_Nights_of_Christmas   
2949                                 12_Play   
22981  1985_Washington_Huskies_football_team   
3163                          100%_Kelly_Key   
36043          1989_Singapore_Open_–_Singles   

                                                     doc  \
6603   12 Nights of Christmas is the first holiday an...   
2949   12 Play is the debut studio album by American ...   
22981  The 1985 Washington Huskies football team was ...   
3163   100 % Kelly Key is a compilation album by Braz...   
36043  Kelly Jones won in the final 6 -- 1 , 7 -- 5 a...   

                                           doc_processed  similarity_score  
6603   [12, night, of, ch

In [50]:
# Full text of the document in the row labelled 2949.
df.loc[2949, 'doc']

"12 Play is the debut studio album by American R&B and soul musician R. Kelly ; it was released on November 9 , 1993 , by Jive Records .\nThe album follows his tenure with R&B group Public Announcement , with whom he released one album , Born into the 90 's -LRB- 1992 -RRB- .\nIt went on to top the R&B albums chart for nine weeks straight , while reaching the second position on the US Billboard 200 chart .\nThe album features the sexually-themed singles `` Bump n ' Grind '' -LRB- US ; number 1 -RRB- , `` Your Body 's Callin ' '' -LRB- US ; number 13 -RRB- , and the more overtly direct `` Sex Me , Pts .\n1 & 2 '' -LRB- US ; number 20 -RRB- .\nThe album serves as the first of a trilogy of albums Kelly later released under the `` 12 Play '' moniker including TP-2 .\ncom -LRB- 2000 -RRB- and TP-3 : Reloaded -LRB- 2005 -RRB- .\nSince its initial mixed response from critics , 12 Play has received more favorable retrospective criticism .\nThis album earned Kelly the title King of R&B .\n"

In [20]:
import json

# Read the training claims. The context manager closes the file handle as soon
# as parsing finishes (a bare json.load(open(...)) leaves it open), and the
# explicit encoding avoids depending on the platform's default.
with open('/home/vallabh/Documents/wsta/train.json', encoding='utf-8') as train_fp:
    train_file = json.load(train_fp)
    

In [24]:
# Map each evidence element that is also a known document id to the training
# record that cites it. If several records cite the same document, the last
# one iterated wins.
d_train = {}
for record in train_file.values():
    for evidence_item in record['evidence']:
        # Every element of the evidence entry is tested; only those that are
        # keys of doc_dict are kept.
        for element in evidence_item:
            if element in doc_dict:
                d_train[element] = record
                       

In [26]:
# Pretty-print the entire d_train mapping (the output can be very long).
pprint.pprint(d_train)

{'"Heroes"_-LRB-David_Bowie_album-RRB-': {'claim': 'David Bowie has an album '
                                                   'called Heroes released in '
                                                   '1977.',
                                          'evidence': [['"Heroes"_-LRB-David_Bowie_album-RRB-',
                                                        0],
                                                       ['David_Bowie', 16]],
                                          'label': 'SUPPORTS'},
 "'Til_Death": {'claim': "Krysten Ritter was on the show 'Til Death from "
                         '1998-1999.',
                'evidence': [['Krysten_Ritter', 3], ["'Til_Death", 0]],
                'label': 'REFUTES'},
 '1000_Ways_to_Die': {'claim': 'Ron Perlman died April 13, 1950.',
                      'evidence': [['Ron_Perlman', 5],
                                   ['Ron_Perlman', 4],
                                   ['Blade_II', 0],
                                

TypeError: 'in <string>' requires string as left operand, not list

In [78]:
# Read the training claims. The context manager closes the file handle once
# parsing finishes (a bare json.load(open(...)) leaves it open), and the
# explicit encoding matters on Windows, where the default is not UTF-8.
with open('C:/Users/vallabh/Desktop/Web text/web search project/train.json', encoding='utf-8') as train_fp:
    train_json = json.load(train_fp)

In [79]:
# Check which container type the parsed JSON's top level is.
type(train_json)

dict

{'claim': 'Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.',
 'label': 'SUPPORTS',
 'evidence': [['Fox_Broadcasting_Company', 0], ['Nikolaj_Coster-Waldau', 7]]}