<a href="https://colab.research.google.com/github/jaimebaldeon/RAI/blob/master/infoRetrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Extract files from ZIP and save their names**

In [None]:
# importing required modules 
from zipfile import ZipFile 
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# specifying the zip file name 
file_name = "Ficheros(html).zip"

# opening the zip file in READ mode 
with ZipFile(file_name, 'r') as zip: 
	# extracting the names of the files in the zip
	file_names = zip.namelist()
	# extracting all the files 
	print('Extracting all the files now...') 
	zip.extractall() 
	print('Done!') 


Extracting all the files now...
Done!


**Read files and clean data**

In [None]:
clean_data = []
print('Cleaning raw data...')
for file_name in file_names:
  with open(file_name, 'r') as file:
      rawdata = file.read().replace('\n', '')
      clean_script = re.compile('<script.*?</script>')
      clean_script_data = re.sub(clean_script, '', rawdata)
      clean_htmltags = re.compile('<.*?>')
      clean_htmltags_data = re.sub(clean_htmltags, ' ', clean_script_data)
      clean_data.append(re.sub('\s+',' ',clean_htmltags_data))
print('Done!')


Cleaning raw data...
Done!


**Create corpus with all the files**

In [None]:
corpus = clean_data
corpus

[' Gerard Salton: Facts, Discussion Forum, and Encyclopedia Article Home &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Discussion &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Topics &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Dictionary &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Almanac Signup &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Login Gerard Salton Gerard Salton Topic Home Discussion Discussion Ask a question about \' Gerard Salton \' Start a new discussion about \' Gerard Salton \' Answer questions from other users Full Discussion Forum &nbsp; Encyclopedia Gerard Salton (8 March, 1927&nbsp;in Nuremberg Nuremberg Nuremberg is a city in the German state of Bavaria, in the administrative region of Middle Franconia. It is situated on the Pegnitz river and the Rhine-Main-Danube Canal and is Franconia\'s largest city. It is located about 170 kilometres north of Munich, at 49.27° N 11.5° E. The population is... &nbsp;- 28 August, 1995) was a Professor of Computer Science Computer science Computer science is the study of the theoretical foundat

**Initialize CountVectorizer and tokenize corpus**

In [None]:
vectorizer = CountVectorizer()
# tokenization
matriz_tf = vectorizer.fit_transform(corpus)
vectorizer.get_feature_names()

In [None]:
# Matrix with token occurrences 
matriz_tf.toarray()

array([[0, 4, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 2],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0]])

In [None]:
# Analysis of corpus documents
analyze = vectorizer.build_analyzer()
for documento in corpus: print(analyze(documento))

['gerard', 'salton', 'facts', 'discussion', 'forum', 'and', 'encyclopedia', 'article', 'home', 'nbsp', 'nbsp', 'nbsp', 'nbsp', 'nbsp', 'discussion', 'nbsp', 'nbsp', 'nbsp', 'nbsp', 'nbsp', 'topics', 'nbsp', 'nbsp', 'nbsp', 'nbsp', 'nbsp', 'dictionary', 'nbsp', 'nbsp', 'nbsp', 'nbsp', 'nbsp', 'almanac', 'signup', 'nbsp', 'nbsp', 'nbsp', 'nbsp', 'nbsp', 'nbsp', 'login', 'gerard', 'salton', 'gerard', 'salton', 'topic', 'home', 'discussion', 'discussion', 'ask', 'question', 'about', 'gerard', 'salton', 'start', 'new', 'discussion', 'about', 'gerard', 'salton', 'answer', 'questions', 'from', 'other', 'users', 'full', 'discussion', 'forum', 'nbsp', 'encyclopedia', 'gerard', 'salton', 'march', '1927', 'nbsp', 'in', 'nuremberg', 'nuremberg', 'nuremberg', 'is', 'city', 'in', 'the', 'german', 'state', 'of', 'bavaria', 'in', 'the', 'administrative', 'region', 'of', 'middle', 'franconia', 'it', 'is', 'situated', 'on', 'the', 'pegnitz', 'river', 'and', 'the', 'rhine', 'main', 'danube', 'canal', 'an

In [None]:
# Create bigram vectorizer (more semantic information)
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), 
                                    token_pattern=r'\b\w+\b', min_df=1)
analyze = bigram_vectorizer.build_analyzer()

In [None]:
bmatriz_tf = bigram_vectorizer.fit_transform(corpus)
bigram_vectorizer.get_feature_names()

**Create queries and tokenize**

In [None]:
query = [
    "What video game won Spike's best driving game award in 2006?"
]
query

["What video game won Spike's best driving game award in 2006?"]

In [None]:
query_tf = vectorizer.transform(query)
bquery_tf = bigram_vectorizer.transform(query)
query_tf.toarray()

array([[0, 0, 0, ..., 0, 0, 0]])

**Analyze query similarity with the documents**

In [None]:
# Similarity with Scalar Product TF

num_files = matriz_tf.get_shape()[0]
q = query_tf.toarray().flatten()
scalar_prod_TF = []
for i in range(num_files):
  doc = matriz_tf.getrow(i).toarray().flatten()
  scalar_prod_TF.append(q @ doc )
scalar_prod_TF

[32, 4, 265, 7, 85]

In [None]:
# Similarity with Scalar Product TF

num_files = bmatriz_tf.get_shape()[0]
q = bquery_tf.toarray().flatten()
scalar_prod_TF = []
for i in range(num_files):
  doc = bmatriz_tf.getrow(i).toarray().flatten()
  scalar_prod_TF.append(q @ doc )
scalar_prod_TF

[40, 9, 315, 8, 110]

In [None]:
# Similarity with Cosine TF

print(cosine_similarity(query_tf, matriz_tf))
cosine_similarity(bquery_tf, bmatriz_tf)

[[0.07960544 0.01758297 0.25982465 0.04103896 0.13234642]]


array([[0.07709992, 0.02838665, 0.2469478 , 0.03446707, 0.13051428]])

**Initialize TfidfVectorizer (different token weights) and tokenize corpus**


In [None]:
tfidf_vectorizer = TfidfVectorizer()
matriz_tfidf = tfidf_vectorizer.fit_transform(corpus)
# Token weights in each document
matriz_tfidf.toarray()

array([[0.        , 0.04673153, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.02424865,
        0.0484973 ],
       [0.        , 0.        , 0.0057586 , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.02683986, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.00777997, 0.        , 0.        , ..., 0.00777997, 0.        ,
        0.        ]])

In [None]:
tfidf_bigram_vectorizer = TfidfVectorizer(ngram_range=(1, 2), 
                                    token_pattern=r'\b\w+\b', min_df=1)
bmatriz_tfidf = tfidf_bigram_vectorizer.fit_transform(corpus)

In [None]:
# Total tokens' weight
tfidf_vectorizer.idf_

array([2.09861229, 1.69314718, 2.09861229, ..., 2.09861229, 2.09861229,
       2.09861229])

In [None]:
tfidf_vectorizer.get_feature_names()

**Transform query into vector**

In [None]:
query_tfidf = tfidf_vectorizer.transform(query)
bquery_tfidf = tfidf_bigram_vectorizer.transform(query)
query_tfidf.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

**Analyze similarity between query and documents**

In [None]:
# Similarity with Scalar Product TF IDF
num_files = matriz_tfidf.get_shape()[0]

# Tf*idf computation of words of the query 
q_tfidf = query_tf.toarray().flatten() * tfidf_vectorizer.idf_.flatten()  

scalar_prod_TFIDF = []
for i in range(num_files):
  doc_tfidf = matriz_tf.getrow(i).toarray().flatten() * tfidf_vectorizer.idf_.flatten()
  scalar_prod_TFIDF.append(q_tfidf @ doc_tfidf )
scalar_prod_TFIDF

[37.672429377866834,
 4.0,
 674.5467239631978,
 13.150606405687153,
 163.48554785416889]

In [None]:
# Similarity with Scalar Product TF IDF
num_files = bmatriz_tfidf.get_shape()[0]

# Tf*idf computation of words of the query 
q_tfidf = bquery_tf.toarray().flatten() * tfidf_bigram_vectorizer.idf_.flatten()  

scalar_prod_TFIDF = []
for i in range(num_files):
  doc_tfidf = bmatriz_tf.getrow(i).toarray().flatten() * tfidf_bigram_vectorizer.idf_.flatten()
  scalar_prod_TFIDF.append(q_tfidf @ doc_tfidf )
scalar_prod_TFIDF

[45.672429377866834,
 9.0,
 830.0761036458107,
 14.150606405687153,
 188.4855478541689]

In [None]:
# Similarity with Cosine TF IDF

print(cosine_similarity(query_tfidf, matriz_tfidf))
cosine_similarity(bquery_tfidf, bmatriz_tfidf)

[[0.04379845 0.00778745 0.31187222 0.03512459 0.10211862]]


array([[0.03677348, 0.0114982 , 0.28285294, 0.02418815, 0.08175035]])