<a href="https://colab.research.google.com/github/jhgogoi/information_retrieval/blob/main/Information_Retrieval_CDAC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Information Retrieval

In [None]:
import os
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def read_content(file_path):
    """Return the text stored in ``file_path``, decoded as UTF-8.

    ``FileNotFoundError`` and ``PermissionError`` are not propagated: for
    either one a message naming the path is returned in place of the file's
    text, so in those cases the caller still receives a string.
    """
    try:
        with open(file_path, "r", encoding='utf8') as handle:
            text = handle.read()
    except FileNotFoundError:
        return f"File not found: {file_path}"
    except PermissionError:
        return f"Permission error reading: {file_path}"
    return text

def QueryExpansion(Q):
    """Expand a query with the WordNet lemma names of each of its words.

    Args:
        Q: The query as a whitespace-separated string.

    Returns:
        A single space-separated string containing the original query words
        followed by every lemma name of every synset ``wn.synsets`` returns
        for those words. Each distinct string appears once, in the order it
        was first seen, so the result is the same on every run.
    """
    terms = Q.split()
    # Start from the user's own words so that a word WordNet does not know
    # (no synsets) is still part of the expanded query instead of vanishing.
    expanded = list(terms)
    for term in terms:
        for syn in wn.synsets(term):
            expanded.extend(syn.lemma_names())
    # dict.fromkeys removes duplicates while keeping first-seen order; a set
    # would make the word order differ from one run to the next.
    return " ".join(dict.fromkeys(expanded))

def tokenize_content(content):
    """Lower-case ``content`` and return the tokens ``word_tokenize`` produces for it."""
    lowered = content.lower()
    return word_tokenize(lowered)


def remove_stopwords(tokens):
    """Return the tokens that are purely alphabetic and are not English stop words.

    The stop-word comparison is done on the lower-cased token; the token
    itself is returned unchanged. The stop-word list is loaded on each call.
    """
    english_stops = set(stopwords.words('english'))
    kept = []
    for word in tokens:
        if not word.isalpha():
            # Numbers, punctuation and mixed tokens are discarded.
            continue
        if word.lower() in english_stops:
            continue
        kept.append(word)
    return kept


def lemmatize_tokens(tokens):
    """Return a list with ``WordNetLemmatizer().lemmatize`` applied to each token, in order."""
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in tokens:
        lemmas.append(wordnet_lemmatizer.lemmatize(token))
    return lemmas


def Stemming(tokens):
    """Return a list with ``PorterStemmer().stem`` applied to each token, in order."""
    porter = PorterStemmer()
    stems = []
    for token in tokens:
        stems.append(porter.stem(token))
    return stems


def preprocess(file_content):
    """Run the full text-normalisation pipeline and return one space-joined string.

    The stages run in this order: ``tokenize_content``, ``remove_stopwords``,
    ``lemmatize_tokens``, ``Stemming``.
    """
    stems = Stemming(
        lemmatize_tokens(
            remove_stopwords(
                tokenize_content(file_content)
            )
        )
    )
    return " ".join(stems)

# Print the original tokens, filtered tokens, and lemmatized tokens of the second file in the lists


In [None]:
# Root folder of the document collection. Set the IR_CORPUS_DIR environment
# variable to point the notebook at another location; when it is unset the
# original path is used.
Folder = os.environ.get("IR_CORPUS_DIR", "D:\\Bunty\\CDAC\\Project")
Contents = []   # preprocessed text, one entry per file
FilePaths = []  # path of each file, aligned index-for-index with Contents
# Not filled in by this cell; kept so any code that expects these names still finds them.
Tokens = []
FilteredTokens = []
LemmatizedTokens = []


for root, dirs, files in os.walk(Folder):
    # os.walk yields entries in whatever order the filesystem returns them.
    # Sorting the sub-folders in place (which also fixes the order they are
    # descended into) and the file names makes the document numbering shown
    # by the later cells the same on every run.
    dirs.sort()
    for file in sorted(files):
        file_path = os.path.join(root, file)
        FilePaths.append(file_path)
        file_content = read_content(file_path)
        filter_token = preprocess(file_content)
        Contents.append(filter_token)

# os.walk yields nothing for a missing or empty folder, so say so here
# instead of leaving an empty index to be discovered later.
if not Contents:
    print(f"Warning: no files found under {Folder!r}; the index is empty.")


#print("File Path:", FilePaths)
#print("\nContent:", Contents)

# Print the file path, content.



In [None]:
# Read the search terms and echo them, then expand the query with synonyms
# and run it through the same preprocess() step used for the documents.
query = input("Enter your Query:")
print("Original Query:", query)
query = preprocess(QueryExpansion(query))
print("Expanded Preprocessed Query:", query)



Enter your Query:startup investor capital
Original Query: startup investor capital
Expanded Preprocessed Query: uppercas startup capit washington chapit cap inaugur great capit investor majuscul


In [None]:
# Bag-of-words retrieval: build count vectors for every document plus the
# query, then rank the documents by cosine similarity to the query.
vectorizer = CountVectorizer()

# The query is appended last, so row -1 is the query and rows [:-1] are the
# documents in the same order as Contents / FilePaths.
vectors2 = vectorizer.fit_transform(Contents + [query])

cosine_similarities1 = cosine_similarity(vectors2[-1], vectors2[:-1])

most_similar_index = cosine_similarities1.argmax()

# argmax() returns 0 when every score is zero, which would present the first
# document as the best match although nothing matched at all.
if cosine_similarities1[0][most_similar_index] <= 0:
    print("Warning: no document shares a term with the query; the result below is not a real match.")

# f-string: each expression inside {} is evaluated and inserted into the text.
most_similar_document = f"Document {most_similar_index + 1}:{FilePaths[most_similar_index]}"
print("Most Similar Document:")
print(most_similar_document)


Most Similar Document:
Document 21:D:\Bunty\CDAC\Project\Finance\Finance B\2 Corporate Finance.txt


In [None]:
# TF-IDF retrieval. The query is placed first, so row 0 of the fitted matrix
# is the query and rows 1 onward are the documents in the order of Contents.
trainset = [query, *Contents]

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix_train = tfidf_vectorizer.fit_transform(trainset)

# One row of scores: similarity of the query to each document.
print("cos sim:", cosine_similarity(tfidf_matrix_train[:1], tfidf_matrix_train[1:]))


cos sim: [[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.043993   0.         0.12397933 0.        ]]


In [None]:
# Rank the documents by TF-IDF cosine similarity to the query (row 0 of the
# matrix is the query, rows 1 onward are the documents) and show the winner.
cosine_similarities2 = cosine_similarity(tfidf_matrix_train[0:1], tfidf_matrix_train[1:])

most_similar_index = cosine_similarities2.argmax()

# argmax() returns 0 when every score is zero, which would present the first
# document as the best match although nothing matched at all.
if cosine_similarities2[0][most_similar_index] <= 0:
    print("Warning: no document shares a term with the query; the result below is not a real match.")

most_similar_document = f"Document {most_similar_index + 1}:{FilePaths[most_similar_index]}"
print("Most Similar Document:")
print(most_similar_document)
print(cosine_similarities2[0][most_similar_index])

# Highlight every query term inside the winning document's preprocessed text
# using ANSI colour codes (44 = blue background, 33 = yellow foreground).
text_main = Contents[most_similar_index]
highlight_list = query.split()
if highlight_list:
    # re.escape keeps a term that contains a regex metacharacter from being
    # read as pattern syntax.
    highlight_str = r"\b(?:" + '|'.join(re.escape(term) for term in highlight_list) + r")\b"
    # A replacement function is used instead of a "\g<0>" template: inside a
    # normal (non-raw) string "\g" is an invalid escape sequence.
    text_highlight = re.sub(
        highlight_str,
        lambda match: "\033[44;33m" + match.group(0) + "\033[m",
        text_main,
    )
else:
    # With no terms the pattern would be r"\b(?:)\b", which matches the empty
    # string at every word boundary and would scatter colour codes everywhere.
    highlight_str = ""
    text_highlight = text_main
print(text_highlight)


Most Similar Document:
Document 21:D:\Bunty\CDAC\Project\Finance\Finance B\2 Corporate Finance.txt
0.1239793330576185
corpor financ corpor financ refer financi activ relat run corpor divis depart usual set overse financi activ exampl larg compani may decid whether rais addit fund bond issu stock offer invest bank may advis firm consider help market secur [44;33mstartup[m may receiv [44;33mcapit[m angel [44;33minvestor[m ventur capitalist exchang percentag ownership compani thrive decid go public issu share stock exchang initi public offer ipo rais cash case budget [44;33mcapit[m properli effect compani growth goal may need decid project financ put hold type decis fall corpor financ
