<a href="https://colab.research.google.com/github/isaacsgabe/School/blob/main/NLP/information_retrival.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Remote Excel workbooks holding the two news collections
file_path_pal = 'https://github.com/rosenfa/ai/blob/master/news-pal.xlsx?raw=true'
file_path = 'https://github.com/rosenfa/ai/blob/master/news-israel.xlsx?raw=true'

# Read each workbook into its own DataFrame
df_pal = pd.read_excel(file_path_pal)
df_israel = pd.read_excel(file_path)

# Only the text column is needed downstream
content_pal, content_isr = df_pal['Content'], df_israel['Content']

# Stack both text columns into one Series with a fresh 0..n-1 index
combined_content = pd.concat(
    [content_pal, content_isr],
    ignore_index=True,
)

In [None]:
import re

# Define a function to remove non-alphanumeric characters
def remove_special_characters(text):
    """Strip every character that is not an ASCII letter, digit, or whitespace.

    Parameters
    ----------
    text : str, or a missing / non-string cell value
        Raw text to clean. ``None`` and NaN are treated as empty text; any
        other non-string value is converted with ``str()`` before cleaning.

    Returns
    -------
    str
        The cleaned text (``''`` for missing values).
    """
    # NaN is the only float that is not equal to itself; without this guard
    # re.sub would raise TypeError on a missing cell.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return cleaned_text

# Clean every entry of the combined Series element-wise, then wrap the
# result in a single-column DataFrame for the cells that follow
cleaned_content = combined_content.map(remove_special_characters)
cleaned_content_df = pd.DataFrame(cleaned_content)

In [None]:
# Show the cleaned DataFrame as the cell's output to eyeball the result of the cleaning step
cleaned_content_df

Unnamed: 0,Content
0,Qatar the mediator in the Gaza truce talks ha...
1,Hamas says it holds US President Joe Biden and...
2,Israeli attacks in the besieged Gaza Strip amo...
3,Israeli warplanes and artillery bomb several a...
4,Israel respects no international laws or conve...
...,...
945,White nationalists usually antiMuslim bigots a...
946,Why Israel agreed to the hostage deal and how ...
947,With the rise of campus antisemitism around th...
948,WZO offers mobile showers for IDF soldiers in ...


In [None]:
import nltk
# Fetch the NLTK stopword corpus; stopwords.words('english') is read from it in a later cell
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords


# Corpus to index: the cleaned text column
documents = cleaned_content_df['Content']

# Build the vectorizer, learn the vocabulary and produce the
# document-term TF-IDF matrix
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Vocabulary terms, in the same order as the matrix columns
terms = tfidf_vectorizer.get_feature_names_out()

# Column sums: each term's TF-IDF weight accumulated over all documents,
# flattened to a 1-D array
tfidf_scores = tfidf_matrix.sum(axis=0).A1

# term -> accumulated TF-IDF weight
term_tfidf_scores = {term: score for term, score in zip(terms, tfidf_scores)}

# (term, weight) pairs, heaviest first
sorted_terms_by_tfidf = sorted(
    term_tfidf_scores.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Report the ten heaviest terms
top_terms = sorted_terms_by_tfidf[:10]
print("Top 10 most popular terms based on TF-IDF scores:")
for term, score in top_terms:
    print(f"{term}: {score}")

# NLTK's standard English stopword list
standard_stopwords = set(stopwords.words('english'))

# Which of the ten heaviest terms are ordinary stopwords?
common_stopwords = {term for term, _ in top_terms}.intersection(standard_stopwords)

print("\nCommon stopwords in most popular terms and standard English stopwords:")
print(common_stopwords)


Top 10 most popular terms based on TF-IDF scores:
the: 73.43963133807459
in: 54.15357233642159
of: 47.98256478749286
to: 44.89618487851124
gaza: 36.48897858988237
israeli: 33.222692420766094
and: 32.996486353376966
on: 28.314743164390812
israel: 26.083059190954355
for: 22.20100574446302

Common stopwords in most popular terms and standard English stopwords:
{'on', 'for', 'the', 'of', 'to', 'in', 'and'}


In [None]:
# Search word
search_word = "jazeera"  # Change this to the word you want to search for

# Run the query through the vectorizer's own preprocessing step so it is
# normalised the same way as the indexed text; otherwise a query such as
# "Jazeera" would be reported as missing even though the term is indexed.
normalized_search_word = tfidf_vectorizer.build_preprocessor()(search_word).strip()

# Column index of the term in the TF-IDF matrix (None when the term is absent)
search_word_index = tfidf_vectorizer.vocabulary_.get(normalized_search_word)

if search_word_index is None:
    print(f"The word '{search_word}' is not found in the documents.")
else:
    # TF-IDF weight of the search word in every document (1-D, one entry per document)
    search_word_tfidf_scores = tfidf_matrix[:, search_word_index].toarray().flatten()

    # Document positions ordered from highest to lowest weight
    sorted_indices = search_word_tfidf_scores.argsort()[::-1]

    # Keep at most 5 results, and only documents whose weight is non-zero,
    # i.e. documents that actually contain the word. This also avoids an
    # IndexError when the corpus holds fewer than 5 documents.
    num_matches = int((search_word_tfidf_scores > 0).sum())
    top_indices = sorted_indices[:min(5, num_matches)]

    # NOTE: documents[...] is indexed with matrix row positions, so `documents`
    # must be a list or a Series whose index is 0..n-1.
    top_5_documents = [documents[document_index] for document_index in top_indices]
    top_5_tfidf_scores = [search_word_tfidf_scores[document_index] for document_index in top_indices]

    # Print the highest-ranked documents and their TF-IDF scores
    print(f"Top {len(top_5_documents)} documents containing the word '{search_word}':")
    for i, (document, tfidf_score) in enumerate(zip(top_5_documents, top_5_tfidf_scores), 1):
        print(f"Document {i}:")
        print(document)
        print(f"TF-IDF Score: {tfidf_score}\n")

Top 5 documents containing the word 'jazeera':
Document 1:
BREAKING Dozens of casualties in an Israeli bombardment targeting the Indonesian Hospital in northern Gaza according to Al Jazeera The hospital has been besieged by Israeli tanks for over three days
TF-IDF Score: 0.223597299954548

Document 2:
Al Jazeera correspondent reports that the number of martyred journalists since the start of the Israeli war on the Gaza Strip has risen to 60Israel
TF-IDF Score: 0.22285080932138243

Document 3:
Magen Inons parents were killed on October 7 by Hamas He talks to Al Jazeera about their loss and trying to create hope among communities
TF-IDF Score: 0.20705054768866316

Document 4:
What do Israelis and Palestinians think about Israels war on Gaza Al Jazeera producer Nils Adler travelled from Eilat to occupied East Jerusalem to find out
TF-IDF Score: 0.19185665591803577

Document 5:
Al Jazeera talks to the president of the Committee to Protect Journalists Jodie Ginsberg on Israels alleged delib

In [None]:
!pip install rank-bm25

Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2


In [None]:
from rank_bm25 import BM25Okapi

def search_documents_bm25(documents, search_terms, top_k=5):
    """Rank ``documents`` against a free-text query with BM25 (Okapi).

    Both the query and the documents are lower-cased before whitespace
    tokenisation, so matching is case-insensitive ("israel" matches
    "Israel"). The returned documents are the original, unmodified strings.

    Parameters
    ----------
    documents : iterable of str
        The corpus (list, pandas Series, ...). It is materialised into a
        list, so results are looked up by position regardless of any index
        the container carries.
    search_terms : str
        Whitespace-separated query terms.
    top_k : int, default 5
        Maximum number of results to return.

    Returns
    -------
    tuple[list, list]
        ``(top_documents, top_scores)``, best match first. Both lists are
        empty when the corpus is empty.
    """
    corpus = list(documents)
    if not corpus:
        # Nothing to rank; skip building a BM25 index over an empty corpus.
        return [], []

    # Tokenize query and documents identically (case-folded, whitespace split)
    search_tokens = search_terms.lower().split()
    tokenized_documents = [document.lower().split() for document in corpus]

    # Build the BM25 index and score every document against the query
    bm25 = BM25Okapi(tokenized_documents)
    scores = bm25.get_scores(search_tokens)

    # (position, score) pairs, highest score first; ties keep corpus order
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)[:top_k]

    top_5_documents = [corpus[doc_idx] for doc_idx, _ in ranked]
    top_5_scores = [score for _, score in ranked]

    return top_5_documents, top_5_scores

# Multi-word query handed to the BM25 search
# NOTE(review): "occupyng" looks like a misspelling of "occupying" - confirm whether it is intentional
search_terms = "Is israel occupyng gaza"

# Rank the corpus against the query
top_5_documents, top_5_scores = search_documents_bm25(documents, search_terms)

# Report each returned document with its rank and BM25 score
print(f"Top 5 documents matching the search terms '{search_terms}' using BM25:")
i = 0
for document, score in zip(top_5_documents, top_5_scores):
    i += 1
    print(f"Document {i}:")
    print(document)
    print(f"BM25 Score: {score}\n")


Top 5 documents matching the search terms 'Is israel occupyng gaza' using BM25:
Document 1:
Is AI deciding who s and dies in Gaza Israel is using itsFire Factory algorithm in the war on Gaza raising questions about ethics and human rightsAl Jazeeras MissSamJohnson explores in Digital Dilemma
BM25 Score: 5.289417681670379

Document 2:
Is there any chance for the development of a true dialogue between Israelis and Palestinians or for that matter is there any chance for the development of a true dialogue within Israeli society with all its schisms and divides  opinion
BM25 Score: 4.772947145236903

Document 3:
 Qatar the mediator in the Gaza truce talks has confirmed that a humanitarian pause has been agreed on
BM25 Score: 0.0

Document 4:
Hamas says it holds US President Joe Biden and his administration directly responsible for the ethnic cleansing committed by Israeli forces at Gazas alShifa Hospital  s
BM25 Score: 0.0

Document 5:
Israeli attacks in the besieged Gaza Strip amount towar

In [None]:
import numpy as np
import gensim.downloader

def preprocess_text(text):
    """Lower-case ``text`` and split it on whitespace into a list of tokens."""
    lowered = text.lower()
    tokens = lowered.split()
    return tokens

def compute_embedding(tokens, model):
    """Average the word vectors of the tokens that ``model`` knows.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to embed; tokens not present in ``model`` are skipped.
    model : word-vector model
        Must support ``token in model`` and ``model[token]``. For the
        all-unknown fallback it must either expose ``vector_size`` (as
        gensim ``KeyedVectors`` do) or be a plain mapping of token -> vector.

    Returns
    -------
    numpy.ndarray
        Mean of the known tokens' vectors, or an all-zero vector of the
        model's dimensionality when no token is known.
    """
    embeddings = [model[token] for token in tokens if token in model]
    if embeddings:
        return np.mean(embeddings, axis=0)

    # Fallback for text with no known token. The previous implementation read
    # ``model.vocab``, which gensim 4 removed, so this path raised instead of
    # returning zeros.
    vector_size = getattr(model, "vector_size", None)
    if vector_size is not None:
        return np.zeros(int(vector_size), dtype=np.float32)
    # Plain mapping: borrow shape and dtype from any stored vector.
    return np.zeros_like(model[next(iter(model))])

def search_documents(texts, search_terms, model, top_k=5):
    """Rank ``texts`` by cosine similarity to the query in embedding space.

    Each text and the query are tokenised with ``preprocess_text`` and
    embedded with ``compute_embedding``.

    Parameters
    ----------
    texts : sequence of str
        Documents to rank; must support positional indexing.
    search_terms : str
        Free-text query.
    model : word-vector model
        Passed through to ``compute_embedding``.
    top_k : int, default 5
        Maximum number of results to return.

    Returns
    -------
    tuple[list, list]
        ``(top_documents, top_scores)``, most similar first.
    """
    search_embedding = compute_embedding(preprocess_text(search_terms), model)
    search_norm = np.linalg.norm(search_embedding)

    similarities = []
    for text in texts:
        doc_embedding = compute_embedding(preprocess_text(text), model)
        denominator = search_norm * np.linalg.norm(doc_embedding)
        if denominator > 0:
            similarities.append(np.dot(search_embedding, doc_embedding) / denominator)
        else:
            # A zero vector (no known token) has no direction: score it 0.0
            # rather than 0/0 = NaN, which would make the sort order unreliable.
            similarities.append(0.0)

    # (position, similarity) pairs, highest similarity first; ties keep input order
    ranked = sorted(enumerate(similarities), key=lambda pair: pair[1], reverse=True)[:top_k]
    top_documents = [texts[idx] for idx, _ in ranked]
    top_scores = [score for _, score in ranked]
    return top_documents, top_scores

# Load the pretrained GloVe vectors only if this kernel does not already hold
# them. The load used to be commented out, so on a fresh kernel `glove_model`
# was undefined and the search below raised NameError.
if 'glove_model' not in globals():
    glove_model = gensim.downloader.load('glove-wiki-gigaword-300')

# Plain list of cleaned texts so results can be looked up by position
documents = cleaned_content_df['Content'].tolist()
search_terms = "Al Jazeera is bad"
top_documents, top_scores = search_documents(documents, search_terms, glove_model)

# Report each returned document with its rank and cosine similarity
print(f"Top 5 documents matching the search terms '{search_terms}':")
for i, (document, score) in enumerate(zip(top_documents, top_scores), 1):
    print(f"Document {i}:")
    print(document)
    print(f"Similarity Score: {score}\n")


Top 5 documents matching the search terms 'Al Jazeera is bad':
Document 1:
There were a lot of files against my organisation against me as an individualMohammed Zubair cofounder of Alt News a factchecking website in India tells Al Jazeera about the backlash he faced debunking misinformation from proIsrael accounts
Similarity Score: 0.7099060416221619

Document 2:
The EUs Foreign Policy Chief Josep Borrell faces social media backlash after a viral Al Jazeera Arabic interview He labels Hamas Oct 7 attack on Israel a war crime but when questioned about Israels attacks on Gaza he responds Im not a lawyer
Similarity Score: 0.6918182373046875

Document 3:
26This one wasnt just pushed by social media but by Qatars Hamassupporting Al Jazeera network which over the weekend took one halfsentence from a Haaretz report and distorted it to feed the theory
Similarity Score: 0.6917501091957092

Document 4:
Dr Ghassan Abu Sittah who worked at both Gazas AlShifa and AlAlhi Baptist hospitals tells Al Ja

**Link to the BERT notebook:** https://colab.research.google.com/drive/1W2AV5kFGihegJNnZWUBdWo-3gKfG1KqO?usp=sharing