<a href="https://colab.research.google.com/github/ibrahimalish/ibrahimalish/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np

# Function to clean text
def clean_text(text):
    #  text to lowercase
    text = text.lower()
    # Remove punctuation
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])
    return text


data = pd.read_csv('sample.csv')

# Selecting a smaller subset of data for testing
data_subset = data[['ARTICLE_ID', 'SECTION_TEXT']].head(100)

#PRE-PROCESSING
# Droping rows with missing values
data_subset.dropna(inplace=True)


def tokenize_text(text):
    return text.strip().split()

# Tokenizing SECTION_TEXT for each row
for index, row in data_subset.iterrows():
    tokenized_text = tokenize_text(row['SECTION_TEXT'])
    print(f"Article ID: {row['ARTICLE_ID']}, Tokenized Text: {tokenized_text}")

# Generating unique IDs for every word
word_ids = {}
next_word_id = 0

for _, group in data_subset.groupby('ARTICLE_ID'):
    for section_text in group['SECTION_TEXT']:
        tokens = tokenize_text(section_text)
        for word in tokens:
            if word not in word_ids:
                word_ids[word] = next_word_id
                next_word_id += 1


for word, word_id in word_ids.items():
    print(f"Word: {word}, Word ID: {word_id}")

# Computing word frequencies for each article
article_word_frequencies = {}

for article_id, group in data_subset.groupby('ARTICLE_ID'):
    word_frequencies = {}
    for section_text in group['SECTION_TEXT']:
        tokens = tokenize_text(section_text)
        for word in tokens:
            word_id = word_ids.setdefault(word, next_word_id)
            if word_id == next_word_id:
                next_word_id += 1
            word_frequencies[word_id] = word_frequencies.get(word_id, 0) + 1
    article_word_frequencies[article_id] = word_frequencies

# Displaying freq
for article_id, word_frequencies in article_word_frequencies.items():
     print(f"Article ID: {article_id}, Word Frequencies: {word_frequencies}")

#Idf values
total_documents = len(article_word_frequencies)
idf_values = {}

for term, word_id in word_ids.items():
    document_count = sum(1 for word_freqs in article_word_frequencies.values() if word_id in word_freqs)
    idf_values[word_id] = document_count / total_documents

#Display idf
for word_id, idf in idf_values.items():
    print(f"Word ID: {word_id}, IDF: {idf}")

# TF/IDF weights
tfidf_weights = {}

for article_id, word_frequencies in article_word_frequencies.items():
    tfidf_weights[article_id] = {}
    total_words_in_document = sum(word_frequencies.values())
    for word_id, frequency in word_frequencies.items():
        tf = frequency / total_words_in_document
        idf = idf_values.get(word_id, 0)
        tfidf_weights[article_id][word_id] = tf * idf

# print  TF-IDF weights
for article_id, weights in tfidf_weights.items():
    print(f"Article ID: {article_id}")
    for word_id, weight in weights.items():
        if weight != 0:
            print(f"  Word ID: {word_id}, TF-IDF Weight: {weight}")

# Calculating  TF-IDF vectors for each document
unique_word_ids = list(word_ids.values())
tfidf_vectors = {}

for article_id, word_frequencies in article_word_frequencies.items():
    tfidf_vector = np.zeros(len(unique_word_ids))
    for word_id, frequency in word_frequencies.items():
        index = unique_word_ids.index(word_id)
        tfidf_vector[index] = tfidf_weights[article_id].get(word_id, 0)
    tfidf_vectors[article_id] = tfidf_vector

# output TF-IDF vectors
for article_id, tfidf_vector in tfidf_vectors.items():
    print(f"Article ID: {article_id}, TF-IDF Vector: {tfidf_vector}")

# prompt for string input
query = input("Enter your query: ")

#calculating relevance
query_word_frequencies = {}
total_words_in_query = len(query.split())

for word in query.split():
    word_id = word_ids.get(word)
    if word_id is not None:
        query_word_frequencies[word_id] = query_word_frequencies.get(word_id, 0) + 1

query_tfidf_weights = {}
for word_id, frequency in query_word_frequencies.items():
    tf = frequency / total_words_in_query
    idf = idf_values.get(word_id, 0.0)
    query_tfidf_weights[word_id] = tf * idf


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  Word ID: 2643, TF-IDF Weight: 4.3196544276457885e-05
  Word ID: 2644, TF-IDF Weight: 2.1598272138228943e-05
  Word ID: 2645, TF-IDF Weight: 2.1598272138228943e-05
  Word ID: 2646, TF-IDF Weight: 2.1598272138228943e-05
  Word ID: 2647, TF-IDF Weight: 2.1598272138228943e-05
  Word ID: 2648, TF-IDF Weight: 2.1598272138228943e-05
  Word ID: 2649, TF-IDF Weight: 2.1598272138228943e-05
  Word ID: 753, TF-IDF Weight: 0.00017278617710583154
  Word ID: 2650, TF-IDF Weight: 4.3196544276457885e-05
  Word ID: 2651, TF-IDF Weight: 2.1598272138228943e-05
  Word ID: 2652, TF-IDF Weight: 2.1598272138228943e-05
  Word ID: 1165, TF-IDF Weight: 0.00010799136069114472
  Word ID: 2653, TF-IDF Weight: 2.1598272138228943e-05
  Word ID: 2654, TF-IDF Weight: 2.1598272138228943e-05
  Word ID: 2655, TF-IDF Weight: 6.479481641468682e-05
  Word ID: 819, TF-IDF Weight: 6.479481641468682e-05
  Word ID: 1132, TF-IDF Weight: 0.00012958963282937364
  Wo