In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import string
import nltk
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer

In [18]:
import pandas as pd
import numpy as np
import faiss
import spacy

In [19]:
# Load the article table from CSV. Later cells read its Processed_Title,
# Processed_Abstract, Title and Abstract columns.
combined_df = pd.read_csv('combined_df_processed (1).csv')

In [20]:
# Build one text field per article from its processed title and abstract.
# Missing values are replaced by empty strings first: with plain `+`, a NaN in
# either column would make the whole combined_text cell NaN, which the
# vectorizer and the embedding models cannot consume.
combined_df['combined_text'] = (
    combined_df['Processed_Title'].fillna("")
    + " "
    + combined_df['Processed_Abstract'].fillna("")
)

In [21]:
# Fetch the NLTK resources used by the stop-word filter and the lemmatizer.
for corpus_name in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(corpus_name)

# Define our text processing functions
def clean_text(text):
    """Lowercase ``text``, delete ASCII punctuation and collapse whitespace.

    Args:
        text: Value to normalise. Anything that is not a ``str`` yields ``""``.

    Returns:
        The cleaned string with single spaces between words and no leading
        or trailing whitespace.
    """
    if not isinstance(text, str):
        return ""
    # A deletion table removes exactly the characters in string.punctuation.
    punctuation_table = str.maketrans("", "", string.punctuation)
    without_punctuation = text.lower().translate(punctuation_table)
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    return single_spaced.strip()

def tokenize_text(text):
    """Split ``text`` into a list of tokens using NLTK's ``wordpunct_tokenize``."""
    tokens = wordpunct_tokenize(text)
    return tokens

# English stop words, loaded on the first call to remove_stopwords and reused
# afterwards so the word list is not re-read and re-hashed for every query.
_ENGLISH_STOP_WORDS = None


def remove_stopwords(tokens):
    """Drop English stop words from a token sequence.

    Args:
        tokens: Iterable of string tokens. Matching is exact, so tokens are
            expected to be lowercased already.

    Returns:
        A new list with every token that is not in NLTK's English stop-word
        list, in the original order.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return [word for word in tokens if word not in _ENGLISH_STOP_WORDS]

def lemmatize_tokens(tokens):
    """Return the WordNet lemma of every token, preserving order.

    Each token is lemmatized with ``WordNetLemmatizer.lemmatize`` using its
    default part-of-speech setting.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatize(token))
    return lemmas

def preprocess_text(text):
    """Run the full text pipeline and return a space-joined string.

    Steps, in order: ``clean_text`` -> ``tokenize_text`` ->
    ``remove_stopwords`` -> ``lemmatize_tokens``.
    """
    raw_tokens = tokenize_text(clean_text(text))
    content_lemmas = lemmatize_tokens(remove_stopwords(raw_tokens))
    return ' '.join(content_lemmas)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Satvi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Satvi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Satvi\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [22]:
# Fit TF-IDF on the combined text and keep a dense float32 copy, which the
# FAISS-based search needs.
vectorizer = TfidfVectorizer()
# Cast to float32 while the matrix is still sparse: the values are the same as
# casting after densifying, but no dense float64 intermediate is allocated.
tfidf_matrix = (
    vectorizer.fit_transform(combined_df['combined_text'])
    .astype(np.float32)
    .toarray()
)

In [23]:
# Sentence-embedding model; used here for the corpus and later for queries.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Encode the corpus only when the frame does not already have the column;
# each row stores its vector as a plain Python list.
if 'bert_embeddings' not in combined_df.columns:
    print("Generating BERT embeddings...")
    combined_df['bert_embeddings'] = model.encode(combined_df['combined_text'].tolist(), show_progress_bar=True).tolist()
# NOTE(review): if the column ever arrives from the CSV instead, its cells may
# be strings rather than lists and np.stack would not give a numeric matrix — confirm.
embeddings = np.stack(combined_df["bert_embeddings"].values).astype(np.float32)
# Rows are scaled to unit length in place. On unit vectors squared L2 distance
# equals 2 - 2*cosine, so the L2 index below ranks neighbours like cosine does.
faiss.normalize_L2(embeddings)
# Embedding dimensionality, taken from the stacked matrix.
d = embeddings.shape[1]
# Exact (brute-force) L2 index holding every corpus embedding.
index = faiss.IndexFlatL2(d)
index.add(embeddings)

Generating BERT embeddings...


Batches: 100%|██████████| 160/160 [01:52<00:00,  1.42it/s]


In [25]:
# spaCy pipeline whose `.vector` output is used for the corpus and for queries.
spacyModel = spacy.load("en_core_web_md")
# Compute one vector per row only when the frame does not already have the
# column; each row stores its vector as a plain Python list.
if 'spacy_embeddings' not in combined_df.columns:
    print("Generating spaCy embeddings...")
    combined_df['spacy_embeddings'] = combined_df['combined_text'].apply(lambda text: spacyModel(text).vector.tolist())
spacy_embeddings = np.stack(combined_df["spacy_embeddings"].values).astype(np.float32)
# Rows are scaled to unit length in place. On unit vectors squared L2 distance
# equals 2 - 2*cosine, so the L2 index below ranks neighbours like cosine does.
faiss.normalize_L2(spacy_embeddings)
# Vector dimensionality, taken from the stacked matrix.
d_spacy = spacy_embeddings.shape[1]
# Exact (brute-force) L2 index holding every corpus vector.
index_spacy = faiss.IndexFlatL2(d_spacy)
index_spacy.add(spacy_embeddings)

Generating spaCy embeddings...


In [None]:
def search_articles(query, top_n=5):
    """TF-IDF search ranked by cosine similarity.

    The query string is preprocessed with ``preprocess_text``, vectorized with
    the fitted ``vectorizer`` and scored against every row of ``tfidf_matrix``.

    Args:
        query: Free-text user query.
        top_n: Number of articles to return.

    Returns:
        The ``top_n`` highest-scoring rows of ``combined_df``, best first.
    """
    query_vector = vectorizer.transform([preprocess_text(query)])
    scores = cosine_similarity(query_vector, tfidf_matrix).ravel()
    # argsort is ascending, so reverse it to put the best match first.
    best_first = scores.argsort()[::-1]
    return combined_df.iloc[best_first[:top_n]]

# FAISS index over tfidf_matrix, kept together with the matrix object it was
# built from so that a re-fitted matrix triggers a rebuild.
_tfidf_index_cache = {"matrix": None, "index": None}


def search_tfidf(query, top_n=5):
    """TF-IDF search served by an exact FAISS L2 index.

    The index over ``tfidf_matrix`` is built on first use and reused for later
    queries; it is rebuilt only when ``tfidf_matrix`` is a different object.
    Both corpus rows and the query are L2-normalised, so the L2 ranking
    matches a cosine-similarity ranking.

    Args:
        query: Free-text user query; preprocessed with ``preprocess_text``.
        top_n: Maximum number of articles to return.

    Returns:
        Up to ``top_n`` rows of ``combined_df``, nearest first.
    """
    if _tfidf_index_cache["matrix"] is not tfidf_matrix:
        # normalize_L2 rescales the rows of tfidf_matrix in place; doing it
        # here means it happens once per matrix rather than on every query.
        faiss.normalize_L2(tfidf_matrix)  # Normalize for cosine similarity
        built_index = faiss.IndexFlatL2(tfidf_matrix.shape[1])
        built_index.add(tfidf_matrix)
        _tfidf_index_cache["matrix"] = tfidf_matrix
        _tfidf_index_cache["index"] = built_index
    tfidf_index = _tfidf_index_cache["index"]

    query_processed = preprocess_text(query)
    query_vector = vectorizer.transform([query_processed]).toarray().astype(np.float32)
    faiss.normalize_L2(query_vector)
    _, top_indices = tfidf_index.search(query_vector, top_n)

    # FAISS pads the label row with -1 when the index holds fewer than top_n
    # vectors; drop the padding so iloc does not resolve -1 to the last row.
    hits = top_indices[0]
    return combined_df.iloc[hits[hits >= 0]]

def search_bert_cosine(query, top_n=5):
    """Sentence-embedding search ranked by cosine similarity.

    Encodes ``query`` with ``model``, scores it against every row of
    ``embeddings`` and returns the ``top_n`` best rows of ``combined_df``,
    best first.
    """
    query_row = model.encode(query).reshape(1, -1)
    scores = cosine_similarity(query_row, embeddings).ravel()
    # argsort is ascending, so reverse it to put the best match first.
    best_first = scores.argsort()[::-1]
    return combined_df.iloc[best_first[:top_n]]

def search_bert_faiss(query, top_n=5):
    """Sentence-embedding search served by the FAISS index ``index``.

    The query is encoded with ``model`` and L2-normalised in place before the
    lookup, so it is compared with the normalised corpus vectors on the same
    scale.

    Args:
        query: Free-text user query.
        top_n: Maximum number of articles to return.

    Returns:
        Up to ``top_n`` rows of ``combined_df``, nearest first.
    """
    query_embedding = model.encode([query])
    faiss.normalize_L2(query_embedding)
    _, top_indices = index.search(query_embedding, top_n)

    # FAISS pads the label row with -1 when the index holds fewer than top_n
    # vectors; drop the padding so iloc does not resolve -1 to the last row.
    hits = top_indices[0]
    return combined_df.iloc[hits[hits >= 0]]

def search_spacy_cosine(query, top_n=5):
    """spaCy-vector search ranked by cosine similarity.

    Takes the ``.vector`` of ``spacyModel(query)``, scores it against every
    row of ``spacy_embeddings`` and returns the ``top_n`` best rows of
    ``combined_df``, best first.
    """
    query_vector = spacyModel(query).vector
    scores = cosine_similarity(query_vector.reshape(1, -1), spacy_embeddings).ravel()
    # argsort is ascending, so reverse it to put the best match first.
    best_first = scores.argsort()[::-1]
    return combined_df.iloc[best_first[:top_n]]

def search_spacy_faiss(query, top_n=5):
    """spaCy-vector search served by the FAISS index ``index_spacy``.

    The query vector is cast to float32 and L2-normalised in place before the
    lookup, so it is compared with the normalised corpus vectors on the same
    scale.

    Args:
        query: Free-text user query.
        top_n: Maximum number of articles to return.

    Returns:
        Up to ``top_n`` rows of ``combined_df``, nearest first.
    """
    query_embedding = spacyModel(query).vector.reshape(1, -1).astype(np.float32)
    faiss.normalize_L2(query_embedding)
    _, top_indices = index_spacy.search(query_embedding, top_n)

    # FAISS pads the label row with -1 when the index holds fewer than top_n
    # vectors; drop the padding so iloc does not resolve -1 to the last row.
    hits = top_indices[0]
    return combined_df.iloc[hits[hits >= 0]]

In [27]:
# Example query passed to each of the search functions in the cells that follow.
user_query = "Rising global temperatures affecting ocean"

In [28]:
# Top 5 matching articles (search_articles' default top_n), ranked by TF-IDF
# cosine similarity; only the Title and Abstract columns are printed.
cosineResults = search_articles(user_query)
print("\nTF-IDF Results with Cosine Similarity:")
print(cosineResults[['Title', 'Abstract']])


TF-IDF Results with Cosine Similarity:
                                                  Title  \
2491  Coupling Oceanic Observation Systems to Study ...   
2171  Data-driven Global Ocean Modeling for Seasonal...   
2470  Forecasting the effect of heat stress index an...   
2254  A dynamical geography of observed trends in th...   
129   Towards Optimally Weighted Physics-Informed Ne...   

                                               Abstract  
2491  Understanding local currents in the North Atla...  
2171  Accurate ocean dynamics modeling is crucial fo...  
2470  In this paper, we estimate the effect of heat ...  
2254  Revealing the ongoing changes in ocean dynamic...  
129   The carbon pump of the world's ocean plays a v...  


In [29]:
# Same query through the FAISS-backed TF-IDF search (default top_n of 5);
# only the Title and Abstract columns are printed.
tfidf_results = search_tfidf(user_query)
print("\nTF-IDF Results with FAISS:")
print(tfidf_results[['Title', 'Abstract']])


TF-IDF Results with FAISS:
                                                  Title  \
2491  Coupling Oceanic Observation Systems to Study ...   
2171  Data-driven Global Ocean Modeling for Seasonal...   
2470  Forecasting the effect of heat stress index an...   
2254  A dynamical geography of observed trends in th...   
129   Towards Optimally Weighted Physics-Informed Ne...   

                                               Abstract  
2491  Understanding local currents in the North Atla...  
2171  Accurate ocean dynamics modeling is crucial fo...  
2470  In this paper, we estimate the effect of heat ...  
2254  Revealing the ongoing changes in ocean dynamic...  
129   The carbon pump of the world's ocean plays a v...  


In [30]:
# Same query through the sentence-embedding search ranked by cosine similarity
# (default top_n of 5); only the Title and Abstract columns are printed.
results = search_bert_cosine(user_query)
print("\nBERT Results with Cosine Similarity:")
print(results[['Title', 'Abstract']])


BERT Results with Cosine Similarity:
                                                  Title  \
2254  A dynamical geography of observed trends in th...   
2544  Decadal attribution of historic temperature an...   
192   Sea-level and summer season orbital insolation...   
2491  Coupling Oceanic Observation Systems to Study ...   
2382  Unraveling how winds and surface heat fluxes c...   

                                               Abstract  
2254  Revealing the ongoing changes in ocean dynamic...  
2544  We present an alternative method of calculatin...  
192   The sea-ice cover of the Arctic Ocean is an im...  
2491  Understanding local currents in the North Atla...  
2382  The North Atlantic Ocean circulation, fueled b...  


In [31]:
# Same query through the FAISS-backed sentence-embedding search (default
# top_n of 5); only the Title and Abstract columns are printed.
bert_results = search_bert_faiss(user_query)
print("\nBERT Results with FAISS:")
print(bert_results[['Title', 'Abstract']])


BERT Results with FAISS:
                                                  Title  \
2254  A dynamical geography of observed trends in th...   
2544  Decadal attribution of historic temperature an...   
192   Sea-level and summer season orbital insolation...   
2491  Coupling Oceanic Observation Systems to Study ...   
2382  Unraveling how winds and surface heat fluxes c...   

                                               Abstract  
2254  Revealing the ongoing changes in ocean dynamic...  
2544  We present an alternative method of calculatin...  
192   The sea-ice cover of the Arctic Ocean is an im...  
2491  Understanding local currents in the North Atla...  
2382  The North Atlantic Ocean circulation, fueled b...  


In [32]:
# Same query through the spaCy-vector search ranked by cosine similarity
# (default top_n of 5); only the Title and Abstract columns are printed.
print("\nspaCy Results with Cosine Similarity:")
print(search_spacy_cosine(user_query)[['Title', 'Abstract']])


spaCy Results with Cosine Similarity:
                                                  Title  \
2116  Droughts in Germany -- Why global climate chan...   
3498  Effects of Ozone Levels on Climate Through Ear...   
2177  Carbon cycle instability for high-$\mathrm{CO_...   
46    Model estimates for contribution of natural an...   
2119  Impacts of Climate Change-Induced Salinity Int...   

                                               Abstract  
2116  The warmer temperatures of global climate chan...  
3498  Molecular oxygen in our atmosphere has increas...  
2177  Implicit in the definition of the classical ci...  
46    The contribution of anthropogenic and natural ...  
2119  Changing temperature, precipitation regimes, a...  


In [33]:
# Same query through the FAISS-backed spaCy-vector search (default top_n of
# 5); only the Title and Abstract columns are printed.
print("\nspaCy Results with FAISS:")
print(search_spacy_faiss(user_query)[['Title', 'Abstract']])


spaCy Results with FAISS:
                                                  Title  \
2116  Droughts in Germany -- Why global climate chan...   
3498  Effects of Ozone Levels on Climate Through Ear...   
2177  Carbon cycle instability for high-$\mathrm{CO_...   
46    Model estimates for contribution of natural an...   
2119  Impacts of Climate Change-Induced Salinity Int...   

                                               Abstract  
2116  The warmer temperatures of global climate chan...  
3498  Molecular oxygen in our atmosphere has increas...  
2177  Implicit in the definition of the classical ci...  
46    The contribution of anthropogenic and natural ...  
2119  Changing temperature, precipitation regimes, a...  
