In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import string
import nltk
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
import pandas as pd

In [3]:
# Load the pre-processed article table. Later cells read its
# 'Processed_Title', 'Processed_Abstract', 'Title' and 'Abstract' columns.
combined_df = pd.read_csv('combined_df_processed.csv')

In [4]:
# Concatenate the processed title and abstract into one document per article.
# Missing values are replaced with '' first: `str + NaN` evaluates to NaN, and
# a NaN document makes TfidfVectorizer.fit_transform fail in the next cell.
combined_df['combined_text'] = (
    combined_df['Processed_Title'].fillna('')
    + " "
    + combined_df['Processed_Abstract'].fillna('')
)

In [5]:
# Build a TF-IDF vectorizer on the combined text.
# `vectorizer` is kept around after fitting because search_articles later calls
# `vectorizer.transform` to project a query into the same feature space;
# `tfidf_matrix` is the fitted document-term matrix for `combined_df`.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(combined_df['combined_text'])

In [6]:
# Show the sparse matrix summary (shape, dtype, number of stored elements)
# as a quick sanity check on the fitted TF-IDF representation.
tfidf_matrix

<5120x35924 sparse matrix of type '<class 'numpy.float64'>'
	with 399518 stored elements in Compressed Sparse Row format>

In [8]:
# Fetch the NLTK resources used by the helpers below: the stopword corpus read
# by remove_stopwords, and the WordNet data (with the accompanying omw-1.4
# package) used by WordNetLemmatizer in lemmatize_tokens.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Define our text processing functions
def clean_text(text):
    """Normalize raw text: lowercase it, delete punctuation, collapse whitespace.

    Args:
        text: The value to clean. Anything that is not a ``str`` is treated
            as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
        Punctuation characters (``string.punctuation``) are deleted rather
        than replaced by a space, so ``"data-driven"`` becomes
        ``"datadriven"``.
    """
    if isinstance(text, str):
        punctuation_class = f"[{re.escape(string.punctuation)}]"
        lowered = text.lower()
        without_punctuation = re.sub(punctuation_class, "", lowered)
        # Any run of whitespace (tabs, newlines, repeated spaces) becomes one space.
        single_spaced = re.sub(r'\s+', ' ', without_punctuation)
        return single_spaced.strip()
    return ""

def tokenize_text(text):
    """Split ``text`` into a list of tokens using NLTK's ``wordpunct_tokenize``.

    Args:
        text: The string to tokenize.

    Returns:
        The token list produced by ``wordpunct_tokenize``.
    """
    tokens = wordpunct_tokenize(text)
    return tokens

def remove_stopwords(tokens):
    """Drop every token that appears in NLTK's English stopword list.

    Args:
        tokens: Iterable of token strings. The comparison is exact, so
            tokens should already be lowercased.

    Returns:
        A new list holding the non-stopword tokens in their original order.
    """
    english_stopwords = set(stopwords.words('english'))
    kept = []
    for token in tokens:
        if token in english_stopwords:
            continue
        kept.append(token)
    return kept

def lemmatize_tokens(tokens):
    """Lemmatize each token with a ``WordNetLemmatizer``.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A list with one lemmatized token per input token, in the same order.
        Each token is lemmatized with the lemmatizer's default arguments.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    return list(map(wordnet_lemmatizer.lemmatize, tokens))

def preprocess_text(text):
    """Run the full text pipeline: clean, tokenize, remove stopwords, lemmatize.

    Args:
        text: Raw text; non-string input is cleaned to ``""`` by
            ``clean_text``.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    normalized = clean_text(text)
    content_tokens = remove_stopwords(tokenize_text(normalized))
    return ' '.join(lemmatize_tokens(content_tokens))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [9]:
# Take a free-text user query, preprocess and vectorize it, and return the
# top_n most similar articles.
def search_articles(query, vectorizer, tfidf_matrix, df, top_n=5):
    """Return the rows of ``df`` most similar to ``query`` by cosine similarity.

    Args:
        query: Free-text search string. It is passed through
            ``preprocess_text`` before being vectorized.
        vectorizer: Fitted vectorizer whose ``transform`` maps a list of
            strings into the feature space of ``tfidf_matrix``.
        tfidf_matrix: Document vectors to search. Row ``i`` is assumed to
            describe ``df.iloc[i]`` (results are selected positionally).
        df: DataFrame holding the articles.
        top_n: Maximum number of articles to return. ``None`` means no limit;
            zero or a negative number yields an empty result.

    Returns:
        A DataFrame with at most ``top_n`` rows of ``df``, ordered from most
        to least similar. Articles whose similarity to the query is zero are
        never returned, so the result can have fewer than ``top_n`` rows, or
        none at all when the query shares no vocabulary with the corpus.
    """
    # A negative top_n would otherwise slice as "[:-k]" and return almost the
    # whole corpus; treat any non-positive limit as "no results".
    if top_n is not None and top_n <= 0:
        return df.iloc[:0]

    # Preprocess the query using the same pipeline
    query_processed = preprocess_text(query)
    query_vector = vectorizer.transform([query_processed])

    # Cosine similarities between the query and all articles
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Stable descending sort: equal scores keep their original row order, so
    # the ranking is deterministic.
    ranked = (-similarities).argsort(kind='stable')[:top_n]

    # Drop articles with no overlap with the query instead of padding the
    # result with arbitrary zero-score rows.
    top_indices = ranked[similarities[ranked] > 0]
    return df.iloc[top_indices]

# Example usage: run one sample query against the fitted TF-IDF index.
if __name__ == "__main__":
    user_query = "Rising global temperatures affecting ocean"

    # Retrieve the 5 best-matching articles and print their original
    # (unprocessed) 'Title' and 'Abstract' columns.
    results = search_articles(user_query, vectorizer, tfidf_matrix, combined_df, top_n=5)
    print(results[['Title', 'Abstract']])

                                                  Title  \
2491  Coupling Oceanic Observation Systems to Study ...   
2171  Data-driven Global Ocean Modeling for Seasonal...   
2470  Forecasting the effect of heat stress index an...   
2254  A dynamical geography of observed trends in th...   
129   Towards Optimally Weighted Physics-Informed Ne...   

                                               Abstract  
2491  Understanding local currents in the North Atla...  
2171  Accurate ocean dynamics modeling is crucial fo...  
2470  In this paper, we estimate the effect of heat ...  
2254  Revealing the ongoing changes in ocean dynamic...  
129   The carbon pump of the world's ocean plays a v...  
