In [None]:
# Mount Google Drive at /content/drive; later cells read and write files under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install packages imported further down: gradio (search UI), matplotlib / umap-learn (plots and UMAP), cupy (imported as cp).
!pip install gradio
!pip install matplotlib umap-learn
!pip install cupy-cuda11x

In [None]:
# General Imports: Basic Python libraries for system interaction and text processing
import os  # For interacting with the operating system (file handling, environment variables)
import re  # For working with regular expressions (pattern matching in strings)
from collections import Counter  # For counting occurrences of elements (e.g., word frequencies)

# Data Processing & Analysis: Libraries for handling and processing structured data
import pandas as pd  # Data manipulation and analysis with DataFrames
import numpy as np  # Numerical operations, array handling, and matrix math
from google.cloud import bigquery  # For interacting with Google BigQuery (cloud data warehouse)

# Machine Learning / NLP: Libraries for text analysis, machine learning
from sklearn.feature_extraction.text import CountVectorizer  # For converting text into a matrix of token counts (Bag of Words)
from sklearn.metrics.pairwise import cosine_similarity  # For measuring similarity between vectors (cosine similarity)
import torch  # Deep learning framework (PyTorch), for building and training neural networks
from sentence_transformers import SentenceTransformer  # For sentence embeddings using pre-trained transformer models

# Interface / Web: Libraries for building interactive web interfaces for machine learning models
import gradio as gr  # For creating interactive applications and demos for machine learning models

# Progress Bar: Library to display a progress bar for loops or long-running tasks
from tqdm import tqdm  # For creating a progress bar in loops (visualizing the progress of time-consuming tasks)

# Data Visualization & Dimensionality Reduction: Libraries for visualizing and reducing the dimensionality of data
import seaborn as sns  # For creating attractive and informative statistical graphics
from matplotlib.lines import Line2D  # For custom line creation in plots
import matplotlib.pyplot as plt  # For creating static, animated, and interactive plots
from sklearn.manifold import TSNE  # For t-SNE dimensionality reduction technique
import umap  # For Uniform Manifold Approximation and Projection (UMAP)
import cupy as cp  # For GPU-accelerated computation (NumPy-like operations with CUDA support)

# Image Processing: Libraries for handling image data in memory and processing images
from io import BytesIO  # For handling image byte streams
from PIL import Image  # For image manipulation and processing



#Part 1


functions

In [None]:
def clean_string(text):
    '''Normalize text to lowercase ASCII letters separated by single spaces.

    Every character that is neither an ASCII letter nor whitespace is
    removed, each run of whitespace collapses to one space, surrounding
    whitespace is stripped, and the result is lowercased.
    '''
    # Keep letters and whitespace only (digits and punctuation are dropped)
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)

    # Collapse whitespace runs, then trim the ends and lowercase
    single_spaced = re.sub(r'\s+', ' ', letters_only)
    return single_spaced.strip().lower()

def clean_n_org(query_job):
    '''Convert a finished query job into a cleaned, de-duplicated DataFrame.

    Steps, in order:
      1. Fetch the job result as a DataFrame.
      2. Replace missing 'Context' values with '' and normalize every
         context with clean_string.
      3. Reduce 'DateTime' to its calendar date.
      4. Drop rows whose cleaned context is empty.
      5. Drop rows whose cleaned context repeats an earlier row.

    De-duplication is done on the *cleaned* text so that contexts differing
    only in case, punctuation or spacing collapse into a single row;
    de-duplicating the raw text first would let such near-copies through.

    Parameters
    ----------
    query_job : object exposing ``result().to_dataframe()``
        The resulting frame must contain 'Context' and 'DateTime' columns.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with their original columns and index labels.

    Raises
    ------
    Whatever ``pd.to_datetime(..., errors='raise')`` raises when a
    'DateTime' value cannot be parsed.
    '''
    raw = query_job.result().to_dataframe()

    # Missing contexts become '' so clean_string always receives a str
    context = raw['Context'].fillna('').astype(str).apply(clean_string)

    # Keep only the date part of the timestamp
    dates = pd.to_datetime(raw['DateTime'], errors='raise').dt.date

    # assign() returns a new frame, so the fetched result is not mutated
    data = raw.assign(Context=context, DateTime=dates)

    # Rows that cleaned down to nothing carry no text to analyse
    data = data[data['Context'] != '']

    # First occurrence of each cleaned context wins
    return data.drop_duplicates(subset=['Context'])

def word_dist_bytopic(df):
    '''Count word occurrences per topic.

    All 'Context' values that share a 'Topic' are joined (space-separated)
    into one text, which is tokenized and counted with CountVectorizer
    using its built-in English stop-word list.

    Returns a dict that maps each topic to a {word: count} dict.
    '''
    # One document per topic; values are cast to str before joining
    text_by_topic = df.groupby('Topic')['Context'].apply(
        lambda contexts: ' '.join(contexts.astype(str))
    )

    # Stop words such as "is", "the", "and" are ignored by the vectorizer
    counter = CountVectorizer(stop_words='english')

    distributions = {}
    for topic, text in text_by_topic.items():
        # Re-fit on this topic's text alone so the vocabulary is per-topic
        counts = counter.fit_transform([text]).toarray().sum(axis=0)
        vocabulary = counter.get_feature_names_out()
        distributions[topic] = dict(zip(vocabulary, counts))

    return distributions

def top_words_bytopic(word_distributions, num):
    '''Print and tabulate the `num` most frequent words of every topic.

    Parameters
    ----------
    word_distributions : dict
        Maps each topic to a {word: count} mapping.
    num : int
        Maximum number of words to keep per topic.

    Returns
    -------
    pandas.DataFrame
        One row per (topic, word) with columns 'Topic', 'Word' and
        'Frequency'. Topics appear in the iteration order of
        `word_distributions`; words within a topic are ordered by
        descending count. The three columns are present even when there
        is nothing to report.
    '''
    rows = []

    for topic, word_count in word_distributions.items():
        print(f"Topic: {topic}")

        # Single pass: print each top word and record it for the table
        for word, count in Counter(word_count).most_common(num):
            print(f"{word}: {count}")
            rows.append({'Topic': topic, 'Word': word, 'Frequency': count})

        print('\n')

    # Explicit columns keep the schema stable for empty input
    return pd.DataFrame(rows, columns=['Topic', 'Word', 'Frequency'])

Call the API

In [None]:
# Point GOOGLE_APPLICATION_CREDENTIALS at the key file stored on the mounted Drive.
# NOTE(review): the path is hardcoded to one Drive layout; consider making it configurable.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = "/content/drive/MyDrive/py_project/BigQuerykeyfile.json"

# BigQuery client used by the query cells below.
client = bigquery.Client()

Sample query and result

In [None]:

#gdelt_query =
"""
 SELECT Topic, COUNT(*)
 FROM `gdelt-bq.covid19.onlinenews`
 WHERE TIMESTAMP_TRUNC(DateTime, DAY) BETWEEN TIMESTAMP("2020-02-01") AND TIMESTAMP("2020-02-02")
 Group BY Topic
 """

# query_job = client.query(gdelt_query)

# print(query_job)
# for row in query_job.result():
#   print(row[0], row[1])

#output
'''
Falsehoods 912
Cases 11258
Masks 4570
Panic 916
Quarantine 8259
Testing 6293
Covid19 20738
Prices 624
Shortages 196
Ventilators 13
SocialDistancing 2
'''

In [None]:
# x = sum(df['f0_'])
# df = query_job.result().to_dataframe()
# print(df, x)

Query GDELT covid19 table for monthly data

In [None]:
# Months collected so far:
#   2020: feb, mar, apr, may, jun, jul, aug, sep, oct, nov, dec
#   2021: jan

# All columns for one date range (here January 2021), restricted to rows whose
# lowercased Context contains 'vaccine'. Edit the two TIMESTAMP bounds to collect another month.
testq = '''
SELECT *
FROM `gdelt-bq.covid19.onlinenews`
WHERE TIMESTAMP_TRUNC(DateTime, DAY) BETWEEN TIMESTAMP("2021-01-01") AND TIMESTAMP("2021-01-31")
    AND REGEXP_CONTAINS(LOWER(Context), r'vaccine')
'''
# Submit the query; the returned job object is consumed by the cells below.
query_job = client.query(testq)


In [None]:
query_job.result()

Clean and organize the monthly query result, then
store the output as a CSV file

In [None]:
# Clean the query result, drop rows with any missing value, and write the month to CSV.
output = clean_n_org(query_job).dropna()
output.to_csv('jan_data.csv', index=False)  # index=False: do not write the DataFrame index as a column

Load the CSV file back into a DataFrame

In [None]:
# Read the saved month back in, dropping rows with any missing value.
upload = pd.read_csv('jan_data.csv').dropna()


Find word distribution by topic and list top most common words

In [None]:
word_freq = word_dist_bytopic(upload)

In [None]:
top_count = top_words_bytopic(word_freq, 10)

#Part 2


Collect monthly data from 2020-02 : 2021-01 and combine to one df

In [None]:
'''
import glob

# Step 1: Use glob to find all CSV files in the directory
csv_files = glob.glob('/content/*.csv')

# Step 2: Read each CSV file into a list of DataFrames
dfs = [pd.read_csv(file) for file in csv_files]

# Step 3: Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

# Step 4: Drop Date column
combined_df = combined_df.drop(columns=['DateTime'])

# Optionally, you can reset the index after concatenation:
combined_df.reset_index(drop=True, inplace=True)
'''

Collect full dataset and output CSV

In [None]:
'''
combo_output = combined_df.dropna()
combo_output.to_csv('one_year_data.csv', index=False)
'''

Load the full dataset CSV file

In [None]:
# Load the combined one-year dataset from Drive, dropping rows with any missing value.
combo_upload = pd.read_csv('/content/drive/MyDrive/py_project/one_year_data.csv').dropna()

Find word distribution by topic and list top most common words

In [None]:
full_word_freq = word_dist_bytopic(combo_upload)

In [None]:
full_top_count = top_words_bytopic(full_word_freq, 10)

The context and topic columns are populated based on the textual analysis of the article's content. GDELT uses natural language processing (NLP) algorithms to examine the text of the article, identifying key themes, entities, and relationships between them. The context refers to the broader or underlying themes of the article, and these are typically categorized into specific topics related to COVID-19.

Filter the yearly data by keywords

In [None]:
# Candidate keywords and phrases about vaccine effectiveness.
# NOTE(review): this list is not referenced anywhere in this cell — the filter below is
# built from the shorter list `l`. Confirm which list is intended.
vaccine_effectiveness_keywords = [
    "efficacy",
    "effectiveness",
    "vaccine effectiveness",
    "vaccine efficacy",
    "protection",
    "protective",
    "protection level",
    "protection rate",
    "protection efficacy",
    "vaccine response",
    "booster effect",
    "prevention",
    "immunity",
    "long-term immunity",
    "population immunity",
    "herd immunity",
    "viral load reduction",
    "clinical trials",
    "trial results",
    "antibody response",
    "immune response",
    "cross-protection",
    "breakthrough cases",
    "vaccine failure rate",
    'pfizer',
    'mRNA',
    'moderna',
    'novavax',
    'johnson'
]

# Keywords actually used to build the filter pattern.
l = ["efficacy", 'effectiveness', 'immunity']

# Build a regex that matches any keyword in `l` as a whole word: each keyword is
# escaped, the alternatives are joined with '|', and the group is wrapped in \b word boundaries.
pattern = r'\b(?:' + '|'.join(map(re.escape, l)) + r')\b'

# Keep rows whose Context contains at least one keyword (na=False treats missing values as no match).
filtered_df = combo_upload[combo_upload['Context'].str.contains(pattern, regex=True, na=False)]

# Saving the filtered DataFrame to CSV is disabled:
#filtered_df.to_csv('filtered_data.csv', index=False)

Load filtered csv

In [None]:
# Load the previously saved filtered rows from Drive (rows with missing values dropped).
# This rebinds filtered_df, replacing the frame computed by the keyword filter cell.
filtered_df  = pd.read_csv('/content/drive/MyDrive/py_project/filtered_data.csv').dropna()


Count the words in each row of the Context column and compute the mean

In [None]:
# Count the number of words in each row of the 'Context' column
filtered_df['word_count'] = filtered_df['Context'].apply(lambda x: len(str(x).split()))

# Calculate the mean number of words across all rows in the 'Context' column
mean_word_count = filtered_df['word_count'].mean()

# Display the word counts and mean
print(filtered_df[['Context', 'word_count']])
print(f"Mean number of words: {mean_word_count}")


Preparing to create embedding for Context column

Load embedding model

In [None]:
# Load a pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')

Given the size of the data, embeddings are computed in batches,
and processing is done on a Google T4 GPU.

In [None]:

print(torch.cuda.is_available())  # Should return True if a GPU is available

In [None]:
#DO ONCE
# Encode the Context column in chunks of `batch_size` rows and collect one embedding per row.

batch_size = 64  # Adjust the batch size based on GPU memory
embeddings = []

for i in tqdm(range(0, len(filtered_df), batch_size)):
    # Positional slice of up to batch_size rows (the final chunk may be shorter)
    batch = filtered_df['Context'].iloc[i:i + batch_size]
    # NOTE(review): device='cuda' is hardcoded — presumably this fails on a runtime without a GPU
    embeddings_batch = model.encode(batch.tolist(), batch_size=batch_size, show_progress_bar=True, device='cuda')
    # extend() appends each row's embedding individually, preserving row order
    embeddings.extend(embeddings_batch)

# Now, `embeddings` holds the result for all the sentences

Add embedding to filtered_data

In [None]:
# Attach the embeddings as a new column; relies on `embeddings` holding one entry per row of filtered_df, in row order.
filtered_df['embeddings'] = embeddings
# Writing the file to Drive is disabled.
# NOTE(review): the path below has no '.csv' suffix, while the load cell reads
# 'embedded_filtered_data.csv' — reconcile the two before re-enabling.
#filtered_df.to_csv('/content/drive/MyDrive/py_project/embedded_filtered_data', index=False)

Load embedded_data

In [None]:
# Load the rows together with their saved embeddings from Drive (rows with missing values dropped).
embedded_data  = pd.read_csv('/content/drive/MyDrive/py_project/embedded_filtered_data.csv').dropna()
# The CSV round-trip leaves each embedding as a string; strip the surrounding brackets and
# parse the space-separated numbers back into a NumPy array.
embedded_data['embeddings'] = embedded_data['embeddings'].apply(lambda x: np.fromstring(x.strip('[]'), sep=' '))

In [None]:
embedded_data

Function that searches the contexts via an embedded query

In [None]:
def semantic_search(query, top_k=100):
    '''Return the rows of `embedded_data` most similar to a free-text query.

    The query is embedded with the 'all-MiniLM-L6-v2' sentence-transformer
    and compared, by cosine similarity, against every vector in the
    module-level ``embedded_data['embeddings']`` column.

    Parameters
    ----------
    query : str
        Text to search for.
    top_k : int, default 100
        Maximum number of results. Values <= 0 yield no results.

    Returns
    -------
    matches : list of dict
        One dict per hit, most similar first, with keys 'Topic', 'URL',
        'Context' and 'Similarity Score'.
    plt_index : list of int
        Row positions (for ``.iloc``) of the hits, in the same order.
    '''
    # Guard: with top_k == 0 the slice [-0:] below would select every row
    if top_k <= 0:
        return [], []

    # Load the model on first use and cache it on the function object;
    # reloading it on every search is slow and changes nothing.
    encoder = getattr(semantic_search, '_encoder', None)
    if encoder is None:
        encoder = SentenceTransformer('all-MiniLM-L6-v2')
        semantic_search._encoder = encoder

    # Generate embedding for the query
    query_embedding = encoder.encode(query)

    # Similarity of the query against every stored embedding (row 0 of a 1 x N matrix)
    similarities = cosine_similarity([query_embedding], embedded_data['embeddings'].tolist())[0]

    # argsort is ascending: take the last top_k positions and reverse them
    top_k_indices = similarities.argsort()[-top_k:][::-1]

    # Look the columns up once instead of once per hit
    topics = embedded_data['Topic']
    urls = embedded_data['URL']
    contexts = embedded_data['Context']

    plt_index = []
    matches = []
    for i in top_k_indices:
        plt_index.append(int(i))
        matches.append({
            "Topic": topics.iloc[i],
            "URL": urls.iloc[i],
            "Context": contexts.iloc[i],
            "Similarity Score": similarities[i]
        })

    return matches, plt_index

In [None]:

# Row positions 0 .. len(embedded_data) - 1, i.e. every row (passed to the plot function to draw all points)
x = list(range(len(embedded_data)))

In [None]:
semantic_search("academics talking about vaccine")

#visualizing context by topics

In [None]:
#run once
# Reduce the embeddings to 2 components with UMAP using the cosine metric;
# random_state=42 pins the seed passed to UMAP.
umap_model = umap.UMAP(n_neighbors=15, n_components=2, metric='cosine', random_state=42)  # UMAP for 2D visualization
reduced_embeddings = umap_model.fit_transform(embedded_data['embeddings'].tolist())



Create dataFrame from reduced embeddings

In [None]:
# 2-D projection table: one row per row of embedded_data, in the same order.
reduced_df = pd.DataFrame(reduced_embeddings, columns=['UMAP_Component_1', 'UMAP_Component_2'])

# Copy the metadata by position (.to_numpy()), not by index label. embedded_data keeps
# the labels that survived dropna(), while reduced_df has a fresh 0..n-1 index, so
# assigning the Series directly would align on labels: rows would be shifted and filled
# with NaN wherever a row had been dropped.
reduced_df['Topic'] = embedded_data['Topic'].to_numpy()
reduced_df['URL'] = embedded_data['URL'].to_numpy()
reduced_df['Context'] = embedded_data['Context'].to_numpy()

# Persist the projection so the UMAP fit does not have to be repeated.
reduced_df.to_csv('/content/drive/MyDrive/py_project/reduced_embedding_data.csv', index=False)

Read reduced_embedding_data

In [None]:
# Load the saved 2-D projection (UMAP components plus Topic, URL and Context) from Drive.
reduced_df = pd.read_csv('/content/drive/MyDrive/py_project/reduced_embedding_data.csv')


Plot the cluster of topics

In [None]:
def plot_topics_for_indexes(indexes):
    '''Scatter-plot selected rows of `reduced_df`, colored by topic.

    Parameters
    ----------
    indexes : sequence of int
        Row positions (as used by ``.iloc``) into the module-level
        ``reduced_df``.

    Returns
    -------
    PIL.Image.Image
        The figure rendered as PNG and opened from an in-memory buffer.
    '''
    # Only the requested rows are drawn
    subset = reduced_df.iloc[indexes]

    unique_topics = subset['Topic'].unique()

    # "Set2" has 8 colors and repeats beyond that, which would give two topics
    # the same color; switch to the evenly spaced "husl" palette when needed.
    palette_name = "Set2" if len(unique_topics) <= 8 else "husl"
    palette = sns.color_palette(palette_name, len(unique_topics))

    # Explicit figure/axes: nothing depends on pyplot's "current figure"
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        # One scatter call per topic so each gets its own color and legend entry
        for topic, color in zip(unique_topics, palette):
            topic_data = subset[subset['Topic'] == topic]
            ax.scatter(
                topic_data['UMAP_Component_1'],
                topic_data['UMAP_Component_2'],
                color=color,
                label=topic,
                alpha=0.6
            )

        # Skip the legend when nothing was drawn (avoids an empty-legend warning)
        if len(unique_topics):
            ax.legend(title="Topic")

        ax.set_title("Visualization of Embeddings")
        ax.set_xlabel("UMAP Component 1")
        ax.set_ylabel("UMAP Component 2")

        # Render into memory instead of a file
        buf = BytesIO()
        fig.savefig(buf, format='png')
    finally:
        # Always release the figure, even if plotting or saving failed
        plt.close(fig)

    buf.seek(0)  # Rewind so PIL reads from the start of the PNG data

    return Image.open(buf)

#save_path = '/content/drive/MyDrive/py_project/umap_plot.png'
#plt.savefig(save_path)

In [None]:
plot_topics_for_indexes(x)

#Search interface


interface function to interact with search function

In [None]:
def gradio_interface(query):
    '''Run a semantic search and format the outcome for the Gradio UI.

    Parameters
    ----------
    query : str
        Search text entered by the user.

    Returns
    -------
    plot_image
        Whatever plot_topics_for_indexes returns for the matched rows.
    result_str : str
        HTML with one entry per match (topic, link, context, similarity
        score), entries separated by horizontal rules.
    '''
    # Local import: standard-library helper needed only by this function
    from html import escape

    matches, plt_index = semantic_search(query)

    entries = []
    for rec in matches:
        # Topic, URL and Context come from scraped articles, so they are escaped
        # before being placed in markup. escape() also encodes quotes, which keeps
        # the URL from breaking out of the href attribute.
        topic = escape(str(rec['Topic']))
        url = escape(str(rec['URL']))
        context = escape(str(rec['Context']))

        entries.append(
            f"<strong>Topic:</strong> {topic}<br>"
            f"<strong>URL:</strong> <a href='{url}' target='_blank' rel='noopener noreferrer'>{url}</a><br>"
            f"<strong>Context:</strong> {context}<br>"
            f"<strong>Similarity Score:</strong> {rec['Similarity Score']:.4f}<br>"
            "<hr>"  # Horizontal rule for separation between entries
        )

    # Join once instead of growing a string inside the loop
    result_str = "".join(entries)

    plot_image = plot_topics_for_indexes(plt_index)

    return plot_image, result_str


In [None]:
gradio_interface('academics talking about vaccine')

Define and launch the interface for use

In [None]:
# Define the Gradio interface: one text input, two outputs (the order matches the
# (plot_image, result_str) tuple returned by gradio_interface).
iface = gr.Interface(
    fn=gradio_interface,                     # Function to run on input
    inputs=gr.Textbox(label="Enter search text"),  # Input: Textbox for query text
    outputs=[gr.Image(label="Plot Image"), gr.HTML(label="Related Articles")] # Outputs: plot image, then HTML list of matching articles
)

# Launch the Gradio interface
iface.launch()