In [1]:
# !pip install langchain==0.0.346
# !pip install newspaper3k==0.2.8
# !pip install pandas==2.1.3
# !pip install deltalake==0.14.0
# !pip install tabulate==0.9.0
# !pip install googlenews
# !pip install gnews==0.3.6
# !pip install plotly==5.18.0
# ! pip install nbformat==5.9.2
# !pip install openai==1.3.7
# !pip install pyaml==23.9.7
# !pip show 

In [2]:
import sys
sys.path.append('..')

import os
import json
import pandas as pd

from tabulate import tabulate
from IPython.display import display, Markdown
# TEST

from datetime import datetime, timedelta
import pytz

from deltalake import DeltaTable
from deltalake.writer import write_deltalake

from openai import OpenAI

import yaml

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [3]:
# Read the run settings from the YAML config file one directory up.
with open('../config.yml', 'r') as file:
    config = yaml.safe_load(file)

# Unpack the three settings used below (a missing key raises KeyError).
query, days_lookback, max_results = (
    config[key] for key in ('query', 'days_lookback', 'max_results')
)

# Show the loaded configuration as the cell output.
config

{'query': 'Eli Lilly Ozempic Risks', 'days_lookback': 90, 'max_results': 1000}

In [4]:
# Params: Delta table read by this notebook, and Delta table it writes to
table_path_in = '../data/articles-ingest'
table_path_out = '../data/articles-scored'

# Manual overrides of the config values (disabled):
# query = "Ozempic"
# days_lookback = 30

# Lookback window: from `days_lookback` days before the current UTC time until now
current_datetime = datetime.now(tz=pytz.utc)
current_date = current_datetime.date()
start_date = (current_datetime - timedelta(days=days_lookback)).date()

# YYYY-MM-DD strings, the form compared against the 'published date str' column
start_date_str = f"{start_date:%Y-%m-%d}"
current_date_str = f"{current_date:%Y-%m-%d}"

# Echo the window so the run is traceable from the cell output
print(
    "\n"
    f"    end_date: {current_date}\n"
    f"    start_date: {start_date}\n"
    "    \n"
    "    \n"
    f"    days_lookback: {days_lookback}\n"
    "    "
)



    end_date: 2023-12-13
    start_date: 2023-09-14
    
    
    days_lookback: 90
    


In [5]:
# Count the ingested rows per search query in the input table
ingested_queries = DeltaTable(table_path_in).to_pandas(columns=['query'])
ingested_queries.value_counts()

query                  
Eli Lilly Ozempic Risks    77
Name: count, dtype: int64

In [6]:
# Load this query's articles published after the start of the lookback window
ingest_filters = [
    ('query', '=', query),
    ('published date str', '>', start_date_str),
]
df = DeltaTable(table_path_in).to_pandas(filters=ingest_filters)

# Keep only rows that have article text (rows without it are treated as failed scrapes)
df = df.dropna(subset=['article text'])

df.head(1)

Unnamed: 0,title,description,published date,url,publisher,query,query end date,published datetime,published date str,article text,article html,article summary,article keywords
0,Why weight-loss drug Wegovy shows promise for ...,Why weight-loss drug Wegovy shows promise for ...,2023-11-11,https://news.google.com/rss/articles/CBMibGh0d...,"{'href': 'https://www.nbcnews.com', 'title': '...",Eli Lilly Ozempic Risks,2023-12-13 02:07:22.818228,2023-11-11 08:00:00,2023-11-11,The weight-loss drug Wegovy can reduce the ris...,"<!DOCTYPE html><html lang=""en""><head><link hre...",The weight-loss drug Wegovy can reduce the ris...,"[drugs, disease, drug, risk, weight, inflammat..."


## Process for prompting OpenAI with article info, with traceability

- Each row in the table is an article. Functions can be used to iterate through the table and add additional columns.
- For each prompt, we want to retain the prompt in the table, as well as the message history and the response.


In [7]:
# Module-level API client shared by generate_openai_response_json and generate_openai_response.
# NOTE(review): built with no arguments, so the OPENAI_API_KEY value read in the imports
# cell is not passed explicitly — presumably the SDK reads it from the environment; confirm.
client = OpenAI()

def print_df(df):
    """
    Format `df` with tabulate as an HTML table, using its keys as headers.

    Despite the name, nothing is printed: the formatted table is returned.
    """
    html_table = tabulate(df, headers='keys', tablefmt='html')
    return html_table

def generate_openai_response_json(prompt, model="gpt-3.5-turbo-1106"):
    """
    Send `prompt` as a single user message in JSON mode and return the parsed reply.

    Uses the module-level `client`. The first choice's message content is
    decoded with json.loads, so a reply that is not valid JSON raises.

    Args:
        prompt: text sent as the user message.
        model: chat model name passed through to the API call.

    Returns:
        The decoded JSON value of the first choice's message content.
    """
    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant designed to output JSON."},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(
        model=model,
        response_format={"type": "json_object"},
        messages=chat_messages,
    )
    first_choice = completion.choices[0]
    return json.loads(first_choice.message.content)


def generate_openai_response(prompt, model="gpt-3.5-turbo-1106"):
    """
    Send `prompt` as a single user message and return the reply as a string.

    Uses the module-level `client`.

    Args:
        prompt: text sent as the user message.
        model: chat model name passed through to the API call.

    Returns:
        The message content of the first choice.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )
    # Chat completion choices carry the reply under `.message.content`
    # (as in generate_openai_response_json); they have no `.text` attribute.
    return response.choices[0].message.content



def generate_sentiment_prompt(query, title, article_text, published_date_str):
    """
    Build the sentiment-analysis prompt for one article.

    The prompt has three sections (Task, Inputs, Outputs) and asks for a JSON
    object with 'sentiment_score', 'sentiment_explanation' and 'negative_claims'.

    Args:
        query: search term; used both as the topic and as the listed search term.
        title: article title.
        article_text: article body, placed on its own line after "Article: ".
        published_date_str: publication date as text.

    Returns:
        The prompt string.
    """
    prompt_lines = [
        "",
        "    ## Task",
        f"    - Your task is to read the article below to perform a sentiment analysis IN REGARDING TO THE TOPIC: {query}",
        "    - Provide a sentiment negativity score from 1-10, 10 being very negative.",
        "    - Provide a sentiment explanation, reasoning why the score was given.",
        "    - Provide specific negative claims made by the article, as short bullet points.",
        "",
        "",
        "    ## Inputs",
        f"    - Search term used to find this article: {query}",
        "",
        "    - The article:",
        f"        - Title: {title}",
        f"        - Published Date: {published_date_str}",
        "        - Article: ",
        f"{article_text}",
        "",
        "    ## Outputs",
        "    A JSON object with the following: ",
        "        - 'sentiment_score': (int in string format)",
        "        - 'sentiment_explanation': (string)",
        "        - 'negative_claims': (string with bullet points delimited by new lines)",
        "    ",
    ]
    return "\n".join(prompt_lines)


def generate_sentiment_response(prompt):
    """
    Run `prompt` through generate_openai_response_json and pull out the sentiment fields.

    Returns:
        A 3-tuple (sentiment_score, sentiment_explanation, negative_claims)
        read from the parsed reply; a missing key raises KeyError.
    """
    parsed_reply = generate_openai_response_json(prompt)
    wanted_keys = ('sentiment_score', 'sentiment_explanation', 'negative_claims')
    return tuple(parsed_reply[key] for key in wanted_keys)


def apply_sentiment_response(df, column_names = ['sentiment_score', 'sentiment_explanation', 'negative_claims']):
    """
    Score every row's 'sentiment_prompt' and return a copy of `df` with the results added.

    generate_sentiment_response is called once per row, in row order. The input
    frame is not modified.

    Args:
        df: DataFrame with a 'sentiment_prompt' column.
        column_names: names for the three values returned per row
            (the default list is only read, never mutated).

    Returns:
        A copy of `df` with one extra column per entry in `column_names`.
        An empty `df` yields an empty copy that still has those columns.
    """
    df_copy = df.copy()

    # Build the result frame explicitly rather than via
    # DataFrame.apply(..., result_type='expand'): on an empty frame, apply
    # returns a copy of the input, so renaming its columns to `column_names`
    # fails with a length mismatch.
    responses = [generate_sentiment_response(prompt) for prompt in df_copy['sentiment_prompt']]
    new_columns = pd.DataFrame(responses, index=df_copy.index, columns=list(column_names))

    for col in new_columns.columns:
        df_copy[col] = new_columns[col]

    return df_copy



In [8]:
# Preview the prompt generated for one article (row `i`) before scoring the whole table
i = 0
sample_article = df.iloc[i]
query = sample_article['query']
title = sample_article['title']
article_text = sample_article['article text']
published_date_str = sample_article['published date str']

sentiment_prompt = generate_sentiment_prompt(query, title, article_text, published_date_str)
display(Markdown(sentiment_prompt))


    - Your task is to read the article below to perform a sentiment analysis IN REGARDING TO THE TOPIC: Eli Lilly Ozempic Risks
    - Provide a sentiment negativity score from 1-10, 10 being very negative.
    - Provide a sentiment explanation, reasoning why the score was given.
    - Provide specific negative claims made by the article, as short bullet points.


    ## Inputs
    - Search term used to find this article: Eli Lilly Ozempic Risks

    - The article:
        - Title: Why weight-loss drug Wegovy shows promise for heart disease - NBC News
        - Published Date: 2023-11-11
        - Article: 
The weight-loss drug Wegovy can reduce the risk of heart attacks and strokes by 20% in adults with heart disease and obesity, according to new research.

The trial is the first to show that a weight-loss medication can help protect against serious heart problems in obese adults, potentially opening the door to a new treatment option for heart disease, the leading cause of death in the United States.

The findings could also have important implications for insurance coverage of the drugs. Currently, insurance companies are reluctant to provide coverage for weight-loss drugs, and Medicare, by law, is barred from covering them.

“I think this will force insurance and Medicare to consider reimbursement,” said Dr. Eric Topol, a cardiologist and executive vice president of Scripps Research in California.

The results were presented on Saturday at the 2023 American Heart Association Scientific Sessions in Philadelphia and were simultaneously published in the New England Journal of Medicine.

“This is a pretty big deal,” said Dr. Michael Lincoff, the study’s lead author and a cardiologist at the Cleveland Clinic who presented the findings. “Just being overweight or having obesity is a risk factor for increased risk of heart disease, and yet we’ve had no treatment that addressed that specific risk factor.”

The trial included more than 17,600 adults ages 45 and up with heart disease who got either a weekly 2.4-milligram dose of semaglutide, which is the drug used in Wegovy, or a placebo. About three-quarters of the participants were men.

All of the study participants were either overweight or obese but did not have diabetes. Patients who had a heart attack or stroke within the last 60 days were not allowed to enroll in the trial.

After about three years of follow-up, there were cardiovascular events — such as heart attack, stroke or death linked to heart disease — in 569 participants in the Wegovy group, compared to 701 participants in the placebo group. That corresponds to a 20% lower risk in people who took Wegovy.

“That is significant,” said Dr. Amit Khera, director of the preventive cardiology program at UT Southwestern Medical Center in Texas. Khera co-authored an editorial that was published Saturday alongside the study in the New England Journal of Medicine.

A new treatment option like Wegovy would be a big deal for people with heart disease, Khera said. One of the risk factors is being overweight or obese, which affects about 70% of the U.S. population.

“We finally have some treatments that are not only very effective for weight loss, but also seem to be very beneficial for heart disease,” he said.

The findings are even more significant because older weight-loss medications, like phentermine or orlistat, have not been shown to protect against serious heart problems, said Dr. Susan Spratt, an endocrinologist and the senior medical director for the Population Health Management Office at Duke Health in North Carolina. One weight-loss drug, fen-phen, was taken off the U.S. market in the late 1990s because it was linked to serious heart problems.

“This is amazing evidence that semaglutide not only helps people with diabetes prevent devastating complications but also helps patients with obesity lose weight and avoid major adverse heart consequences,” said Spratt. (Semaglutide is also approved as a diabetes drug under the name Ozempic.)

Dr. Nishant Shah, a preventive cardiologist at Duke University Medical Center in Durham, North Carolina, said that researchers still don’t know exactly how the medication prevents serious heart problems in overweight patients.

It could be the result of the weight loss itself or some other mechanism in the drug, Shah said.

“That question is under investigation right now,” he said.

Lincoff, the study author, said that inflammation could potentially play a role. Inflammation can cause plaques in the arteries to rupture and form blood clots, leading to heart attacks and strokes.

In the trial, Lincoff said, patients taking the drug saw reductions in inflammation that were similar to the reductions seen in patients on statins.

That is “pretty sizable,” he said.

Additional research also presented Saturday at the American Heart Association conference supports that idea.

Researchers at Mount Sinai Health System in New York City looked at how much another weight loss drug, tirzepatide, reduced inflammation in patients with Type 2 diabetes and who have heart disease. Tirzepatide, which is similar to semaglutide, is the ingredient found in Eli Lilly’s diabetes drug Mounjaro and the newly approved weight loss drug Zepbound.

In the study, the researchers measured levels of C-reactive protein, or CRP, a protein the liver produces in response to inflammation.

The research included nearly 2,000 people, half of whom were taking tirzepatide and half of whom were taking insulin. Researchers looked at CRP levels in the participants’ blood taken at three points in the study — the beginning, at 24 weeks and at 52 weeks. By the end of the study, CRP levels were 40% to 50% lower in the tirzepatide group.

“That’s an enormous reduction in inflammation,” said study author Dr. Deepak Bhatt, a cardiologist at Mount Sinai who presented the findings. Bhatt, who received funding for the study from Lilly, said a randomized clinical trial is still needed to confirm the heart benefits. “One would hope that would translate into reductions in cardiovascular risk,” he added.

Topol, of Scripps Research, said newer weight loss drugs have the potential to become new treatments for heart disease.

Although the Wegovy results are promising, Topol said that additional research is still needed on how well the drug lowers cardiovascular risk in patients with obesity who do not have heart disease.

The Wegovy trial “was a very select group of patients because they all had heart disease,” he said. “We need a lot more work to find out if people who aren’t so high-risk would benefit.”

Lincoff said additional analyses will be run over the next year, including looking at subgroups of patients, to understand exactly how the drug prevents heart problems and who could benefit.

“How much was through weight loss? How much was through the inflammation? How much was through the sugar control?” he said.

Obesity is a public health issue, Khera said, and more work is needed to address issues around equity.

“We also have to appreciate that these drugs are really expensive, and a lot of people will not be able to access them,” he said. “So as much as we’re looking at the drug, we still need to look upstream as to why we have obesity and downstream about people who can’t afford them.”

    ## Outputs
    A JSON object with the following: 
        - 'sentiment_score': (int in string format)
        - 'sentiment_explanation': (string)
        - 'negative_claims': (string with bullet points delimited by new lines)
    

In [9]:
# Score the previewed article and render the three returned fields as Markdown
sentiment_score, sentiment_explanation, negative_claims = generate_sentiment_response(sentiment_prompt)

output = (
    " \n"
    f"Sentiment score: {sentiment_score}\n"
    "Sentiment explanation: \n"
    f"    \n{sentiment_explanation}\n"
    "Negative claims: \n"
    f"    \n{negative_claims}\n"
)
display(Markdown(output))

 
Sentiment score: 3
Sentiment explanation: 
    
The sentiment score is relatively low because the article mainly focuses on the potential positive impact of the weight-loss drug Wegovy in reducing the risk of heart attacks and strokes among obese adults with heart disease. While there are mentions of the cost implications and the need for further research, the overall tone of the article is more positive than negative.
Negative claims: 
    
- Medicare is currently barred from covering weight-loss drugs
- Lack of clarity on how the medication prevents serious heart problems in overweight patients
- Expensive nature of newer weight loss drugs and concerns about accessibility


In [10]:
def _build_row_prompt(row):
    """Build the sentiment prompt for one article row."""
    return generate_sentiment_prompt(
        row['query'],
        row['title'],
        row['article text'],
        row['published date str'],
    )

# Store each article's prompt in the table so the request is traceable
df['sentiment_prompt'] = df.apply(_build_row_prompt, axis=1)

In [11]:
# Score every article: apply_sentiment_response calls generate_sentiment_response once
# per row and returns a copy with the sentiment_score / sentiment_explanation /
# negative_claims columns added.
df = apply_sentiment_response(df)


In [12]:
# The prompt asks for sentiment_score as an int in string format, so a plain sort is
# lexicographic ('9' ranks above '10') and a mix of int and str replies cannot be
# compared at all. Sort on the numeric value instead; the stored values are left
# untouched, and scores that cannot be parsed sort last.
df = df.sort_values(
    by=['sentiment_score'],
    key=lambda scores: pd.to_numeric(scores, errors='coerce'),
    ascending=False,
)
df.head(2)

Unnamed: 0,title,description,published date,url,publisher,query,query end date,published datetime,published date str,article text,article html,article summary,article keywords,sentiment_prompt,sentiment_score,sentiment_explanation,negative_claims
76,Intestinal Blockage Added as Potential Ozempic...,Intestinal Blockage Added as Potential Ozempic...,2023-10-04,https://news.google.com/rss/articles/CBMiUmh0d...,"{'href': 'https://www.drugwatch.com', 'title':...",Eli Lilly Ozempic Risks,2023-12-13 02:07:22.818228,2023-10-04 07:00:00,2023-10-04,The U.S. Food and Drug Administration has adde...,<!DOCTYPE html>\n<!--[if lt IE 8]> <html ...,Other side effects of Ozempic already listed o...,"[blockage, effects, drug, fda, intestinal, weg...",\n - Your task is to read the article below...,9,The sentiment score is high due to the numerou...,- Intestinal blockage (ileus) added as a poten...
25,"Ozempic, Wegovy linked to severe medical condi...","Ozempic, Wegovy linked to severe medical condi...",2023-10-05,https://news.google.com/rss/articles/CBMifGh0d...,"{'href': 'https://www.nbcnews.com', 'title': '...",Eli Lilly Ozempic Risks,2023-12-13 02:07:22.818228,2023-10-05 07:00:00,2023-10-05,"People who take popular drugs for weight loss,...","<!DOCTYPE html><html lang=""en""><head><link hre...","People who take popular drugs for weight loss,...","[drugs, drug, risk, weight, finds, medications...",\n - Your task is to read the article below...,9,The sentiment score is given a 9 due to the si...,- Research published in the Journal of the Ame...


In [13]:
# Persist the scored articles, partitioned by query and publication date.
# partition_filters limits the overwrite to this query's partitions inside the
# lookback window.
write_deltalake(
    table_path_out,
    df,
    partition_by=['query', 'published date str'],
    mode='overwrite',
    overwrite_schema=True,
    partition_filters=[
        ('query', '=', query),
        # Strictly after start_date, matching the filter used when `df` was loaded.
        # '>=' would also select the start_date partition, for which `df` holds no
        # rows, so previously scored articles for that day would not be rewritten.
        ('published date str', '>', start_date_str),
        ('published date str', '<=', current_date_str),
    ]
)