# Scan for Articles
<p> This script uses NewsAPI to scan for articles that match a list of keywords.
<p> It then applies the Flair sentiment model to each article description and adds the results to a dataframe.

## Initial Imports

In [2]:
# Initial Imports

import streamlit as st
import os
from newsapi import NewsApiClient
from datetime import date, timedelta
import pandas as pd
from fpdf import FPDF
# NewsAPI key is read from the environment so it is never hard-coded in the
# notebook; os.getenv returns None when the "news_api_key" variable is unset.
news_api_key = os.getenv("news_api_key")
import flair
from flair.data import Sentence
import requests
import helium

2022-02-16 20:19:47.704 INFO    numexpr.utils: NumExpr defaulting to 8 threads.


## Function to scan for articles based on keywords

In [3]:
def scan_for_articles(keyword, language='en', sort_by='relevancy', page_size=100):
    """Query NewsAPI's ``get_everything`` endpoint for one keyword.

    Args:
        keyword: Search term, forwarded to the API as the ``q`` parameter.
        language: Language code forwarded to the API. Defaults to ``'en'``.
        sort_by: Result ordering forwarded to the API. Defaults to
            ``'relevancy'``.
        page_size: Number of results requested. Defaults to ``100``.

    Returns:
        Whatever ``NewsApiClient.get_everything`` returns; the rest of this
        notebook reads the ``'articles'`` entry of that response.
    """
    # A fresh client is built per call using the module-level API key.
    newsapi = NewsApiClient(api_key=news_api_key)
    return newsapi.get_everything(
        q=keyword,
        language=language,
        sort_by=sort_by,
        page_size=page_size,
    )


In [2]:
# from selenium import webdriver
  
# # Create object
# driver = webdriver.Chrome()
  
# # Assign URL
# url = "https://www.geeksforgeeks.org/"
  
# # New Url
# new_url = ["https://www.wombo.art", "https://www.foxnews.com/media/fox-news-crushes-competition-cnn-draws-smallest-weekly-audience-seven-years"]

# # New Url
# another_new_url = "https://www.wombo.art"
  
# # Opening first url
# driver.get(url)
  
# # Open a new window
# driver.execute_script("window.open('');")
  
# # Switch to the new window and open new URL
# driver.switch_to.window(driver.window_handles[1])
# driver.get(new_url[0])

# driver.execute_script("window.open('');")
# driver.switch_to.window(driver.window_handles[2])
# driver.get(new_url[1])

## Pass in keywords into the scan function and return a dataframe

In [4]:

# Column template: keeps a stable schema for the combined frame even when a
# keyword search returns no articles.
article_columns = [
    "source", "author", "title", "description", "url", "urlToImage",
    "publishedAt", "content", "keyword", "article_sentiment", "article_confidence",
]

keywords = ['Helping', 'Forgiveness', 'Positive']

# Collect one frame per keyword and concatenate once at the end, rather than
# re-copying the growing frame on every iteration.
keyword_frames = [pd.DataFrame(columns=article_columns)]

for word in keywords:
    relevant_articles = scan_for_articles(word)
    df = pd.DataFrame(relevant_articles['articles'])
    df["keyword"] = word  # record which search term produced each row
    keyword_frames.append(df)

all_relevant_articles = pd.concat(keyword_frames, ignore_index=True)

# Clean the data in the source column: replace each source mapping with its
# 'name' entry. Values that are not dicts (e.g. already-cleaned strings when
# this statement is re-run) are left untouched instead of raising.
all_relevant_articles["source"] = all_relevant_articles["source"].apply(
    lambda x: x['name'] if isinstance(x, dict) else x
)


## Apply Sentiment Model

In [5]:
# Load the pre-trained English sentiment classifier.
article_sentiment_model = flair.models.TextClassifier.load('en-sentiment')

# Initialize lists (one entry per row of all_relevant_articles, in row order)

article_sentiment = []
article_confidence = []


# Run Sentiment analysis on collected news sentences

for sentence in all_relevant_articles["description"]:
    # Descriptions can be missing (None/NaN) as well as blank; neither can be
    # scored, so record empty placeholders to keep the lists row-aligned.
    if not isinstance(sentence, str) or sentence.strip() == "":
        article_confidence.append("")
        article_sentiment.append("")
    else:
        sample = flair.data.Sentence(sentence)
        article_sentiment_model.predict(sample)
        # Use the first label attached by the classifier.
        top_label = sample.labels[0]
        article_sentiment.append(top_label.value)
        article_confidence.append(top_label.score)

# Add Results to Dataframe
# NOTE(review): the frame is created with 'article_sentiment' and
# 'article_confidence' columns that are never filled; results are written to
# 'sentiment' and 'confidence' instead. Confirm which pair downstream code uses.

all_relevant_articles['sentiment'] = article_sentiment
all_relevant_articles['confidence'] = article_confidence

2022-02-16 20:20:05,057 loading file C:\Users\Airma\.flair\models\sentiment-en-mix-distillbert_4.pt


In [7]:
# Preview the first row to check the columns of the combined frame.
all_relevant_articles.head(1)

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content,keyword,article_sentiment,article_confidence,sentiment,confidence
0,Gizmodo.com,Justin Carter,M. Night Shyamalan Thanks Blade Runner 2049 fo...,Guardians of the Galaxy is typically credited ...,https://gizmodo.com/m-night-shyamalan-thanks-b...,https://i.kinja-img.com/gawker-media/image/upl...,2022-01-29T17:45:00Z,Guardians of the Galaxyis typically credited w...,Helping,,,POSITIVE,0.99732
