In [4]:
import feedparser
import time
import sys
import pandas as pd
import re
import urllib
import urllib.request as ur
import argparse
import bs4

class NewsScraper:
    """Collect Google News RSS search results into a pandas DataFrame.

    Each query (combined with each location, when locations are given) is
    turned into a Google News RSS search URL.  The entries of every feed are
    accumulated in ``self.d`` as tuples and turned into a DataFrame by
    :meth:`scrape`.

    Args:
        queries: Iterable of search terms.  Each term is passed through
            ``''.join(map(str, term))``, so a plain string is used as-is.
        language: Value sent as the ``hl`` URL parameter.
        locations: Optional iterable of locations.  Each one is sent as the
            ``ceid`` parameter in the form ``UPPER:lower``.
            NOTE(review): Google News appears to expect values such as
            ``US:en`` here -- confirm that free-text locations are honoured.
    """

    # Compiled once at class creation; matches anything that looks like a tag.
    _TAG_RE = re.compile(r'<.*?>')
    _COLUMNS = ('Title', 'Link', 'pubDate', 'Description', 'Source', 'Query', 'Alexa Rank')

    def __init__(self, queries, language="en", locations=None):
        self.base_url = 'https://news.google.com/rss/search?q='
        self.queries = queries
        self.language = language
        # A ``[]`` default would be shared by every instance, so build a
        # fresh list when the caller does not supply one.
        self.locations = locations if locations is not None else []
        self.d = []

    def getMetrics(self, url):
        """Return the Alexa rank reported for *url*, or None when unavailable.

        The original author notes that the service only answers requests
        coming from the USA (a proxy may be needed).
        NOTE(review): the sample run in this notebook shows no rank for any
        row -- check whether the endpoint is still in service.
        """
        # Percent-encode the target so its own '?', '&' and '/' characters
        # cannot be mistaken for parts of the lookup URL.
        lookup_url = ("http://data.alexa.com/data?cli=10&dat=s&url="
                      + urllib.parse.quote(url, safe=''))
        try:
            # The timeout keeps one unresponsive lookup from stalling the run.
            with ur.urlopen(lookup_url, timeout=10) as response:
                soup = bs4.BeautifulSoup(response, "xml")
            return soup.find("REACH")["RANK"]
        except Exception:
            # Best effort: network errors and a missing REACH/RANK node both
            # mean "no rank".  KeyboardInterrupt is deliberately not caught.
            return None

    def cleanhtml(self, raw_html):
        """Return *raw_html* with every ``<...>`` tag removed.

        HTML entities such as ``&nbsp;`` are left untouched.
        """
        return self._TAG_RE.sub('', raw_html)

    def readFeed(self, url, query):
        """Parse the feed at *url* and append one tuple per entry to ``self.d``.

        Each tuple is ``(title, link, pubDate, description, source, query,
        alexa_rank)``.  A missing publication date or source is stored as
        None, and a missing summary as an empty description, instead of
        aborting the scrape.
        """
        feed = feedparser.parse(url)
        for post in feed.entries:
            # Publication date formatted as yyyy/mm/dd.
            published = getattr(post, "published_parsed", None)
            if published:
                pub_date = "%d/%02d/%02d" % (
                    published.tm_year, published.tm_mon, published.tm_mday)
            else:
                pub_date = None

            description = self.cleanhtml(getattr(post, "summary", ""))
            source = getattr(getattr(post, "source", None), "title", None)
            alexa_rank = self.getMetrics(post.link)
            self.d.append((post.title, post.link, pub_date, description,
                           source, query, alexa_rank))
        # Pause between feed requests.
        time.sleep(2)

    def _build_url(self, query, location=None):
        """Compose the RSS search URL for *query*, optionally scoped to *location*."""
        # Quote the phrase first, then encode it, so the quotation marks are
        # percent-encoded together with the rest of the query.
        url = self.base_url + urllib.parse.quote_plus('"' + query + '"')
        if location is not None:
            place = urllib.parse.quote_plus(location.upper() + ":" + location.lower())
            url += "&hl=" + self.language + "&ceid=" + place
        return url

    def scrape(self):
        """Read every query/location feed and return the results as a DataFrame.

        Returns:
            DataFrame with the columns Title, Link, pubDate, Description,
            Source, Query and Alexa Rank.  It is empty when there are no
            queries.  Persist it with ``df.to_csv(path, encoding='utf-8',
            index=False)`` if a CSV file is wanted.
        """
        # Start from a clean slate: rows left over from a previous call would
        # all count as duplicates below and wipe out the whole result.
        self.d = []

        for a in self.queries:
            query = ''.join(map(str, a))
            if self.locations:
                for b in self.locations:
                    location = ''.join(map(str, b))
                    self.readFeed(self._build_url(query, location), query)
            else:
                self.readFeed(self._build_url(query), query)

        df = pd.DataFrame(self.d, columns=list(self._COLUMNS))

        # keep=False discards EVERY row whose link occurs more than once, not
        # just the repeats.  The original author suggests disabling this when
        # using different keywords.
        return df.drop_duplicates(subset="Link", keep=False)




# Search terms, target locations and interface language for this run.
language = 'en'
locations = ['United States']
queries = ['Apple', 'Microsoft']

# Fetch every feed and gather the rows in a DataFrame.
scraper = NewsScraper(queries, language, locations)
df = scraper.scrape()

# Re-shape every DataFrame row into a plain dict with snake_case keys.
articles = [
    {
        'title': record['Title'],
        'link': record['Link'],
        'pub_date': record['pubDate'],
        'description': record['Description'],
        'source': record['Source'],
        'query': record['Query'],
        'alexa_rank': record['Alexa Rank'],
    }
    for _, record in df.iterrows()
]

print(articles)
# Trailing bare expression: shown as the notebook cell's output.
df


[{'title': "Apple to Open First India Stores Next Week. It Has Big Plans in the Country. - Barron's", 'link': 'https://news.google.com/rss/articles/CBMiO2h0dHBzOi8vd3d3LmJhcnJvbnMuY29tL2FydGljbGVzL2FwcGxlLXN0b3JlLWluZGlhLWM5MTk5MWM30gE_aHR0cHM6Ly93d3cuYmFycm9ucy5jb20vYW1wL2FydGljbGVzL2FwcGxlLXN0b3JlLWluZGlhLWM5MTk5MWM3?oc=5', 'pub_date': '2023/04/11', 'description': "Apple to Open First India Stores Next Week. It Has Big Plans in the Country.&nbsp;&nbsp;Barron'sTim Cook to Open First Apple Stores in India in Pivot Beyond China&nbsp;&nbsp;Yahoo FinanceFor Apple, India Is the Next China&nbsp;&nbsp;The Wall Street Journal", 'source': "Barron's", 'query': 'Apple', 'alexa_rank': None}, {'title': 'Global PC shipments slide in first quarter, Apple takes biggest hit, IDC says - Reuters', 'link': 'https://news.google.com/rss/articles/CBMiZ2h0dHBzOi8vd3d3LnJldXRlcnMuY29tL3RlY2hub2xvZ3kvZ2xvYmFsLXBjLXNoaXBtZW50cy1zbGlkZS1xMS1hcHBsZS10YWtlcy1iaWdnZXN0LWhpdC1pZGMtMjAyMy0wNC0xMC_SAQA?oc=5', 'pub_dat

Unnamed: 0,Title,Link,pubDate,Description,Source,Query,Alexa Rank
0,Apple to Open First India Stores Next Week. It...,https://news.google.com/rss/articles/CBMiO2h0d...,2023/04/11,Apple to Open First India Stores Next Week. It...,Barron's,Apple,
1,"Global PC shipments slide in first quarter, Ap...",https://news.google.com/rss/articles/CBMiZ2h0d...,2023/04/10,"Global PC shipments slide in first quarter, Ap...",Reuters,Apple,
2,Apple releases iOS 16.5 beta 2 with these new ...,https://news.google.com/rss/articles/CBMiPGh0d...,2023/04/11,Apple releases iOS 16.5 beta 2 with these new ...,9to5Mac,Apple,
3,Apple Seeds Second Betas of iOS 16.5 and iPadO...,https://news.google.com/rss/articles/CBMiT2h0d...,2023/04/11,Apple Seeds Second Betas of iOS 16.5 and iPadO...,MacRumors,Apple,
4,Apple's simplified beta process is coming to t...,https://news.google.com/rss/articles/CBMiWmh0d...,2023/04/11,Apple's simplified beta process is coming to t...,Macworld,Apple,
...,...,...,...,...,...,...,...
201,Microsoft CEO Satya Nadella’s No. 1 tip for ca...,https://news.google.com/rss/articles/CBMiXWh0d...,2023/03/24,Microsoft CEO Satya Nadella’s No. 1 tip for ca...,CNBC,Microsoft,
202,Microsoft offers EU remedies seeking OK on Act...,https://news.google.com/rss/articles/CBMiaWh0d...,2023/03/17,Microsoft offers EU remedies seeking OK on Act...,Reuters,Microsoft,
203,Activision shares jump as British competition ...,https://news.google.com/rss/articles/CBMid2h0d...,2023/03/24,Activision shares jump as British competition ...,CNBC,Microsoft,
204,Microsoft Edge launches Workspaces test that l...,https://news.google.com/rss/articles/CBMiXGh0d...,2023/04/05,Microsoft Edge launches Workspaces test that l...,The Verge,Microsoft,


In [6]:
import argparse
import asyncio
import os
import re
import time
import urllib
import urllib.parse
from functools import partial

import feedparser
import openai

# Take the key from the OPENAI_API_KEY environment variable so the real
# secret never has to be written into the notebook.  The placeholder stays
# as the fallback, so nothing changes when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY")

class NewsItem:
    """A single news entry together with its (initially unset) score."""

    def __init__(self, title, link, pub_date, description, source, query):
        # Values describing the feed entry itself.
        self.title, self.link, self.pub_date = title, link, pub_date
        self.description, self.source = description, source
        # The search term that produced this entry.
        self.query = query
        # No score yet; it is assigned after construction.
        self.score = None

class NewsScraper:
    """Asynchronously collect Google News RSS results and score each entry.

    Each query (combined with each location, when locations are given) is
    turned into a Google News RSS search URL.  Every feed entry becomes a
    ``NewsItem`` whose ``score`` is obtained from ``score_news_item`` and
    which is appended to ``self.news_items``.

    Args:
        queries: Iterable of search terms.  Each term is passed through
            ``''.join(map(str, term))``, so a plain string is used as-is.
        language: Value sent as the ``hl`` URL parameter.
        locations: Optional iterable of locations.  Each one is sent as the
            ``ceid`` parameter in the form ``UPPER:lower``.
            NOTE(review): Google News appears to expect values such as
            ``US:en`` here -- confirm that free-text locations are honoured.
    """

    # Compiled once at class creation; matches anything that looks like a tag.
    _TAG_RE = re.compile(r'<.*?>')

    def __init__(self, queries, language="en", locations=None):
        self.base_url = 'https://news.google.com/rss/search?q='
        self.queries = queries
        self.language = language
        # A ``[]`` default would be shared by every instance, so build a
        # fresh list when the caller does not supply one.
        self.locations = locations if locations is not None else []
        self.news_items = []

    def cleanhtml(self, raw_html):
        """Return *raw_html* with every ``<...>`` tag removed.

        HTML entities such as ``&nbsp;`` are left untouched.
        """
        return self._TAG_RE.sub('', raw_html)

    async def readFeed(self, url, query):
        """Parse the feed at *url*, score every entry and store it.

        A missing publication date or source is stored as None, and a missing
        summary as an empty description, instead of aborting the scrape.
        """
        # feedparser.parse performs blocking network I/O; run it in the
        # default executor so the event loop stays responsive.
        loop = asyncio.get_running_loop()
        feed = await loop.run_in_executor(None, feedparser.parse, url)

        for post in feed.entries:
            # Publication date formatted as yyyy/mm/dd.
            published = getattr(post, "published_parsed", None)
            if published:
                pub_date = "%d/%02d/%02d" % (
                    published.tm_year, published.tm_mon, published.tm_mday)
            else:
                pub_date = None

            description = self.cleanhtml(getattr(post, "summary", ""))
            source = getattr(getattr(post, "source", None), "title", None)
            news_item = NewsItem(post.title, post.link, pub_date, description,
                                 source, query)
            news_item.score = await score_news_item(news_item)
            self.news_items.append(news_item)

        # Pause between feed requests without blocking the event loop
        # (time.sleep would freeze every other task for two seconds).
        await asyncio.sleep(2)

    def _build_url(self, query, location=None):
        """Compose the RSS search URL for *query*, optionally scoped to *location*."""
        # Quote the phrase first, then encode it, so the quotation marks are
        # percent-encoded together with the rest of the query.
        url = self.base_url + urllib.parse.quote_plus('"' + query + '"')
        if location is not None:
            place = urllib.parse.quote_plus(location.upper() + ":" + location.lower())
            url += "&hl=" + self.language + "&ceid=" + place
        return url

    async def scrape(self):
        """Read every query/location feed and return ``self.news_items``.

        Items accumulate on the instance, so calling this twice on the same
        scraper returns the entries of both runs.

        Inside Jupyter an event loop is already running, so use
        ``await scraper.scrape()`` directly in a cell; starting a second loop
        there fails with "This event loop is already running".
        """
        for a in self.queries:
            query = ''.join(map(str, a))
            if self.locations:
                for b in self.locations:
                    location = ''.join(map(str, b))
                    await self.readFeed(self._build_url(query, location), query)
            else:
                url = self._build_url(query)
                print("Reading now: ", url)
                await self.readFeed(url, query)

        return self.news_items



RuntimeError: This event loop is already running

_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=NameError("name 'partial' is not defined")>
Traceback (most recent call last):
  File "/var/folders/rl/lrqd2lm160ddq99dqj0dvcc00000gn/T/ipykernel_48150/322503537.py", line 39, in score_news_item
    response = await loop.run_in_executor(None, partial(openai.Completion.create,
NameError: name 'partial' is not defined
