# In [56]:
import feedparser
import time
import sys
import pandas as pd
import re
import urllib
import urllib.request as ur
import argparse
import bs4
import openai

import httpx
import asyncio
from functools import partial
import openai
import os

# Take the API key from the OPENAI_API_KEY environment variable so that a real
# secret never has to be written into this file; the original placeholder value
# is kept as the fallback when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", 'sk-x')
# Module-level sample prompt: a fixed chat preamble followed by the integers
# 0-9 (each rendered as " <n>" and joined with single spaces).
prompt = "The following is a conversation with an AI assistant. The assistant is helpful, creative, clever, and very friendly.\n\nHuman: Hello, who are you?\nAI: I am an AI created by OpenAI. How can I help you today?\nHuman: " + " ".join([f" {i}" for i in range(10)])
async def generate_image(prompt):
    """Request one 256x256 image from the OpenAI Image API and return its URL.

    The blocking ``openai.Image.create`` call is handed to the event loop's
    default executor so the coroutine can be awaited alongside other tasks.

    Args:
        prompt: Text appended to the fixed prefix
            "Black and white minimalistic image of prompt:".

    Returns:
        The ``url`` field of the first entry in the response's "data" list, or
        an empty string when that list is missing or empty (the response is
        printed in that case).
    """
    # get_running_loop() is the recommended way to reach the loop from inside
    # a coroutine; it never creates a new loop implicitly.
    loop = asyncio.get_running_loop()
    request = partial(
        openai.Image.create,
        prompt="Black and white minimalistic image of prompt:" + prompt,
        n=1,
        size="256x256",
    )
    data = await loop.run_in_executor(None, request)
    # Treat both a missing key and an empty result list as an error, so the
    # indexing below cannot raise IndexError.
    if "data" not in data or not data["data"]:
        print(f"Error: {data}")
        return ""
    return data["data"][0]["url"]

async def generate_text(prompt):
    """Send ``prompt`` to the gpt-3.5-turbo chat model and return its reply.

    The blocking ``openai.ChatCompletion.create`` call is handed to the event
    loop's default executor so several requests can be gathered concurrently.

    Args:
        prompt: User message text. The integers 0-9 are appended to it before
            it is sent (see the note in the body).

    Returns:
        The stripped message content of the first choice, or an empty string
        when the response has no "choices" entries (the response is printed in
        that case).
    """
    # get_running_loop() is the recommended way to reach the loop from inside
    # a coroutine; it never creates a new loop implicitly.
    loop = asyncio.get_running_loop()
    # NOTE(review): the original comment read "concatenate prompt to 10 words",
    # but the code appends the integers 0-9 to the prompt. Behaviour is kept
    # as-is; confirm whether truncation to 10 words was the real intent.
    prompt = prompt + " " + " ".join([f" {i}" for i in range(10)])
    request = partial(
        openai.ChatCompletion.create,
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=50,
        n=1,
        stop=None,
        temperature=0.5,
    )
    data = await loop.run_in_executor(None, request)
    # Treat both a missing key and an empty list as an error, so the indexing
    # below cannot raise IndexError.
    if "choices" not in data or not data["choices"]:
        print(f"Error: {data}")
        return ""
    return data["choices"][0]["message"]["content"].strip()


class NewsScraper:
    """Collect Google News RSS search results into a pandas DataFrame.

    One feed is read per query (or per query/location pair when locations are
    given). Every feed entry becomes a tuple appended to ``self.d``; note that
    ``self.d`` is never cleared, so rows accumulate across ``readFeed`` and
    ``scrape`` calls on the same instance.

    Args:
        queries: Iterable of search terms. Each term is passed through
            ``''.join(map(str, term))``, so a string is used unchanged and a
            sequence of parts is concatenated.
        language: Value sent as the ``hl`` URL parameter (only used when
            locations are given).
        locations: Optional sequence of places used to build the ``ceid`` URL
            parameter. Defaults to no locations.
    """

    # Non-greedy match for anything that looks like an HTML tag.
    _TAG_RE = re.compile(r'<.*?>')
    # Column names of the DataFrame returned by scrape(), in tuple order.
    _COLUMNS = ('Title', 'Link', 'pubDate', 'Description',
                'Source', 'Query', 'Alexa Rank')

    def __init__(self, queries, language="en", locations=None):
        self.base_url = 'https://news.google.com/rss/search?q='
        self.queries = queries
        self.language = language
        # A fresh list per instance: a mutable default argument would be
        # shared by every scraper created without explicit locations.
        self.locations = [] if locations is None else locations
        self.d = []

    def getMetrics(self, url):
        """Return the Alexa rank reported for ``url``, or None on any failure.

        Best-effort lookup against data.alexa.com. The original author noted
        that the service only answers from the USA, so a proxy may be needed.
        """
        try:
            # The context manager closes the HTTP response once it is parsed.
            with ur.urlopen("http://data.alexa.com/data?cli=10&dat=s&url=" + url) as response:
                return bs4.BeautifulSoup(response, "xml").find("REACH")["RANK"]
        except Exception:
            # Network errors, a missing REACH element, etc. all mean
            # "rank unknown"; KeyboardInterrupt/SystemExit still propagate.
            return None

    def cleanhtml(self, raw_html):
        """Return ``raw_html`` with every ``<...>`` tag removed."""
        return self._TAG_RE.sub('', raw_html)

    def readFeed(self, url, query):
        """Parse the feed at ``url`` and append one tuple per entry to ``self.d``.

        Each tuple is (title, link, pubDate, description, source, query,
        alexa_rank), with pubDate formatted as yyyy/mm/dd and the description
        stripped of HTML tags. Sleeps two seconds afterwards to space out
        consecutive requests.
        """
        feed = feedparser.parse(url)
        for post in feed.entries:
            published = post.published_parsed
            pub_date = "%d/%02d/%02d" % (
                published.tm_year, published.tm_mon, published.tm_mday)
            self.d.append((
                post.title,
                post.link,
                pub_date,
                self.cleanhtml(post.summary),
                post.source.title,
                query,
                self.getMetrics(post.link),
            ))
        # Add delay between calls
        time.sleep(2)

    def scrape(self):
        """Read every configured feed and return the collected rows.

        Without locations the URL is ``base_url`` plus the quoted query; with
        locations, ``&hl=<language>&ceid=<PLACE:place>`` is appended for each
        query/location pair.

        Returns:
            A DataFrame with columns Title, Link, pubDate, Description,
            Source, Query and Alexa Rank. Rows whose Link occurs more than
            once are all removed (``keep=False``), which may discard articles
            matched by several queries. The caller can persist the result
            with ``df.to_csv(...)``.
        """
        # len() rather than truthiness so any sized sequence is accepted.
        has_locations = len(self.locations) > 0

        for a in self.queries:
            query = ''.join(map(str, a))
            # URL encode the query and add quotes around it
            encoded_query = '"' + urllib.parse.quote_plus(query) + '"'

            if not has_locations:
                self.readFeed(self.base_url + encoded_query, query)
                continue

            for b in self.locations:
                place_name = ''.join(map(str, b))
                place = urllib.parse.quote_plus(
                    place_name.upper() + ":" + place_name.lower())
                url = (self.base_url + encoded_query
                       + "&hl=" + self.language + "&ceid=" + place)
                self.readFeed(url, query)

        df = pd.DataFrame(self.d, columns=list(self._COLUMNS))

        # Remove all rows with the same link - you might want to comment this
        # out when using different keywords.
        df.drop_duplicates(subset="Link", keep=False, inplace=True)
        return df




# queries = ['Apple', 'Microsoft']
# locations = ['United States']
# language = 'en'

# scraper = NewsScraper(queries, language, locations)
# df = scraper.scrape()

# articles = []

# for _, row in df.iterrows():
#     article = {
#         'title': row['Title'],
#         'link': row['Link'],
#         'pub_date': row['pubDate'],
#         'description': row['Description'],
#         'source': row['Source'],
#         'query': row['Query'],
#         'alexa_rank': row['Alexa Rank']
#     }
#     articles.append(article)

# print(articles)
# df


# In [58]:
import asyncio

async def main(queries=None, locations=None, language='en', max_rows=7,
               output_path='news.csv'):
    """Scrape news, summarise the descriptions per source, and write a CSV.

    Steps: scrape the feeds with ``NewsScraper``, print the rows as a list of
    dicts, sort by link, join all descriptions that share a source, keep one
    row per source, ask ``generate_text`` for a summary of each of the first
    ``max_rows`` rows, and save the result.

    Args:
        queries: Search terms; defaults to ['Singapore', 'Malaysia'].
        locations: Locations passed to the scraper; defaults to ['Singapore'].
        language: Language code passed to the scraper.
        max_rows: Maximum number of per-source rows to summarise and save.
        output_path: Destination of the CSV file.
    """
    if queries is None:
        queries = ['Singapore', 'Malaysia']
    if locations is None:
        locations = ['Singapore']

    scraper = NewsScraper(queries, language, locations)
    df = scraper.scrape()

    # DataFrame column -> key used in the printed article dicts.
    field_for_column = {
        'Title': 'title',
        'Link': 'link',
        'pubDate': 'pub_date',
        'Description': 'description',
        'Source': 'source',
        'Query': 'query',
        'Alexa Rank': 'alexa_rank',
    }
    articles = [
        {field: row[column] for column, field in field_for_column.items()}
        for _, row in df.iterrows()
    ]
    print(articles)

    # 1. Sort the DataFrame by the 'Link' column.
    df = df.sort_values(by=['Link'])

    # 2. Concatenate the descriptions of all rows that share a source, then
    #    keep only the first row of each source.
    df['concatenated_description'] = df.groupby('Source')['Description'].transform(lambda x: ' '.join(x))
    df = df.drop_duplicates(subset='Source').reset_index(drop=True)

    # 3. Limit the DataFrame to max_rows rows and summarise each one. The
    #    explicit copy makes the later column assignment act on an independent
    #    frame rather than on a slice of the previous one.
    df = df.head(max_rows).copy()
    story_prompts = ["summarize the following news descriptions: in 15 words " + desc for desc in df['concatenated_description']]
    stories = await asyncio.gather(*(generate_text(story_prompt) for story_prompt in story_prompts))
    print(stories)
    df['story'] = stories
    df.to_csv(output_path, index=False)

# Top-level ``await`` is only accepted by environments that support it, such
# as an IPython/Jupyter cell; a plain script would need ``asyncio.run(main())``.
await main()


[{'title': 'Opinion | He Made His Country Rich, but Something Has Gone Wrong With the System - The New York Times', 'link': 'https://news.google.com/rss/articles/CBMiYWh0dHBzOi8vd3d3Lm55dGltZXMuY29tLzIwMjMvMDQvMTIvb3Bpbmlvbi9pbnRlcm5hdGlvbmFsLXdvcmxkL3NpbmdhcG9yZS1hdXRvY3JhY3ktZGVtb2NyYWN5Lmh0bWzSAQA?oc=5', 'pub_date': '2023/04/12', 'description': 'Opinion | He Made His Country Rich, but Something Has Gone Wrong With the System&nbsp;&nbsp;The New York Times', 'source': 'The New York Times', 'query': 'Singapore', 'alexa_rank': None}, {'title': 'Singapore no place for welfarism or ‘populist ideas’, No 2 leader says - South China Morning Post', 'link': 'https://news.google.com/rss/articles/CBMijAFodHRwczovL3d3dy5zY21wLmNvbS93ZWVrLWFzaWEvcG9saXRpY3MvYXJ0aWNsZS8zMjE3MzY5L3NpbmdhcG9yZXMtbmV4dC1sZWFkZXItbGF3cmVuY2Utd29uZy1maXJlcy1mcmVzaC1zYWx2by1vcHBvc2l0aW9uLWFuZC1wb3B1bGlzdC1pZGVhc9IBjAFodHRwczovL2FtcC5zY21wLmNvbS93ZWVrLWFzaWEvcG9saXRpY3MvYXJ0aWNsZS8zMjE3MzY5L3NpbmdhcG9yZXMtbmV4dC1sZWFkZXIt