# Data Download

This notebook downloads news data from the following sources:
- [goperigon.com](https://goperigon.com)
- [newsdata.io](https://newsdata.io)
- [mediastack.com](https://mediastack.com)
- [thenewsapi.com](https://thenewsapi.com)
- [marketaux.com](https://marketaux.com)


In [38]:
import requests
import logging
import json
import pandas as pd
import numpy as np
import re
from datetime import datetime

_LOG = logging.getLogger(__name__)

In [47]:
# Configure how pandas renders dataframes: no limit on the number of columns
# or on column width, no wrapping of wide frames across several blocks,
# right-justified column headers, and a display width of 100 characters.
pd.set_option(
    'display.max_columns', None,
    'display.max_colwidth', None,
    'display.expand_frame_repr', False,
    'display.colheader_justify', 'right',
    'display.width', 100,
)

## goperigon.com pipeline

In [225]:
class goperigonPipeline:
    """
    Handles fetching data from the goperigon.com API.

    Typical usage: instantiate the class, optionally call `set_query_params()`
    to choose the endpoint and query parameters, then call `fetch_data()` to
    get the response as a DataFrame.
    """

    # Endpoints accepted by `set_query_params()`.
    _VALID_ENDPOINTS = ('all', 'stories/all', 'sources', 'journalists', 'people/all', 'companies')
    # Free-text fields whose whitespace is normalized.
    _TEXT_COLUMNS = ('title', 'description', 'summary')
    # Fields converted to tz-naive UTC datetimes.
    _DATE_COLUMNS = ('pubDate', 'addDate', 'refreshDate')
    # Fields holding lists of dicts that are unpacked into "{column}.{key}" columns.
    _NESTED_COLUMNS = ('keywords', 'topics', 'categories', 'entities')

    # NOTE(review): a real-looking API key is hard-coded as the default below. It
    # is kept only for backward compatibility; consider rotating it and loading
    # the key from an environment variable or a secrets store instead.
    def __init__(self, api_key: str = 'f745aea8-78bf-4a63-b98d-ba22320b90ad'):
        """
        Initializes the pipeline.

        :param api_key: API key for authentication.
        """
        self.api_key = api_key
        self.base_url = "https://api.goperigon.com/v1/"
        self.headers = {"Authorization": f"Bearer {api_key}"}
        # Never write the full credential to the logs.
        _LOG.debug(
            "%s initialized with API key: %s",
            self.__class__.__name__,
            self._mask_secret(api_key),
        )
        # Start from the default endpoint and query so that `fetch_data()` works
        # even when `set_query_params()` is never called explicitly.
        self.set_query_params()

    @staticmethod
    def _mask_secret(secret: str) -> str:
        """
        Return a loggable form of a secret: its first 4 characters followed by `***`.
        """
        return f"{secret[:4]}***" if secret else "<empty>"

    @staticmethod
    def _clean_text(text: str) -> str:
        """
        Collapse every run of whitespace (including newlines) into a single
        space and strip leading and trailing spaces.
        """
        return re.sub(r"\s+", " ", text).strip()

    def set_query_params(self, endpoint: str = 'all', **kwargs) -> None:
        """
        Specify the API endpoint and the query parameters for fetching data.

        :param endpoint: The API endpoint to fetch data from.
            - 'all'        : Provides functionality for searching and filtering all
                             news articles available on the API.
            - 'stories/all': Fetches data from all the sources.
            - 'sources'    : Provides functionality to search and filter for media
                             sources from around the world.
            - 'journalists': Provides functionality for searching and filtering all
                             journalists available on the database.
            - 'people/all' : Search and retrieve additional information on known persons
                             that exist within Perigon's entity database and as referenced
                             in any article response object.
            - 'companies'  : Search and retrieve additional information on companies that
                             exist within Perigon's entity database and as referenced in
                             any article response object.
        :param **kwargs: The query parameters to be passed. E.g., 'category', 'topic', etc.
                         Visit `https://docs.goperigon.com/docs/getting-started` for more
                         information on query params for each endpoint.
        :raises ValueError: If `endpoint` is not one of the supported endpoints.
        """
        # Validate first so that an invalid call leaves the current state untouched.
        if endpoint not in self._VALID_ENDPOINTS:
            raise ValueError(f"Invalid endpoint: {endpoint}")
        self.endpoint = endpoint
        # Fall back to the default query when no parameters are passed.
        if kwargs:
            self.params = dict(kwargs)
        else:
            self.params = {
                "category": 'Finance',
                "topic": 'Cryptocurrency',
            }

    def __process_data(self, response) -> pd.DataFrame:
        """
        Clean and store the JSON response in a DataFrame.

        Text fields are whitespace-normalized in place (the passed response is
        modified), date fields are converted to tz-naive UTC datetimes and the
        nested list-of-dict fields are unpacked into "{column}.{key}" columns
        holding lists of double-quoted strings. Fields that are absent from
        the response are skipped.

        :param response: JSON response of the API request.
        :return: Dataframe of the JSON response.
        :raises KeyError: If the response has no 'articles' key.
        """
        articles = response['articles']
        for article in articles:
            for column in self._TEXT_COLUMNS:
                text = article.get(column)
                # Skip missing, null and non-text values.
                if isinstance(text, str):
                    article[column] = self._clean_text(text)
        # Convert the JSON response into a dataframe.
        df = pd.json_normalize(articles)
        # Convert dates to datetime objects.
        for column in self._DATE_COLUMNS:
            if column not in df.columns:
                continue
            # `utc=True` treats naive timestamps as UTC and converts aware ones to
            # UTC (even with mixed offsets); the tz info is then dropped.
            df[column] = pd.to_datetime(df[column], utc=True).dt.tz_convert(None)
        # Unpack column values.
        for column in self._NESTED_COLUMNS:
            if column not in df.columns:
                continue
            exploded_df = df.explode(column)
            # Normalize the data in column; non-dict entries (e.g. NaN produced by
            # exploding an empty list) become all-NaN rows.
            normalized_df = pd.json_normalize(exploded_df[column].tolist())
            # Drop the original column.
            df = df.drop(column, axis=1)
            if normalized_df.shape[1] == 0:
                # Nothing to unpack: no entry of this column carried any key.
                continue
            # `DataFrame.map` only exists in pandas >= 2.1; fall back to `applymap`.
            elementwise = getattr(normalized_df, "map", None) or normalized_df.applymap
            # Format strings to be enclosed in double quotes.
            normalized_df = elementwise(lambda x: f'"{x}"')
            # Aggregate the values by the index of the originating article.
            aggregated = normalized_df.groupby(by=exploded_df.index).agg(list)
            # Rename the columns to follow the format "{column}.{key}".
            aggregated.columns = [f"{column}.{key}" for key in aggregated.columns]
            # Attach the aggregated data back to the original DataFrame.
            df = df.join(aggregated)
        return df

    def fetch_data(self, timeout: float = 30.0) -> pd.DataFrame:
        """
        Fetches data from a specified endpoint of the goperigon.com API.

        The raw JSON response is kept in `self.json_response`.

        :param timeout: Seconds to wait for the server before giving up; pass
                        `None` to wait indefinitely.
        :return: Dataframe of the JSON response from the API.
        :raises requests.RequestException: If the request fails or returns an
                                           HTTP error status.
        """
        # Create the URL to fetch the data specifying the endpoint.
        url = self.base_url + self.endpoint
        # Fetch data from the API endpoint with the passed parameters.
        try:
            response = requests.get(
                url=url,
                headers=self.headers,
                params=self.params,
                timeout=timeout,
            )
            _LOG.debug("Fetching data from %s", response.request.url)
            response.raise_for_status()
            self.json_response = response.json()
        except requests.RequestException as e:
            _LOG.error("Error fetching data: %s", e)
            raise
        _LOG.info("Data fetched successfully.")
        # Compile the data in a dataframe.
        return self.__process_data(self.json_response)
        

### Sample data


In [226]:
# Instantiate the goperigon.com pipeline with its default API key.
datapipeline_1 = goperigonPipeline()
# No arguments: use the default endpoint and the default query parameters.
datapipeline_1.set_query_params()
# Perform the request and preview the first rows of the resulting dataframe.
df1 = datapipeline_1.fetch_data()
df1.head()

Unnamed: 0,url,authorsByline,articleId,clusterId,imageUrl,country,language,pubDate,addDate,refreshDate,score,title,description,content,medium,labels,matchedAuthors,claim,verdict,summary,translation,locations,reprint,places,source.domain,source.location,sentiment.positive,sentiment.negative,sentiment.neutral,keywords.name,keywords.weight,topics.name,categories.name,entities.data,entities.type,entities.mentions
0,https://www.marketpulse.com/20210820/commodities-cryptos-oil-struggles-demand-outlook-strong-dollar-gold-hits-wall-bitcoin-coiling/,,9be5702e780b472da9abee76c30d65c3,445e1bf9ea2e4fb0bc2065e55c78311e,https://www.marketpulse.com/wp-content/uploads/2021/06/AdobeStock_224470162.jpeg,us,en,2021-08-20 15:34:07,2021-08-22 05:50:35.021786,2021-08-22 05:50:35.021786,2.0,"Commodities and Cryptos: Oil struggles over demand outlook and strong dollar, Gold hits a wall, Bitcoin coiling up","Commodities and Cryptos: Oil struggles over demand outlook and strong dollar, Gold hits a wall, Bitcoin coiling up MarketPulse",Delta variant cases continue to wreak havoc over the short-term crude demand outlook. Oil prices have been in freefall as Wall Street turns cautious over delta variant jitters and as Fed taper expecta... [3313 symbols],Article,[],[],,,"Oil prices have been in freefall as Wall Street turns cautious over delta variant jitters and as Fed taper expectations boost the dollar. The Charles Schwab Active Trader Pulse Survey showed that active traders that have not changed strategies due to delta are leaning toward increasing their cash exposure and decreasing overall equities exposure. If Fed Chair Powell taps the brakes on the Fed’s plans over tapering, that could be the catalyst to take prices above the $1800 level. 
The price of Bitcoin could break above the $50,000 level next week if Fed Chair Powell decides he wants to slow up the Fed’s plans on tapering.",,[],False,,marketpulse.com,,0.468117,0.446074,0.085809,"[""overall equities exposure"", ""delta variant jitters"", ""Fed Chair Powell"", ""Delta variant cases"", ""September 1st"", ""Fed taper expectations"", ""Wall Street"", ""Fed"", ""many active traders"", ""crude prices""]","[""0.08665211"", ""0.07696954"", ""0.074482255"", ""0.0736213"", ""0.07168974"", ""0.07140076"", ""0.07022602"", ""0.06344405"", ""0.063404724"", ""0.058125477""]","[""Bitcoin"", ""Cryptocurrency"", ""Markets""]","[""Finance""]","[""Powell"", ""Fed"", ""OPEC+"", ""Charles Schwab"", ""Treasury"", ""OANDA Corporation"", ""US"", ""Asia"", ""Jackson Hole""]","[""PERSON"", ""ORG"", ""ORG"", ""ORG"", ""ORG"", ""ORG"", ""GPE"", ""LOC"", ""EVENT""]","[""3.0"", ""5.0"", ""2.0"", ""1.0"", ""1.0"", ""1.0"", ""1.0"", ""1.0"", ""1.0""]"
1,https://www.benzinga.com/money/coinbase-vs-voyager/,Micah C. Miracle,e3e4a29fc371468683b068bc4b82e25a,031c8fd7135f42e3acf1404c20781061,https://cdnwp-s3.benzinga.com/wp-content/uploads/2019/08/23211021/AdobeStock_188417474.jpeg,us,en,2021-08-20 15:46:20,2021-08-22 05:47:50.916683,2021-08-22 05:47:50.916683,2.0,Voyager vs. Coinbase • Exchange Comparison • Benzinga,Voyager vs. Coinbase • Exchange Comparison • Benzinga Benzinga,"Want to jump straight to the answer? Voyager is better for staking and Coinbase is better for trading.\n\nWhether you prefer a decentralized exchange (DEX) or a traditional trading platform, the options... [6282 symbols]",Article,[],[],,,"And with a number of altcoins making headlines again, exchanges are pressured to increase their offerings. A few things stand out for cryptocurrency exchanges when compared to stock exchanges that deal primarily with equities. Most importantly, cryptocurrency exchanges have conformed to the regulatory demands of their specific industry. 
While some platforms, like Robinhood, allow for the trading of crypto and stocks, cryptocurrency exchanges are more often exclusively geared toward trading digital assets.",,[],False,,benzinga.com,,0.050433,0.035838,0.913729,"[""cryptocurrency exchanges"", ""exchanges"", ""stock exchanges"", ""multiple exchanges"", ""reputable exchanges"", ""Coinbase customers"", ""Coinbase fees"", ""regulated North American exchanges"", ""Coinbase Pro"", ""Coinbase traffic""]","[""0.10764303"", ""0.099957354"", ""0.099785134"", ""0.09959504"", ""0.096862905"", ""0.087101586"", ""0.0859258"", ""0.08557153"", ""0.082459144"", ""0.08056351""]","[""Coinbase"", ""Cryptocurrency""]","[""Finance"", ""Tech""]","[""Coinbase"", ""Voyager"", ""Robinhood"", ""Voyager"", ""Coinbase Pro"", ""a Smart Order Router"", ""Android"", ""iPhone"", ""U.S."", ""North American""]","[""ORG"", ""ORG"", ""ORG"", ""PRODUCT"", ""PRODUCT"", ""PRODUCT"", ""PRODUCT"", ""PRODUCT"", ""GPE"", ""NORP""]","[""20.0"", ""11.0"", ""1.0"", ""7.0"", ""3.0"", ""1.0"", ""1.0"", ""1.0"", ""1.0"", ""1.0""]"
2,https://www.investing.com/news/cryptocurrency-news/wells-fargo-and-jpmorgan-both-registered--bitcoin-fund-with-sec-2596106,CoinQuora,5557b045741941ba83661eda4e8e613e,61a5e915fa484c8c842d391aeb86110f,https://i-invdn-com.investing.com/news/LYNXMPEDB30UQ_L.jpg,us,en,2021-08-21 08:00:00,2021-08-24 06:52:41.800498,2021-08-24 06:52:41.800498,2.0,Wells Fargo and JPMorgan Both Registered Bitcoin Fund With SEC By CoinQuora,Wells Fargo and JPMorgan Both Registered Bitcoin Fund With SEC By CoinQuora Investing.com,Bitcoin (BTC) may be the last thing one might consider when thinking about ways to ensure a clean-energy future. This has especially become the case following a tweet sent by tech...,Article,[],[],,,Bitcoin (BTC) may be the last thing one might consider when thinking about ways to ensure a clean-energy future. This has especially become the case following a tweet sent by tech...,,[],True,,investing.com,,0.113863,0.070369,0.815768,"[""tech"", ""ways"", ""a clean-energy future"", ""BTC"", ""the last thing"", ""Bitcoin"", ""a tweet"", ""the case"", ""one""]","[""0.15203515"", ""0.12833458"", ""0.10613389"", ""0.07957909"", ""0.0748607"", ""0.070247345"", ""0.0471701"", ""0.03675601"", ""0.0""]","[""Bitcoin"", ""Cryptocurrency"", ""JPMorgan"", ""Markets"", ""SEC"", ""Wells Fargo""]","[""Finance""]","[""nan""]","[""nan""]","[""nan""]"
3,https://www.benzinga.com/pressreleases/21/08/g22606730/polyplay-integrating-crypto-currencies-to-the-real-world,Globe Newswire,29c386004945426394af8a54bfbedc86,4b9b098299cb4d90bf602a7b1c14758b,https://cdn.benzinga.com/files/imagecache/og_image_social_share_1200x630/sites/all/themes/bz2/images/bz-icon.png,us,en,2021-08-21 19:39:00,2021-08-24 06:52:41.806063,2021-08-24 06:52:41.806063,2.0,PolyPlay: Integrating Crypto-Currencies To The Real World - Benzinga,PolyPlay: Integrating Crypto-Currencies To The Real World - Benzinga Benzinga,"Toronto, Canada, Aug. 21, 2021 (GLOBE NEWSWIRE) -- The last two years have been a game-changer for the e-sport industry, especially within the Blockchain space. In 2020, the general e-sport industry g... [4316 symbols]",Article,[],[],,,"During this period, a new generation of gamers rose, including children who have shown profound skills. PolyPlay is a blockchain-based e-sports platform created to host e-sport tournaments for amateur players. PolyPlay aims to host E-sport tournaments for all players in a professional manner despite their skill levels. 
The PolyPlay ecosystem has a native token called PLAY, a total supply of 1 million PLAY.",,[],True,,benzinga.com,,0.194076,0.010075,0.795849,"[""E-sport tournaments"", ""e-sport tournaments"", ""PLAY"", ""new gaming projects"", ""profound skills"", ""Gamers"", ""gamers"", ""gaming"", ""amateur players"", ""official PolyPlay gear""]","[""0.082520135"", ""0.082520135"", ""0.08045774"", ""0.076019704"", ""0.068887725"", ""0.0678535"", ""0.0678535"", ""0.06350775"", ""0.05906137"", ""0.057903588""]","[""Cryptocurrency""]","[""Finance""]","[""BC Simon"", ""Pritesh Kucheira"", ""PolyPlay"", ""YouTube"", ""PCS"", ""Oxbull"", ""DEX"", ""CEX"", ""PLayBet Casino"", ""DOTA2"", ""LOL"", ""CS:"", ""FIFA"", ""NFTs Card Game"", ""Toronto"", ""Canada"", ""Cardano"", ""Elrond""]","[""PERSON"", ""PERSON"", ""ORG"", ""ORG"", ""ORG"", ""ORG"", ""ORG"", ""ORG"", ""ORG"", ""PRODUCT"", ""PRODUCT"", ""PRODUCT"", ""PRODUCT"", ""PRODUCT"", ""GPE"", ""GPE"", ""GPE"", ""GPE""]","[""1.0"", ""1.0"", ""18.0"", ""1.0"", ""1.0"", ""1.0"", ""1.0"", ""1.0"", ""1.0"", ""1.0"", ""1.0"", ""1.0"", ""1.0"", ""1.0"", ""1.0"", ""1.0"", ""1.0"", ""1.0""]"
4,https://kryptomoney.com/wells-fargo-to-launch-bitcoin-investment-fund-for-rich-clients/,,4f082111f0fe4af5a45126a7686c3ecc,6ad7d19bb8c142e49d5fd9885d561e5e,https://kryptomoney.com/wp-content/uploads/2021/01/bitcoin-4481815_640-e1624986314941.jpg,us,en,2021-08-20 15:47:46,2021-08-22 05:51:57.680426,2021-08-22 05:51:57.680426,2.0,Wells Fargo To Launch Bitcoin Investment Fund For Rich Clients,Wells Fargo To Launch Bitcoin Investment Fund For Rich Clients KryptoMoney,Wells Fargo has registered a Bitcoin investment fund for its wealthy clients. The mega bank is offering select clients this service to help them access some exposure to Bitcoin.\n\nA Form D (Notice of E... [1353 symbols],Article,[],[],,,"A Form D (Notice of Exempt Offering of Securities) filing with the United States Securities and Exchange Commission (SEC) revealed that the fund is provided as part of a limited partnership with alternative assets management service FS Investments, and the New York Digital Investment Group (NYDIG). The filing revealed that the fund is dubbed “FS NYDIG BITCOIN FUND I” and also that Wells Fargo Clearing Services will handle servicing and placement fees for clients. Just like Wells Fargo, JPMorgan is restricting access to wealthy clients and is also collaborating with the NYDIG. 
The bank’s entry into the crypto sector is particularly interesting, considering that CEO Jamie Dimon is popular for his dislike of the crypto sector.",,[],True,,kryptomoney.com,,0.068244,0.017814,0.913941,"[""CEO Jamie Dimon"", ""FS NYDIG"", ""wealthy clients"", ""NYDIG"", ""Wells Fargo Clearing Services"", ""Jamie Dimon"", ""clients"", ""Wells Fargo"", ""alternative assets management service FS Investments"", ""“FS NYDIG BITCOIN FUND I""]","[""0.11179761"", ""0.11065052"", ""0.10870049"", ""0.10132371"", ""0.09702056"", ""0.096657015"", ""0.09642714"", ""0.093534894"", ""0.08896825"", ""0.08611909""]","[""Bitcoin"", ""Cryptocurrency"", ""Wells Fargo""]","[""Finance""]","[""Jamie Dimon"", ""Wells Fargo"", ""JPMorgan Chase"", ""the United States Securities and Exchange Commission"", ""SEC"", ""FS Investments"", ""the New York Digital Investment Group"", ""FS NYDIG"", ""Wells Fargo Clearing Services"", ""BNY Mellon"", ""Morgan Stanley"", ""Goldman Sachs"", ""NYDIG""]","[""PERSON"", ""ORG"", ""ORG"", ""ORG"", ""ORG"", ""ORG"", ""ORG"", ""ORG"", ""ORG"", ""ORG"", ""ORG"", ""ORG"", ""ORG""]","[""1.0"", ""3.0"", ""3.0"", ""1.0"", ""1.0"", ""1.0"", ""1.0"", ""1.0"", ""1.0"", ""1.0"", ""1.0"", ""1.0"", ""1.0""]"


## newsdata.io pipeline

In [185]:
class newsdataPipeline:
    """
    Handles fetching data from the newsdata.io API.

    Typical usage: instantiate the class, optionally call `set_query_params()`
    to choose the endpoint and query parameters, then call `fetch_data()` to
    get the response as a DataFrame.
    """

    # Endpoints accepted by `set_query_params()`.
    _VALID_ENDPOINTS = ('news', 'crypto', 'archive', 'sources')
    # Free-text fields whose whitespace is normalized.
    _TEXT_COLUMNS = ('title', 'description', 'content')
    # Fields converted to tz-naive UTC datetimes.
    _DATE_COLUMNS = ('pubDate',)

    # NOTE(review): a real-looking API key is hard-coded as the default below. It
    # is kept only for backward compatibility; consider rotating it and loading
    # the key from an environment variable or a secrets store instead.
    def __init__(self, api_key: str = 'pub_41599cecccb2e7df362a6da41983f3f8729dd'):
        """
        Initializes the pipeline.

        :param api_key: API key for authentication.
        """
        self.api_key = api_key
        self.base_url = 'https://newsdata.io/api/1/'
        self.headers = {"X-ACCESS-KEY": api_key}
        # Never write the full credential to the logs.
        _LOG.debug(
            "%s initialized with API key: %s",
            self.__class__.__name__,
            self._mask_secret(api_key),
        )
        # Start from the default endpoint and query so that `fetch_data()` works
        # even when `set_query_params()` is never called explicitly.
        self.set_query_params()

    @staticmethod
    def _mask_secret(secret: str) -> str:
        """
        Return a loggable form of a secret: its first 4 characters followed by `***`.
        """
        return f"{secret[:4]}***" if secret else "<empty>"

    @staticmethod
    def _clean_text(text: str) -> str:
        """
        Collapse every run of whitespace (including newlines) into a single
        space and strip leading and trailing spaces.
        """
        return re.sub(r"\s+", " ", text).strip()

    def set_query_params(self, endpoint: str = 'news', **kwargs) -> None:
        """
        Specify the API endpoint and the query parameters for fetching data.

        :param endpoint: The API endpoint to fetch data from.
            - 'news'    : Provides access to the latest and breaking news up to the past 48 hrs.
            - 'crypto'  : Provides crypto related news and blog data.
            - 'archive' : Provides access to the old news data up to the past 2 years.
            - 'sources' : Provides names of randomly selected 100 domains from a country,
                          category or/and language.
        :param **kwargs: The query parameters to be passed. Visit `https://newsdata.io/documentation`
                         for more information on query params for each endpoint.
        :raises ValueError: If `endpoint` is not one of the supported endpoints.
        """
        # Validate first so that an invalid call leaves the current state untouched.
        if endpoint not in self._VALID_ENDPOINTS:
            raise ValueError(f"Invalid endpoint: {endpoint}")
        self.endpoint = endpoint
        # Fall back to the default query when no parameters are passed.
        if kwargs:
            self.params = dict(kwargs)
        else:
            self.params = {
                'q': "cryptocurrency",
            }

    def __process_data(self, response) -> pd.DataFrame:
        """
        Clean and store the JSON response in a DataFrame.

        Text fields are whitespace-normalized in place (the passed response is
        modified) and date fields are converted to tz-naive UTC datetimes.
        Fields that are absent from the response are skipped.

        :param response: JSON response of the API request.
        :return: Dataframe of the JSON response.
        :raises KeyError: If the response has no 'results' key.
        """
        results = response['results']
        for article in results:
            for column in self._TEXT_COLUMNS:
                text = article.get(column)
                # Skip missing, null and non-text values.
                if isinstance(text, str):
                    article[column] = self._clean_text(text)
        # Convert the JSON response into a dataframe.
        df = pd.json_normalize(results)
        # Convert dates to datetime objects.
        for column in self._DATE_COLUMNS:
            if column not in df.columns:
                continue
            # `utc=True` treats naive timestamps as UTC and converts aware ones to
            # UTC (even with mixed offsets); the tz info is then dropped.
            df[column] = pd.to_datetime(df[column], utc=True).dt.tz_convert(None)
        return df

    def fetch_data(self, timeout: float = 30.0) -> pd.DataFrame:
        """
        Fetches news data from NewsData.io.

        The raw JSON response is kept in `self.json_response`.

        :param timeout: Seconds to wait for the server before giving up; pass
                        `None` to wait indefinitely.
        :return: Dataframe of the JSON response from the API.
        :raises requests.RequestException: If the request fails or returns an
                                           HTTP error status.
        """
        # Create the URL to fetch the data specifying the endpoint.
        url = self.base_url + self.endpoint
        # Fetch data from the API with the passed parameters.
        try:
            response = requests.get(
                url=url,
                headers=self.headers,
                params=self.params,
                timeout=timeout,
            )
            _LOG.debug("Fetching data from %s", response.request.url)
            response.raise_for_status()
            self.json_response = response.json()
        except requests.RequestException as e:
            _LOG.error("Error fetching data: %s", e)
            raise
        _LOG.info("Data fetched successfully.")
        # Compile the data in a dataframe.
        return self.__process_data(self.json_response)



### Sample data

In [186]:
# Instantiate the newsdata.io pipeline with its default API key.
datapipeline_2 = newsdataPipeline()
# No arguments: use the default endpoint and the default query parameters.
datapipeline_2.set_query_params()
# Perform the request and preview the first rows of the resulting dataframe.
df_newsdata = datapipeline_2.fetch_data()
df_newsdata.head()

Unnamed: 0,article_id,title,link,keywords,creator,video_url,description,content,pubDate,image_url,source_id,source_priority,source_url,source_icon,language,country,category,ai_tag,sentiment,sentiment_stats,ai_region
0,5e8936b0f79b89d2721e38d786f626c7,Make Money With Crypto By Investing In Floki Inu As Scorpion Casino Plans $8M PinkSale Strategy For PancakeSwap & CEX Success,https://www.tekedia.com/make-money-with-crypto-by-investing-in-floki-inu-as-scorpion-casino-plans-8m-pinksale-strategy-for-pancakeswap-cex-success/,[community insights],[TI Partners],,"The cryptocurrency market offers a diverse range of investment opportunities, with established projects like Floki Inu and innovative ventures like Scorpion Casino capturing investor attention. This article examines the potential of Floki Inu as an investment and delves into Scorpion Casino’s presale strategy on PinkSale.finance, designed to propel the platform towards success on PancakeSwap and […] The post Make Money With Crypto By Investing In Floki Inu As Scorpion Casino Plans $8M PinkSale Strategy For PancakeSwap & CEX Success appeared first on Tekedia.",ONLY AVAILABLE IN PAID PLANS,2024-04-10 21:55:50,https://tkcdn.tekedia.com/wp-content/uploads/2024/04/10175538/scorpion-casino-6-768x650.jpg,tekedia,422136,https://www.tekedia.com,https://i.bytvi.com/domain_icons/tekedia.jpg,english,[nigeria],[top],ONLY AVAILABLE IN PROFESSIONAL AND CORPORATE PLANS,ONLY AVAILABLE IN PROFESSIONAL AND CORPORATE PLANS,ONLY AVAILABLE IN PROFESSIONAL AND CORPORATE PLANS,ONLY AVAILABLE IN CORPORATE PLANS
1,7c8f76f48022199e843fc4e24c0dd8a7,Scorpion Casino ($SCORP) Unveils a $8M PinkSale Strategy to Rival Dogecoin ($DOGE) & Shiba Inu ($SHIB),https://www.tekedia.com/scorpion-casino-scorp-unveils-a-8m-pinksale-strategy-to-rival-dogecoin-doge-shiba-inu-shib/,[community insights],[TI Partners],,"The realm of meme coins like Dogecoin ($DOGE) and Shiba Inu ($SHIB) has taken the crypto world by storm. Their playful branding and vibrant online communities have captured the hearts of many investors. But a new breed of crypto project is emerging, offering more than just hype. Scorpion Casino is a revolutionary online casino that […] The post Scorpion Casino ($SCORP) Unveils a $8M PinkSale Strategy to Rival Dogecoin ($DOGE) & Shiba Inu ($SHIB) appeared first on Tekedia.",ONLY AVAILABLE IN PAID PLANS,2024-04-10 21:53:23,https://tkcdn.tekedia.com/wp-content/uploads/2024/04/02062924/scorpion-casino-logo-768x440.jpg,tekedia,422136,https://www.tekedia.com,https://i.bytvi.com/domain_icons/tekedia.jpg,english,[nigeria],[top],ONLY AVAILABLE IN PROFESSIONAL AND CORPORATE PLANS,ONLY AVAILABLE IN PROFESSIONAL AND CORPORATE PLANS,ONLY AVAILABLE IN PROFESSIONAL AND CORPORATE PLANS,ONLY AVAILABLE IN CORPORATE PLANS
2,3d62afe653435771e830fc8757c9fccc,Prepare For The Next Crypto Pump With Polkadot As Scorpion Casino Launches April 15th,https://www.tekedia.com/prepare-for-the-next-crypto-pump-with-polkadot-as-scorpion-casino-launches-april-15th/,[community insights],[TI Partners],,"The crypto market is abuzz with anticipation for the next big pump! While established players like Bitcoin and Ethereum are always in the spotlight, savvy investors are looking beyond the usual suspects. This article delves into two exciting altcoins with the potential to explode in value: Polkadot (DOT), and the intriguing newcomer, Scorpion Casino (SCORP). […] The post Prepare For The Next Crypto Pump With Polkadot As Scorpion Casino Launches April 15th appeared first on Tekedia.",ONLY AVAILABLE IN PAID PLANS,2024-04-10 21:51:19,https://tkcdn.tekedia.com/wp-content/uploads/2024/02/25082014/scorpion-casino-2-768x479.jpg,tekedia,422136,https://www.tekedia.com,https://i.bytvi.com/domain_icons/tekedia.jpg,english,[nigeria],[top],ONLY AVAILABLE IN PROFESSIONAL AND CORPORATE PLANS,ONLY AVAILABLE IN PROFESSIONAL AND CORPORATE PLANS,ONLY AVAILABLE IN PROFESSIONAL AND CORPORATE PLANS,ONLY AVAILABLE IN CORPORATE PLANS
3,6d08c48c78d406f881712f3ff2672819,"If You Can Only Buy One Meme Stock in April, It Better Be One of These 3 Names",https://investorplace.com/2024/04/if-you-can-only-buy-one-meme-stock-in-april-it-better-be-one-of-these-3-names/,"[stocks to buy, nasdaq:coin, nyse:uber, nasdaq:amd, stocks to buy]",[Matthew Farley],,"InvestorPlace - Stock Market News, Stock Advice & Trading Tips Seize the bull market. These three meme stocks to buy are primed for potential profits amidst rising investor sentiment and market momentum. The post If You Can Only Buy One Meme Stock in April, It Better Be One of These 3 Names appeared first on InvestorPlace. More From InvestorPlace The #1 AI Investment Might Be This Company You’ve Never Heard Of Musk’s “Project Omega” May Be Set to Mint New Millionaires. Here’s How to Get In. It doesn’t matter if you have $500 or $5 million. Do this now.",ONLY AVAILABLE IN PAID PLANS,2024-04-10 21:47:27,https://investorplace.com/wp-content/uploads/2021/12/meme-stocks-768x432.png,investorplace,3707,https://investorplace.com,https://i.bytvi.com/domain_icons/investorplace.png,english,[united states of america],[top],ONLY AVAILABLE IN PROFESSIONAL AND CORPORATE PLANS,ONLY AVAILABLE IN PROFESSIONAL AND CORPORATE PLANS,ONLY AVAILABLE IN PROFESSIONAL AND CORPORATE PLANS,ONLY AVAILABLE IN CORPORATE PLANS
4,4b968a20dc2392acf8096ee5567319a9,Wall Street falls after hot inflation report burns hopes for a June rate cut,https://www.latimes.com/business/story/2024-04-10/stock-market-today-wall-street-falls-after-hot-inflation-report-burns-hopes-for-a-june-rate-cut,,[Stan Choe],,U.S. stocks fell on worries that what seemed like a blip in the battle to bring down inflation is turning into a troubling trend.,ONLY AVAILABLE IN PAID PLANS,2024-04-10 21:35:45,,latimes,267,http://www.latimes.com,https://i.bytvi.com/domain_icons/latimes.png,english,[united states of america],[top],ONLY AVAILABLE IN PROFESSIONAL AND CORPORATE PLANS,ONLY AVAILABLE IN PROFESSIONAL AND CORPORATE PLANS,ONLY AVAILABLE IN PROFESSIONAL AND CORPORATE PLANS,ONLY AVAILABLE IN CORPORATE PLANS


## thenewsapi.com pipeline

In [187]:
class newsapiPipeline:
    """
    Handles fetching data from the thenewsapi.com API.

    Typical usage: instantiate the class, optionally call `set_query_params()`
    to choose the endpoint and query parameters, then call `fetch_data()` to
    get the response as a DataFrame.
    """

    # Endpoints accepted by `set_query_params()`.
    _VALID_ENDPOINTS = ('headlines', 'top', 'all', 'similar', 'uuid', 'sources')
    # Endpoints whose URL must end with an article UUID.
    _UUID_ENDPOINTS = ('uuid', 'similar')
    # Free-text fields whose whitespace is normalized.
    _TEXT_COLUMNS = ('title', 'description', 'snippet')
    # Fields converted to tz-naive UTC datetimes.
    _DATE_COLUMNS = ('published_at',)

    # NOTE(review): a real-looking API key is hard-coded as the default below. It
    # is kept only for backward compatibility; consider rotating it and loading
    # the key from an environment variable or a secrets store instead.
    def __init__(self, api_key: str = 'Nr11NvvuUR3VkVqFbWQpDFxdAA9wClTkofsFtQ9l'):
        """
        Initializes the pipeline.

        :param api_key: API key for authentication.
        """
        # Assign the passed API key to an instance variable.
        self.api_key = api_key
        self.base_url = 'https://api.thenewsapi.com/v1/news/'
        # Never write the full credential to the logs.
        _LOG.debug(
            "%s initialized with API key: %s",
            self.__class__.__name__,
            self._mask_secret(api_key),
        )
        # Start from the default endpoint and query so that `fetch_data()` works
        # even when `set_query_params()` is never called explicitly.
        self.set_query_params()

    @staticmethod
    def _mask_secret(secret: str) -> str:
        """
        Return a loggable form of a secret: its first 4 characters followed by `***`.
        """
        return f"{secret[:4]}***" if secret else "<empty>"

    @staticmethod
    def _clean_text(text: str) -> str:
        """
        Collapse every run of whitespace (including newlines) into a single
        space and strip leading and trailing spaces.
        """
        return re.sub(r"\s+", " ", text).strip()

    def set_query_params(self, endpoint: str = 'all', **kwargs) -> None:
        """
        Specify the API endpoint and the query parameters for fetching data.

        The API key is always added to the query parameters as 'api_token'.

        :param endpoint: The API endpoint to fetch data from.
            - 'headlines' : Get the latest headlines by category.
            - 'top'       : Find live and historical top stories around the world.
            - 'all'       : Find all live and historical articles.
            - 'similar'   : Find similar stories to a specific article based on its UUID.
            - 'uuid'      : Find specific articles by the UUID.
            - 'sources'   : Find sources to use in your news API requests.
        :param **kwargs: The query parameters to be passed. Visit `https://www.thenewsapi.com/documentation`
                         for more information on query params for each endpoint.
        :raises ValueError: If `endpoint` is not one of the supported endpoints.
        """
        # Validate first so that an invalid call leaves the current state untouched.
        if endpoint not in self._VALID_ENDPOINTS:
            raise ValueError(f"Invalid endpoint: {endpoint}")
        self.endpoint = endpoint
        # Fall back to the default query when no parameters are passed.
        if kwargs:
            self.params = dict(kwargs)
            # Pass the API key as a query parameter.
            self.params['api_token'] = self.api_key
        else:
            self.params = {
                'api_token': self.api_key,
                'search': "crypto | cryptocurrency",
            }

    def __process_data(self, response) -> pd.DataFrame:
        """
        Clean and store the JSON response in a DataFrame.

        Text fields are whitespace-normalized in place (the passed response is
        modified) and date fields are converted to tz-naive UTC datetimes.
        Fields that are absent from the response are skipped.

        :param response: JSON response of the API request.
        :return: Dataframe of the JSON response.
        :raises KeyError: If the response has no 'data' key.
        """
        records = response['data']
        for article in records:
            for column in self._TEXT_COLUMNS:
                text = article.get(column)
                # Skip missing, null and non-text values.
                if isinstance(text, str):
                    article[column] = self._clean_text(text)
        # Convert the JSON response into a dataframe.
        df = pd.json_normalize(records)
        # Convert dates to datetime objects.
        for column in self._DATE_COLUMNS:
            if column not in df.columns:
                continue
            # `utc=True` treats naive timestamps as UTC and converts aware ones to
            # UTC (even with mixed offsets); the tz info is then dropped.
            df[column] = pd.to_datetime(df[column], utc=True).dt.tz_convert(None)
        return df

    def fetch_data(self, uuid: str = None, timeout: float = 30.0) -> pd.DataFrame:
        """
        Fetches news data from thenewsapi.com.

        The raw JSON response is kept in `self.json_response`.

        :param uuid: UUID value if `endpoint = 'similar'` or `endpoint = 'uuid'`.
        :param timeout: Seconds to wait for the server before giving up; pass
                        `None` to wait indefinitely.
        :return: Dataframe of the JSON response from the API.
        :raises ValueError: If the endpoint requires a UUID and none is given.
        :raises requests.RequestException: If the request fails or returns an
                                           HTTP error status.
        """
        # Check if the endpoint is 'uuid' or 'similar' and create the URL to fetch data.
        if self.endpoint in self._UUID_ENDPOINTS:
            # Reject both a missing and an empty UUID.
            if not uuid:
                raise ValueError("Enter Valid UUID.")
            url = self.base_url + self.endpoint + '/' + uuid
        else:
            url = self.base_url + self.endpoint
        # Log the URL without its query string: the query carries the API token.
        _LOG.debug("Fetching data from %s", url)
        # Fetch data from the API with the passed parameters.
        try:
            response = requests.get(url=url, params=self.params, timeout=timeout)
            response.raise_for_status()
            self.json_response = response.json()
        except requests.RequestException as e:
            # The error text can embed the full request URL, token included.
            message = str(e)
            if self.api_key:
                message = message.replace(self.api_key, "***")
            _LOG.error("Error fetching data: %s", message)
            raise
        _LOG.info("Data fetched successfully.")
        # Compile the data in a dataframe.
        return self.__process_data(self.json_response)


### Sample data

In [188]:
# Instantiate the thenewsapi.com pipeline with its default API key.
datapipeline_3 = newsapiPipeline()
# No arguments: use the default endpoint and the default query parameters.
datapipeline_3.set_query_params()
# Perform the request and preview the first rows of the resulting dataframe.
df_newsapi = datapipeline_3.fetch_data()
df_newsapi.head()

Unnamed: 0,uuid,title,description,keywords,snippet,url,image_url,language,published_at,source,categories,relevance_score
0,c4cf5c7c-aa6f-4aa9-b20f-e143ef18b545,Cryptocurrency turmoil affects crypto miners,"Cryptocurrency turmoil is affecting crypto miners, who require lots of electricity to operate. Energy costs are up, and demand is down.",,"Cryptocurrency turmoil affects crypto miners Cryptocurrency turmoil is affecting crypto miners, who require lots of electricity to operate. Energy costs are up,...",https://www.npr.org/2022/12/15/1143139773/cryptocurrency-turmoil-affects-crypto-miners,https://media.npr.org/include/images/facebook-default-wide-s1400-c100.jpg,en,2022-12-15 21:19:46,npr.org,"[general, politics]",41.48519
1,c9e1c666-7008-43bd-a5f6-8bb6153b4370,Crypto and cryptocurrency,,,Crypto and cryptocurrency Crypto and cryptocurrency startups are garnering attention in the sphere due to their rapid growth and potential for significant ret...,https://magnetmartpk.medium.com/crypto-and-cryptocurrency-2a176774c72a,https://miro.medium.com/v2/1*m-R_BkNf1Qjr1YbyOIJY2w.png,en,2024-02-14 20:37:06,medium.com,"[tech, science, business]",41.12172
2,1f6a6370-3cae-4024-b567-4ebc216948a9,What Is Staking Cryptocurrency (Crypto)?,"Learn about cryptocurrency staking, how it works, and the pros and cons of this investment.",,"Earning money while you sleep is the dream. And although there are plenty of passive income opportunities in traditional finance, a fairly new income stream th...",https://www.moneycrashers.com/staking-cryptocurrency-meaning/,https://www.moneycrashers.com/wp-content/uploads/2021/10/cropped-moneycrashers-square-logo-large-32x32.png,en,2022-08-03 19:00:00,moneycrashers.com,[business],40.37113


## marketaux.com pipeline

In [217]:
class marketauxPipeline:
    """
    Handles fetching data from the marketaux.com API.
    """
    def __init__(self, api_key: str = 'shRM7CyfkEGsBK83T3IB7YweAiKbzORMZjEWERtu'):
        """
        Intializes the pipeline.

        :param api_key: API key for authentication.
        """
        # Assign the passed API to an instance variable.
        self.api_key = api_key
        self.base_url = 'https://api.marketaux.com/v1/'
        _LOG.debug(f"{self.__class__.__name__} initialized with API key: {api_key}")

    def set_query_params(self, endpoint: str = 'news/all', **kwargs) -> None:
        """
        Specify the query parameters for fetching data. 
        
        :param endpoint: The API endpoint to fetch data from.
            - 'news/all'       : Find all live and historical articles.
            - 'news/similar'   : Find similar stories to a specific article based on its UUID.
            - 'news/uuid'      : Find specific articles by the UUID.
            - 'news/sources'   : Find sources to use in your news API requests.
            - 'entity/stats'   : Get an intraday view of how well entities performed over 
                                 different intervals using this endpoint.
            - 'entity/stats/aggregation'    : Returns an aggregation of entities for a single time frame, 
                                              rather than being broken down by date.
            - 'entity/trending/aggregation' : Use this endpoint to identify trending entities.
            - 'entity/search'               : Use this endpoint to search for all supported entities.
            - 'entity/type/list'            : Use this endpoint to return all supported entity types.
            - 'entity/industry/list'        : Use this endpoint to return all supported entity industries.
        :param **kwargs: The query parameters to be passed. Visit `https://www.marketaux.com/documentation` 
                         for more information on query params for each endpoint.
        """
        # Check if kwargs is empty and assign default values.
        if not kwargs:
            self.params = {
                'api_token': self.api_key,
                'search': "crypto | cryptocurrency"
                 }   
        else:
            self.params = kwargs
            # Pass API key as query paramter.
            self.params['api_token']=self.api_key
         # Assign the input endpoint as an instance variable.
        if endpoint in ['news/all', 'news/similar', 'news/uuid', 'news/sources', 'entity/stats', 
                        'entity/stats/aggregation', 'entity/tending/aggregation', 'entity/search'
                        'entity/type/list', 'entity/industry/list']:
            self.endpoint = endpoint
        else:
            raise ValueError(f"Invalid endpoint: {endpoint}")    
        
    def __process_data(self, response) -> pd.DataFrame:
        """
        Clean and store the JSON response in a DataFrame.
        
        :param response: JSON response of the API request.
        :return: Dataframe of the JSON response.
        """
        # Convert the JSON response into a dataframe.
        df = pd.json_normalize(response['data'])
        # Convert dates to datetime objects.
        for column in ['published_at']:
            df[column] = pd.to_datetime(df[column])
            # Check if datetime objects are tz-aware; if not, localize to UTC before converting.
            if df[column].dt.tz is None:
                df[column] = df[column].dt.tz_localize('UTC')
            df[column] = df[column].dt.tz_convert(None)
        # Unpack column values.
        for column in ['entities']:
            exploded_df = df.explode(column)
            df_list = []
            for idx, row in exploded_df.iterrows():
                # Check if the value is a dictionary and contains 'highlights'.
                if pd.notna(row[column]) and isinstance(row[column], dict) and 'highlights' in row[column]:
                    # Normalize the highlights columns.
                    highlights_df = pd.json_normalize(row[column], 'highlights')
                    # Create a copy of the row and normalize rest of the columns, except highlight.
                    row_data = row[column].copy()
                    del row_data['highlights']
                    row_df = pd.json_normalize(row_data)
                    # Process 'highlights'.
                    if not highlights_df.empty:
                        # Clean the text in highlights.
                        highlights_df['highlight'] = highlights_df['highlight'].str.replace("\n", " ", regex=True)
                        highlights_df['highlight'] = highlights_df['highlight'].str.replace('<.*?>', '', regex=True)
                        highlights_df['highlight'] = highlights_df['highlight'].str.replace(r"\s+", " ", regex=True)
                        highlights_df['highlight'] = highlights_df['highlight'].str.strip()
                        highlights_df = highlights_df.map(lambda x: f'"{x}"')
                        # Change column names to indicate hierarchy.
                        highlights_df.columns = ['highlights.' + col for col in highlights_df.columns]
                        # Combine the row values into a single list for each column.
                        highlights_df = {col: [highlights_df[col].tolist()] for col in highlights_df}
                        highlights_df = pd.DataFrame(highlights_df)
                        # Merge the `row_df` and `highlights_df` to form a complete row.
                        row_df = pd.concat([row_df]*len(highlights_df), ignore_index=True)
                    else:
                        highlights_df = pd.DataFrame(index=[0])
                    combined_df = pd.concat([row_df, highlights_df], axis=1)
                elif pd.notna(row[column]):
                    # Normalize non-dictionary values.
                    combined_df = pd.json_normalize(row[column] if isinstance(row[column], dict) else {})
                else:
                    # Handle NaN or None.
                    combined_df = pd.DataFrame(index=[0])
                combined_df.index = [idx] * len(combined_df)
                df_list.append(combined_df)
            # Combine all the rows into a single dataframe.
            normalized_df = pd.concat(df_list)
            normalized_df = normalized_df.map(lambda x: f'"{x}"' if isinstance(x, str) else x)
            # Group by index and aggregate lists.
            aggregated = normalized_df.groupby(by=normalized_df.index).agg(list)
            # Change column name to highlight hierarchy.
            aggregated.columns = [column + "." + col for col in aggregated.columns]
            df = df.drop(column, axis=1)
            df = df.join(aggregated)
        # Cleaning data.
        for column in ['title', 'description', 'snippet']:
            if df[column] is not None:
                # Replace newline characters with space.
                df[column] = df[column].replace("\n", " ")
                # Replace multiple spaces with a single space.
                df[column] = df[column].replace(r"\s+", " ", regex=True)
                df[column] = df[column].str.replace('<.*?>', '', regex=True)
                # Strip leading and trailing spaces.
                df[column] = df[column].str.strip()
        return df 
        
    def fetch_data(self, uuid: str=None) -> pd.DataFrame:
        """
        Fetches top news data from marketaux.com.
        
        :param uuid: UUID value if `endpoint = 'similar'` or `endpoint = 'uuid'`.
        :return: The JSON response from the API.
        """
        # Check if endpoint if 'uuid' or 'similar' and create the URL to fetch data.
        if self.endpoint in ['uuid', 'similar']:
            if uuid is not None: 
                url = self.base_url + self.endpoint + '/' + uuid
            else:
                raise ValueError("UUID is None.")
        else:
            url = self.base_url + self.endpoint        
        # Fetch data from the API with the passed parameters.
        try:
            response = requests.get(url=url, params=self.params)
            _LOG.debug(f"Fetching data from {response.request.url}")
            response.raise_for_status()
            _LOG.info("Data fetched successfully.")
            self.json_response = response.json()
        except requests.RequestException as e:
            _LOG.error(f"Error fetching data: {e}")
            raise
        df = self.__process_data(self.json_response)
        return df


### sample data

In [218]:
# Sample run of the marketaux pipeline using the default API key.
datapipeline_4 = marketauxPipeline()
# No arguments: endpoint defaults to 'news/all' and the search query to
# "crypto | cryptocurrency".
datapipeline_4.set_query_params()
# Request the data and get the processed response back as a dataframe.
df_marketaux = datapipeline_4.fetch_data()
# Last expression of the cell, so the notebook displays the first rows.
df_marketaux.head()

Unnamed: 0,uuid,title,description,keywords,snippet,url,image_url,language,published_at,source,relevance_score,similar,entities.symbol,entities.name,entities.exchange,entities.exchange_long,entities.country,entities.type,entities.industry,entities.match_score,entities.sentiment_score,entities.highlights.highlight,entities.highlights.sentiment,entities.highlights.highlighted_in
0,c9e1c666-7008-43bd-a5f6-8bb6153b4370,Crypto and cryptocurrency,,,Crypto and cryptocurrency Crypto and cryptocurrency startups are garnering attention in the sphere due to their rapid growth and potential for significant ret...,https://magnetmartpk.medium.com/crypto-and-cryptocurrency-2a176774c72a,https://miro.medium.com/v2/1*m-R_BkNf1Qjr1YbyOIJY2w.png,en,2024-02-14 20:37:06,medium.com,41.12303,[],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan]
1,1f6a6370-3cae-4024-b567-4ebc216948a9,What Is Staking Cryptocurrency (Crypto)?,"Learn about cryptocurrency staking, how it works, and the pros and cons of this investment.",,"Earning money while you sleep is the dream. And although there are plenty of passive income opportunities in traditional finance, a fairly new income stream th...",https://www.moneycrashers.com/staking-cryptocurrency-meaning/,https://www.moneycrashers.com/wp-content/uploads/2021/10/cropped-moneycrashers-square-logo-large-32x32.png,en,2022-08-03 19:00:00,moneycrashers.com,40.37246,[],"[""CC:ETH"", ""CC:ADA"", ""CC:XTZ"", ""CC:ALGO"", ""CC:NEAR""]","[""Ethereum"", ""Cardano"", ""Tezos"", ""Algorand"", ""NEAR Protocol""]","[nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan]","[""global"", ""global"", ""global"", ""global"", ""global""]","[""cryptocurrency"", ""cryptocurrency"", ""cryptocurrency"", ""cryptocurrency"", ""cryptocurrency""]","[""N/A"", ""N/A"", ""N/A"", ""N/A"", ""N/A""]","[81.21008, 81.96448, 80.41106, 77.365746, 78.557976]","[0.6105, 0.705, 0.7292, 0.7292, 0.7292]","[[""Most new cryptocurrencies are running on PoS-style networks, including Ethereum 2.0, which will migrate Ethereum off the old proof-of-work network. Pros & Cons of Staking Cryptocurrency Staking cryptocurrency can help create passive income for investors, but can be complicated."", ""For example; Coinbase offers staking of several crypto through staking pools, including Ethereum 2.0, Cosmo (ATOM), and Cardano (ADA). Users can earn up to 5% APY (sometimes more) on staked crypto, with rewards deposited on a different set schedule per asset."", ""Some of the most popular coins include: Ethereum 2.0 (ETH) Algorand (ALGO) Cardano (ADA) Near Protocol (NEAR) Tezos (XTZ) There are dozens of others available, and CoinMarketCap offers an extensive list of the most popular cryptocurrencies that offer staking. 
How Do I Get Started Staking Crypto?""], [""For example; Coinbase offers staking of several crypto through staking pools, including Ethereum 2.0, Cosmo (ATOM), and Cardano (ADA). Users can earn up to 5% APY (sometimes more) on staked crypto, with rewards deposited on a different set schedule per asset."", ""Some of the most popular coins include: Ethereum 2.0 (ETH) Algorand (ALGO) Cardano (ADA) Near Protocol (NEAR) Tezos (XTZ) There are dozens of others available, and CoinMarketCap offers an extensive list of the most popular cryptocurrencies that offer staking. How Do I Get Started Staking Crypto?""], [""Some of the most popular coins include: Ethereum 2.0 (ETH) Algorand (ALGO) Cardano (ADA) Near Protocol (NEAR) Tezos (XTZ) There are dozens of others available, and CoinMarketCap offers an extensive list of the most popular cryptocurrencies that offer staking. How Do I Get Started Staking Crypto?""], [""Some of the most popular coins include: Ethereum 2.0 (ETH) Algorand (ALGO) Cardano (ADA) Near Protocol (NEAR) Tezos (XTZ) There are dozens of others available, and CoinMarketCap offers an extensive list of the most popular cryptocurrencies that offer staking. How Do I Get Started Staking Crypto?""], [""Some of the most popular coins include: Ethereum 2.0 (ETH) Algorand (ALGO) Cardano (ADA) Near Protocol (NEAR) Tezos (XTZ) There are dozens of others available, and CoinMarketCap offers an extensive list of the most popular cryptocurrencies that offer staking. How Do I Get Started Staking Crypto?""]]","[[""0.4215"", ""0.6808"", ""0.7292""], [""0.6808"", ""0.7292""], [""0.7292""], [""0.7292""], [""0.7292""]]","[[""main_text"", ""main_text"", ""main_text""], [""main_text"", ""main_text""], [""main_text""], [""main_text""], [""main_text""]]"
2,af559e3d-2f89-4b3c-b692-4cd5a599cf0d,Crypto Statistics 2024: Cryptocurrency Facts,,,We have gathered some interesting crypto statistics and facts. This article is mainly based on a survey that the Ontario Securities Commission (OSC) embarked on...,https://www.quantifiedstrategies.com/crypto-statistics/,https://www.quantifiedstrategies.com/wp-content/uploads/2023/09/QS-logo2-500px.png,en,2024-01-28 00:13:57,quantifiedstrategies.com,39.898716,[],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan]
