# Y Charts Scraper

## Project Description:

The code below scrapes 20 years (2000–2019) of historical annual shares-outstanding data from https://ycharts.com/. The data will be used in a stock data dashboard.

In [1]:
# Import packages
import os
import re
from getpass import getpass
from time import sleep

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

from ipynb.fs.full.Preprocessing_Functions import *

In [2]:
# Import a list of stock tickers
wil_df_fin = pd.read_csv('Data/Wilshire_5000_All_Holdings.csv')

# Check data Summary
print('\n')
# `show_counts` replaces the `null_counts` keyword, which was deprecated in
# pandas 1.2 and removed in pandas 2.0; it forces the non-null counts to be shown.
wil_df_fin.info(show_counts = True)
wil_df_fin.head()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3222 entries, 0 to 3221
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Ticker  3222 non-null   object
 1   Name    3222 non-null   object
dtypes: object(2)
memory usage: 50.5+ KB


Unnamed: 0,Ticker,Name
0,A,Agilent Technologies Inc.
1,AA,Alcoa Corp
2,AAL,American Airlines Group Inc
3,AAME,Atlantic American Corp.
4,AAN,Aarons Company Inc (The)


In [3]:
# Set headers
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36',
           'origin': 'https://ycharts.com',
           'referer': 'https://ycharts.com/login?next=/dashboard/'}

# Set login data
# Credentials are never stored in the notebook: they are taken from the
# YCHARTS_USERNAME / YCHARTS_PASSWORD environment variables, falling back to an
# interactive prompt (the password prompt does not echo) when a variable is unset.
login_data = {'username': os.environ.get('YCHARTS_USERNAME') or input('YCharts username: '),
              'password': os.environ.get('YCHARTS_PASSWORD') or getpass('YCharts password: ')}

# Set login session
session = requests.Session()
url = 'https://ycharts.com/login?next=/dashboard/'
r = session.get(url)
soup = bs(r.content, 'html.parser')

# The login form must carry a CSRF token that is posted back with the credentials;
# fail with a clear message if the page did not contain one.
csrf_input = soup.find('input', attrs = {'name': 'csrfmiddlewaretoken'})
if csrf_input is None:
    raise RuntimeError('No csrfmiddlewaretoken input found on the login page: {}'.format(r))
login_data['csrfmiddlewaretoken'] = csrf_input['value']
r = session.post(url, data = login_data, headers = headers)

# NOTE(review): this only shows the HTTP status of the POST; whether a 200 also
# means the login was accepted is not verified here - confirm before scraping.
print(r)

<Response [200]>


## Shares Outstanding

In [23]:
# Set ticker list
ticker_list = list(wil_df_fin['Ticker'])
# Set df list of different stock tickers
df_list = []
# Tickers whose page request did not succeed, kept for later inspection
failed_tickers = []

# Number of leading table cells read from each page: 21 (date, value) pairs
N_TABLE_CELLS = 42
# Pause between successful requests, in seconds (0 disables the pause)
REQUEST_DELAY_SECONDS = 0


# Defined once, outside the loop, instead of being re-created for every ticker
def altElement(a):
    """Return every second element of the sequence `a`, starting with the first."""
    return a[::2]


# Scrape for stock data
for ticker in log_progress(ticker_list):
    url = 'https://ycharts.com/companies/{}/average_shares_outs_diluted_annual'.format(ticker)
    s = session.get(url)
    if not s.ok:
        # Do not parse an error page as if it held data; remember the ticker instead
        failed_tickers.append(ticker)
        continue
    soup = bs(s.text, 'html.parser')

    # Stripped text of every <td> cell on the page, in document order
    cells = [x.get_text().strip() for x in soup.select('td')]

    # The cells are read as alternating date, value, date, value, ...:
    # even positions go to the date list, odd positions to the value list
    data_date = altElement(cells[0:N_TABLE_CELLS])
    data = altElement(cells[1:N_TABLE_CELLS])

    # Keeps the last four characters of each date string
    date_list = [date[-4:] for date in data_date]

    # Creates column of specific ticker, one entry per date.
    # This deliberately uses its own name: rebinding `ticker_list` here would
    # overwrite the master list of tickers that the loop is iterating over.
    ticker_col = [ticker] * len(date_list)

    # Create data frame
    df = pd.DataFrame(list(zip(ticker_col, date_list, data)), columns = ['ticker', 'date', 'shares_outstanding'])
    # Append data frame to df list
    df_list.append(df)

    # Optional sleep between iterations
    if REQUEST_DELAY_SECONDS:
        sleep(REQUEST_DELAY_SECONDS)

# pd.concat cannot combine an empty list, so report that case explicitly
if not df_list:
    raise RuntimeError('No ticker page could be scraped; check the login session')

# Concatenate stock data
shares_outstanding_df = pd.concat(df_list)

# Print data summary
print('\n')
shares_outstanding_df.info()
shares_outstanding_df.head()

VBox(children=(HTML(value=''), IntProgress(value=0, max=3222)))



<class 'pandas.core.frame.DataFrame'>
Int64Index: 61238 entries, 0 to 20
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   ticker              61238 non-null  object
 1   date                61238 non-null  object
 2   shares_outstanding  61238 non-null  object
dtypes: object(3)
memory usage: 1.9+ MB


Unnamed: 0,ticker,date,shares_outstanding
0,A,2020,312.00M
1,A,2019,318.00M
2,A,2018,325.00M
3,A,2017,326.00M
4,A,2016,329.00M


In [24]:
# Drop the rows dated 2020, keeping every other year
is_not_2020 = shares_outstanding_df['date'].ne('2020')
shares_outstanding_df = shares_outstanding_df[is_not_2020]

# Show a summary of the filtered frame followed by its first rows
print('\n')
shares_outstanding_df.info()
shares_outstanding_df.head()



<class 'pandas.core.frame.DataFrame'>
Int64Index: 58693 entries, 1 to 20
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   ticker              58693 non-null  object
 1   date                58693 non-null  object
 2   shares_outstanding  58693 non-null  object
dtypes: object(3)
memory usage: 1.8+ MB


Unnamed: 0,ticker,date,shares_outstanding
1,A,2019,318.00M
2,A,2018,325.00M
3,A,2017,326.00M
4,A,2016,329.00M
5,A,2015,335.00M


In [25]:
# Write the filtered frame to the project's data directory (row index included)
shares_outstanding_df.to_csv('Data/Shares_Outstanding.csv', index = True)