In [147]:
import requests
from bs4 import BeautifulSoup
import calendar
import pandas as pd
import math
from re import search
import random
import time

#--functions------------------------------------------------------------------------------------

def build_empty_df_for_month(m_int,y_int):
    """Create the empty hourly event table for one month.

    The returned frame has one row per day of the month, labelled by
    month_day_yearStr (e.g. Jan1.2019), and 24 columns labelled 0-23, one per
    hour. No cell values are supplied, so every cell starts out missing.
    """
    _, days_in_month = calendar.monthrange(y_int, m_int)  # second item is the day count
    hour_columns = pd.Series(range(0, 24))
    row_labels = [
        month_day_yearStr(m_int, day_number, y_int)
        for day_number in range(1, days_in_month + 1)
    ]
    return pd.DataFrame(index=row_labels, columns=hour_columns)

def conv_ampm_to_twentyFourHrString(string1):
    """Convert a 12-hour clock string to a 24-hour clock string.

    Examples: 11:00pm --> 23:00, 11:00am --> 11:00, 12:00pm --> 12:00,
    12:00am --> 0:00.

    Parameters
    ----------
    string1 : str
        Time ending in "am" or "pm", e.g. "3:15pm". Surrounding whitespace is
        ignored.

    Returns
    -------
    str
        The time without the am/pm suffix and with the hour on a 24-hour clock.
        The hour is not zero-padded when it has to be rewritten.

    Raises
    ------
    ValueError
        If the string does not end in "am" or "pm", or the hour is not a number.
    """
    text = string1.strip()
    if text.endswith("am"):
        is_pm = False
    elif text.endswith("pm"):
        is_pm = True
    else:
        raise ValueError('am or pm time not provided to conv_to_twentyFourHr')

    clock = text[:-2].strip()  # drop the am/pm suffix
    hour_text, separator, minute_text = clock.partition(':')
    hour = int(hour_text)

    if is_pm and hour != 12:
        hour += 12  # 1pm-11pm become 13-23; 12pm is already noon
    elif not is_pm and hour == 12:
        hour = 0  # 12am is midnight, i.e. hour 0, not hour 12
    else:
        return clock  # hour is already correct on a 24-hour clock
    return str(hour) + separator + minute_text

def fill_df_forMonth_wEvents(m_int,y_int):
    """Scrape every day of a month and return the hourly currency-event table.

    Parameters
    ----------
    m_int : int
        Month number, 1-12.
    y_int : int
        Year.

    Returns
    -------
    pandas.DataFrame
        One row per day (labels from month_day_yearStr) and one column per hour
        (0-23). A cell stays missing when no event was recorded for that hour;
        otherwise it holds the currencies with an event in that hour, joined by
        commas, each currency listed once.
    """
    df_fill = build_empty_df_for_month(m_int,y_int)  # empty table for the month
    num = calendar.monthrange(y_int,m_int)[1]  # number of days in the month

    for d_int in range(1,num+1):
        fill_events = webscrape_forexFactory_page(m_int,d_int,y_int)  # events for one day
        md_string = month_day_yearStr(m_int,d_int,y_int)  # row label for this day

        if fill_events is not None:
            # Row 0 of the scraped frame holds the times, row 1 the currencies;
            # each column is one event.
            for ff in range(len(fill_events.columns)):
                fill_hr = int(fill_events.iloc[0, ff].split(':')[0])  # hour part of "H:MM"
                fill_curr = fill_events.iloc[1, ff]
                recorded = df_fill.at[md_string,fill_hr]
                if pd.isna(recorded):
                    df_fill.loc[md_string,fill_hr] = fill_curr
                elif fill_curr not in recorded:  # currency not recorded for this hour yet
                    df_fill.loc[md_string,fill_hr] = recorded + ',' + fill_curr

        # Pause after every request, including days that returned no events,
        # so consecutive page fetches are always spaced out.
        time.sleep( random.uniform(0.21,3.18) )

    return df_fill

def monthInt_toStr(m_int):
    """Convert a month number to its three-letter English abbreviation.

    The abbreviation is used to build both the calendar page URL and the row
    labels of the monthly tables.

    Parameters
    ----------
    m_int : int
        Month number, 1-12.

    Returns
    -------
    str
        'Jan', 'Feb', ..., 'Dec'.

    Raises
    ------
    ValueError
        If m_int is not an int in the range 1-12.
    """
    # Every entry is three letters; June used to be spelled 'June', so tables
    # saved before this change carry 'June<day>.<year>' row labels.
    month_abbreviations = ('Jan','Feb','Mar','Apr','May','Jun',
                           'Jul','Aug','Sep','Oct','Nov','Dec')
    if not ( isinstance(m_int,int) and 1 <= m_int <= 12 ):
        raise ValueError("month must be an integer of value 1-12")
    return month_abbreviations[m_int - 1]

def month_day_yearStr(month_int,day_int,year_int):
    """Build a date label of the form <Mon><day>.<year>, e.g. Jan5.2020."""
    pieces = [monthInt_toStr(month_int), str(day_int), '.', str(year_int)]
    return "".join(pieces)

def webscrape_forexFactory_page(websc_month,websc_day,websc_year):
    """Scrape www.forexfactory.com/calendar for currency events for a particular day.

    Parameters
    ----------
    websc_month : int
        Month number, 1-12.
    websc_day : int
        Day of the month.
    websc_year : int
        Year.

    Returns
    -------
    pandas.DataFrame or None
        A two-row frame with one column per event: row 0 holds the event time
        as a 24-hour string, row 1 the currency code. None is returned as soon
        as a calendar row has no currency and no day-type time marker, which is
        treated as "no events for that day".

    Raises
    ------
    ValueError
        If the month is invalid or a time cell is not an am/pm time.
    requests.exceptions.RequestException
        If the request fails or times out.
    """
    request_timeout_s = 30  # do not let one stalled request hang a month-long scrape

    #--requests header
    accept='text/html, application/xhtml+xml, application/xml; q=0.9, */*; q=0.8'
    # 'br' is deliberately not advertised: requests can only decode brotli when
    # an optional package is installed, while gzip/deflate are always decodable.
    acceptEncoding='gzip, deflate'
    acceptLanguage='en-US, en; q=0.5'
    #cookie offset to 0 for GMT time
    # NOTE(review): this is a captured browser session cookie and it contains
    # both fftimezoneoffset=-6 and fftimezoneoffset=0 - confirm which one the
    # site honours and consider moving the cookie out of the notebook.
    cookie='ffsessionhash=2632135f2eb04c765c611ae5d04a5ff3; ffverifytimes=1; fflastactivity=0; fftab-history=calendar; ffsettingshash=c1cc05f4bc2c72fba054db520e06352e; fflastvisit=1585371911; fftimeformat=0; ffdstonoff=1; fftimezoneoffset=-6; fftimezoneoffset=0; ffadon=1; __gads=ID=301618e0d651417d:T=1585371912:S=ALNI_MbN44mE7AAy1nrXhmvqaG7M_7vzig; _gid=GA1.2.69885615.1585371918; _ga=GA1.2.1633801945.1585371918; _gat_gtag_UA_3311429_1=1'
    useragent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363'
    host='www.forexfactory.com'
    connection='close'
    upgrade='1'
    head = {'User-Agent': useragent,'Host':host,'Connection':connection,'Accept-Encoding':acceptEncoding,'Accept-Language':acceptLanguage,'Accept':accept,'Cookie':cookie,'Upgrade-Insecure-Requests':upgrade}

    websc = month_day_yearStr(websc_month,websc_day,websc_year) #date string for url
    webstring="https://www.forexfactory.com/calendar?day="+websc
    print("webstring=",webstring)

    # headers must be passed by keyword: the second positional argument of
    # requests.get is `params`, which would append the dict to the query string
    # instead of sending it as HTTP headers.
    page = requests.get(webstring,headers=head,timeout=request_timeout_s)
    soup = BeautifulSoup(page.content,"html.parser")
    table = soup.find_all("tr",{"class":"calendar_row"})

    curr = []
    times = []
    timeString="" # last explicit time seen; rows with a blank time cell reuse it
    for item in table:
        currency=item.find_all("td", {"class":"calendar__currency"})[0].text
        currency=currency.strip('\n').replace(" ","")
        time_entry=item.find_all("td", {"class":"calendar__time"})[0].text

        if( 'Day' in time_entry or 'Data' in time_entry ): #e.g. "All Day", or "Day 1" for elections
            time_entry='00:00' #day-type events are filed at the start of the day
        elif( len(currency)>0 and len(time_entry)==0 ): #blank time cell
            time_entry=timeString #same time as the previous row
        elif( len(currency)>0 and len(time_entry)>0 ):
            time_entry=conv_ampm_to_twentyFourHrString(time_entry)
            timeString=time_entry
        else:
            return None #no currency on the row: no events for that day
        curr.append(currency)
        times.append(time_entry)
    return pd.DataFrame([times,curr])

#-------------------------------------------------------------------


#page = requests.get("https://www.forexfactory.com/calendar.php?day=Mar22.2020",head) #sample page
#content = page.content
#soup = BeautifulSoup(content,"html.parser")
#table = soup.find_all("tr",{"class":"calendar_row"})
#time=99
#for item in table:
#    currency=item.find_all("td", {"class":"calendar__currency"})[0].text #currency
#    time_entry=item.find_all("td", {"class":"calendar__time"})[0].text #time (may have more than one entry)
#    print( "currency="+currency ) #Currency
#    print( "time_entry=",time_entry ) #Time Eastern

Append some data and keep appending

In [44]:
# Start from an empty log, then scrape one sample day (6 May 2019).
df_events_log = pd.DataFrame()
a = webscrape_forexFactory_page(websc_month=5, websc_day=6, websc_year=2019)

webstring= https://www.forexfactory.com/calendar?day=May6.2019


In [45]:
# Preview the log with the scraped day added. The result is only printed, so
# df_events_log itself is left unchanged. pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0.
print(pd.concat([df_events_log, a]))

      0      1     2     3     4     5     6     7     8      9     10     11  \
0  3:00  00:00  3:15  3:45  3:50  3:55  4:00  4:30  5:00  13:45  14:00  18:30   
1   EUR    GBP   EUR   EUR   EUR   EUR   EUR   EUR   EUR    CAD    USD    AUD   

      12     13     14     15  
0  20:30  21:30  21:30  23:00  
1    JPY    AUD    AUD    NZD  


In [4]:
# Scrape the same sample day again (6 May 2019).
b = webscrape_forexFactory_page(websc_month=5, websc_day=6, websc_year=2019)

In [5]:
# Add the scraped rows to the log. pd.concat replaces DataFrame.append, which
# was removed in pandas 2.0. Re-running this cell adds the rows again.
df_events_log = pd.concat([df_events_log, b])

In [12]:
# Show the accumulated log: row 0 holds event times, row 1 the currencies.
print(df_events_log)

      0      1     2     3     4     5     6     7     8      9     10     11  \
0  3:00  00:00  3:15  3:45  3:50  3:55  4:00  4:30  5:00  13:45  14:00  18:30   
1   EUR    GBP   EUR   EUR   EUR   EUR   EUR   EUR   EUR    CAD    USD    AUD   

      12     13     14     15  
0  20:30  21:30  21:30  23:00  
1    JPY    AUD    AUD    NZD  


Save this data to a '.csv' file

In [6]:
# Write only the cell values: no column header row and no index column.
df_events_log.to_csv(path_or_buf='df_events_log.csv', index=False, header=False)

Looping over February 2019

In [148]:
# Scrape every day of February 2019 into one day-by-hour table.
df_test = fill_df_forMonth_wEvents(m_int=2, y_int=2019)

webstring= https://www.forexfactory.com/calendar?day=Feb1.2019
webstring= https://www.forexfactory.com/calendar?day=Feb2.2019
webstring= https://www.forexfactory.com/calendar?day=Feb3.2019
webstring= https://www.forexfactory.com/calendar?day=Feb4.2019
webstring= https://www.forexfactory.com/calendar?day=Feb5.2019
webstring= https://www.forexfactory.com/calendar?day=Feb6.2019
webstring= https://www.forexfactory.com/calendar?day=Feb7.2019
webstring= https://www.forexfactory.com/calendar?day=Feb8.2019
webstring= https://www.forexfactory.com/calendar?day=Feb9.2019
webstring= https://www.forexfactory.com/calendar?day=Feb10.2019
webstring= https://www.forexfactory.com/calendar?day=Feb11.2019
webstring= https://www.forexfactory.com/calendar?day=Feb12.2019
webstring= https://www.forexfactory.com/calendar?day=Feb13.2019
webstring= https://www.forexfactory.com/calendar?day=Feb14.2019
webstring= https://www.forexfactory.com/calendar?day=Feb15.2019
webstring= https://www.forexfactory.com/calendar?

In [149]:
# Save the month table, keeping the hour header row and the day labels so the
# file can be read back with index_col=0.
df_test.to_csv(path_or_buf='Feb.2019.csv', index=True, header=True)

In [134]:
#read data back in
# NOTE(review): absolute, machine-specific path - point myDir at the folder
# that holds the saved CSV files before running this cell.
myDir='C:/Users/cx5313nj/OneDrive - MNSCU/Desktop/coding2020/webscraping/'
fullPath = myDir+'Apr.2019.csv'
df_read=pd.read_csv(fullPath,index_col=0)
# NOTE(review): the file is named Apr.2019 but the row looked up is May1.2019 -
# confirm which month this file actually holds.
# Column labels come back from the CSV as strings, so the first hour column is
# selected by position with .iloc rather than with an integer key.
print(df_read.loc['May1.2019'].iloc[0])

CHF,EUR,USD


In [150]:
# Show the scraped month table: one row per day, one column per hour.
print(df_test)

                 0    1        2        3        4        5        6    7   \
Feb1.2019       USD  AUD      CHF  CHF,EUR  EUR,CHF  EUR,GBP      EUR  NaN   
Feb2.2019       NaN  NaN      NaN      NaN      NaN      NaN      NaN  NaN   
Feb3.2019       CNY  NaN      NaN      NaN      NaN      NaN      NaN  NaN   
Feb4.2019       CNY  NaN      NaN      NaN      EUR  EUR,GBP      EUR  NaN   
Feb5.2019   NZD,CNY  NaN      NaN      NaN      EUR  EUR,GBP      EUR  NaN   
Feb6.2019       CNY  NaN      NaN      EUR      NaN      NaN      NaN  NaN   
Feb7.2019       CNY  JPY      NaN      EUR  CHF,GBP      EUR      EUR  NaN   
Feb8.2019       NaN  JPY      CHF      EUR      NaN      EUR      NaN  NaN   
Feb9.2019       NaN  NaN      NaN      NaN      NaN      NaN      NaN  NaN   
Feb10.2019      JPY  NaN      NaN      NaN      NaN      NaN      NaN  NaN   
Feb11.2019      EUR  NaN      NaN      CHF      NaN      GBP      NaN  NaN   
Feb12.2019      EUR  JPY      NaN      NaN      EUR      NaN    

In [146]:
# Scratch check: range(1,30) yields 1 through 29 (the stop value is excluded).
for ii in range(1,30):
    print(ii)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
