In [2]:
import csv # this helps us work with csv files
import datetime # helps us to work with dates and times in different formats
import io # lets us treat downloaded text as an in-memory file
import os # helps us talk to the operating system command line
from calendar import monthrange

import numpy as np # numpy is something like a matlab replacement for python.  Numeric and scientific computing.
import pandas as pd # we'll learn more about this soon
import requests # this is a really useful library for pulling data from the web

In [3]:
# Location and naming scheme of the daily report files that get downloaded.
_CAISO_BASE_URL = 'http://content.caiso.com/green/renewrpt/'
_CAISO_URL_TAIL = '_DailyRenewablesWatch.txt'
_CAISO_TIMEOUT_SECONDS = 30  # never let one stalled download hang the whole scrape

# Row positions of the two tables inside a report, counted the way pandas
# numbers the rows once the first line of the file has been consumed as header.
# NOTE(review): these offsets are inherited from the previous implementation and
# assume the report layout has not changed -- confirm against a current file.
_RENEWABLES_HEADER_ROW = 0
_PRODUCTION_HEADER_ROW = 28
_HOURS_PER_DAY = 24


def _caiso_cells(line):
    """Return the non-empty, whitespace-stripped cells of one comma-joined report line."""
    return [cell.strip() for cell in str(line).split(',') if cell.strip()]


def _parse_caiso_report(report_text, day, production):
    """Turn the text of one daily report into a DataFrame of 24 hourly rows.

    report_text -- full text of the downloaded report
    day -- datetime.date the report belongs to (used to build the index)
    production -- False selects the table starting at row 0,
                  True the table starting at row 28

    Raises ValueError when the selected table is missing, has no 'Hour'
    column, or an hourly row holds fewer values than there are columns.
    """
    # Tabs are turned into commas and the text is then read with a tab
    # delimiter, so every line of the report lands in a single cell. read_csv is
    # kept (rather than a plain splitlines) so that header and blank-line
    # handling, and therefore the row offsets above, stay exactly as before.
    table = pd.read_csv(io.StringIO(report_text.replace('\t', ',')), delimiter='\t')

    header_row = _PRODUCTION_HEADER_ROW if production else _RENEWABLES_HEADER_ROW
    section = table.iloc[header_row:header_row + _HOURS_PER_DAY + 1, 0]
    if len(section) != _HOURS_PER_DAY + 1:
        raise ValueError('CAISO report for {} is too short to contain the requested table'.format(day))

    columns = _caiso_cells(section.iloc[0])
    if 'Hour' not in columns:
        raise ValueError('CAISO report for {} has no "Hour" header at row {}'.format(day, header_row))

    hourly_rows = []
    for line in section.iloc[1:]:
        cells = _caiso_cells(line)
        if len(cells) < len(columns):
            raise ValueError('CAISO report for {} has an hourly row with only {} of {} values'.format(
                day, len(cells), len(columns)))
        # Values are matched to columns by order; surplus trailing cells are ignored.
        hourly_rows.append(cells[:len(columns)])

    # The report's own hour labels are not used: the rows are stamped 00:00..23:00 in order.
    index = [datetime.datetime(day.year, day.month, day.day, hour) for hour in range(_HOURS_PER_DAY)]
    frame = pd.DataFrame(hourly_rows, columns=columns, index=index).drop('Hour', axis=1)

    # Convert every column to numbers in one go so all 24 rows share a dtype;
    # cells that are not numeric become NaN instead of shifting the row.
    return frame.apply(pd.to_numeric, errors='coerce')


def CAISOrenewables(start_year, start_month, start_day, end_year, end_month, end_day, production = False, matrix = False):
    """Scrape CAISO's daily renewable watch .txt files and
    convert them to a DataFrame or Numpy record array. Every day from the
    start date through the end date (both inclusive) is downloaded; the
    range may span several months or years.

    Keyword arguments:
    start_year -- year of the first date to scrape
    start_month -- month of the first date to scrape
    start_day -- day of month of the first date to scrape
    end_year -- year of the last date to scrape
    end_month -- month of the last date to scrape
    end_day -- day of month of the last date to scrape
    production -- If False, will collect hourly breakdown of renewable resources.
                  If True, will scrape hourly breakdown of total production by resource type.
    matrix -- If False, function will return a Pandas DataFrame
              If True, will return numpy recarray

    Returns one row per hour, indexed by datetime (hours 0-23 of each day),
    with one numeric column per resource; the 'Hour' column of the report is
    dropped. An empty result is returned when the start date is after the
    end date.

    Raises ValueError if either date is not a valid calendar date or a report
    cannot be parsed, and whatever requests raises for a failed download
    (including HTTP error statuses).
    """
    first_day = datetime.date(start_year, start_month, start_day)
    last_day = datetime.date(end_year, end_month, end_day)

    # Collect one frame per day and concatenate once at the end; growing a
    # DataFrame inside the loop copies all previous rows on every iteration.
    daily_frames = []
    day = first_day
    while day <= last_day:
        url = _CAISO_BASE_URL + day.strftime('%Y%m%d') + _CAISO_URL_TAIL
        response = requests.get(url, timeout=_CAISO_TIMEOUT_SECONDS)
        # Fail here on 404/500 instead of trying to parse an error page as data.
        response.raise_for_status()
        daily_frames.append(_parse_caiso_report(response.text, day, production))
        day += datetime.timedelta(days=1)

    rv = pd.concat(daily_frames) if daily_frames else pd.DataFrame()

    if matrix:
        return rv.to_records(index=True)

    return rv

In [4]:
# Pull the hourly renewable breakdown for every day from 2017-08-29 through
# 2018-08-28 (inclusive), then preview the first few rows.
caiso_data = CAISOrenewables(
    start_year=2017, start_month=8, start_day=29,
    end_year=2018, end_month=8, end_day=28,
)
caiso_data.head()

Unnamed: 0,GEOTHERMAL,BIOMASS,BIOGAS,SMALL HYDRO,WIND TOTAL,SOLAR PV,SOLAR THERMAL
2017-08-29 00:00:00,1181,340,156,324,1551,0,0
2017-08-29 01:00:00,1182,338,156,326,1556,0,0
2017-08-29 02:00:00,1183,337,156,337,1325,0,0
2017-08-29 03:00:00,1185,339,156,313,1158,0,0
2017-08-29 04:00:00,1190,344,156,320,1209,0,0
