In [1]:
from __future__ import print_function

from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from google.oauth2.credentials import Credentials
from azure.storage.blob import BlobServiceClient

import pandas as pd
import os.path
import json
from tqdm import tqdm
pd.set_option('display.max_rows', 500)

In [3]:
SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
root_dir =  os.path.abspath(os.path.join(os.getcwd(), "../../.."))
tmp_dir = root_dir + "/.tmp"
cred_dir = root_dir + "/.cred"
data_dir = root_dir + "/data"
site_summary_filename = data_dir + "hourly-site-summary.csv"

## Establish Azure Connection

In [4]:
creds = None
# The file token.json stores the user's access and refresh tokens, and is
# created automatically when the authorization flow completes for the first
# time.
if os.path.exists(cred_dir+'/token.json'):
    creds = Credentials.from_authorized_user_file(cred_dir+'/token.json', SCOPES)
# If there are no (valid) credentials available, let the user log in.
if not creds or not creds.valid:
    if creds and creds.expired and creds.refresh_token:
        creds.refresh(Request())
    else:
        flow = InstalledAppFlow.from_client_secrets_file(
            cred_dir+'/credentials.json', SCOPES)
        creds = flow.run_local_server(port=8001)
    # Save the credentials for the next run
    with open(cred_dir+'/token.json', 'w') as token:
        token.write(creds.to_json())

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=264407607006-2bcpshkj1odgpg3f6p0nnmqm4vlhhf0u.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8001%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.readonly&state=YLUm0ZAWS8txf8GxNpPvfoJfhNyecS&access_type=offline


In [18]:
uploadToAzBlob = True

if uploadToAzBlob:
    azBlobCredFile = cred_dir+'/azblobcred.json'
    if os.path.exists(azBlobCredFile):
        connect_str = ""
        with open(azBlobCredFile, "rb") as f:
            data = json.load(f)
            connect_str = data['connectionstr']
        blob_service_client = BlobServiceClient.from_connection_string(connect_str)
        container_name = "full-half-hourly-raw"
    else:
        print("ERROR: Missing Azure Storage Blob access tokens")
        
    if not (os.path.exists(tmp_dir)):
        os.mkdir(tmp_dir)

In [21]:
# Build the Drive API client

file_df = pd.DataFrame(columns = ["filename", "size", "country", "site_id"])

targetFolderName = 'data_full_half_hourly'
queryPageSize = 512
queryFields = "nextPageToken, files(id, name, size)"

try:
    service = build('drive', 'v3', credentials=creds)
    queryFolder = f"mimeType='application/vnd.google-apps.folder' and name='{targetFolderName}'"
    results = service.files().list(q=queryFolder, pageSize=10, fields="nextPageToken, files(id, name)").execute()
    folders = results.get('files', [])

    if not folders:
        print('No folder found.')
    else:
        folder = folders[0]
        print(f'{folder["name"]}:')
        folder_id = folder["id"]
        query = f"'{folder_id}' in parents and trashed = false"
        results = service.files().list(q=query, pageSize=queryPageSize, fields=queryFields).execute()
        files = results.get('files', [])
        token = results.get('nextPageToken', None)
        
        fileCount = 0
        
        if files:
            for file in files:
                fileName = file["name"]
                fileId   = file["id"]
                fileSize = file["size"]
                station = fileName.split('_')[-1].split(".")[0]
                country = station.split('-')[0]

                print(f'{fileCount+1:3}. {fileName} ({fileSize:>10})')
                
                fileMetadata = {
                    "filename": fileName,
                    "size": fileSize,
                    "country": country,
                    "site_id": station
                }
                file_df=file_df.append(fileMetadata, ignore_index=True)
                fileCount += 1
                
                if(uploadToAzBlob):
                    # Download file from Google Drive to local .tmp drive
                    local_filename = tmp_dir + "/" + fileName
                    if not (os.path.exists(local_filename)):
                        file_data = service.files().get(fileId=fileId, fields='*').execute()
                        request = service.files().get_media(fileId=fileId)
                        file_handle = request.execute()
                        with open(local_filename, "wb") as f:
                            f.write(file_handle)
                
                    # Create a blob client using the local file name as the name for the blob
                    blob_client = blob_service_client.get_blob_client(container=container_name, blob=fileName)
                    with open(local_filename, "rb") as data:
                        blob_client.upload_blob(data)
                    print(f'\t\'{fileName}\' uploaded.')
                    
            ## Get Next Page Content (still buggy)
            #results = service.files().list(q=query, pageToken=token, fields=queryFields).execute()
            #files = results.get('files', [])
            #token = results.get('nextPageToken', None)
            
except HttpError as error:
    print(F'An error occurred: {error}')
    print(F'{error.resp.status}, {error.resp.reason}')
    
file_df.dropna(inplace=True)

data_full_half_hourly:
  1. data_full_half_hourly_raw_v0_1_CN-HaM.csv (  21307369)


  file_df=file_df.append(fileMetadata, ignore_index=True)


ServiceResponseError: ('Connection aborted.', TimeoutError('The write operation timed out'))

In [None]:
os.listdir(tmp_dir)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/jetcalz07/Desktop/MIDS/W210_Capstone/co2-flux-hourly-gpp-modeling/.tmp'

In [None]:
local_filename = tmp_dir + '/data_full_half_hourly_raw_v0_1_AU-How.csv'
df = pd.read_csv(local_filename)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/jetcalz07/Desktop/MIDS/W210_Capstone/co2-flux-hourly-gpp-modeling/.tmp/data_full_half_hourly_raw_v0_1_AU-How.csv'

## Generate Hourly Dataset

In [8]:
## TEMP ##
# Set paths
wd = '/Users/jetcalz07/Desktop/MIDS/W210_Capstone/co2-flux-hourly-gpp-modeling/'
data = wd + 'data/datasets/'
hh_data = data + 'half_hourly/'
code = wd + 'code/src/'

In [41]:
# Loop through directory, loading HH site data
sites = [x for x in os.listdir(hh_data) if 'data_full_half_hourly' in x]

for site in tqdm(sites):
    site_hh = pd.read_csv(hh_data + site)

    # Subset records to only those with TIME_START ending in 00
    site_hh['TIMESTAMP_START'] = site_hh['TIMESTAMP_START'].astype(str)
    site_hr = site_hh.loc[site_hh['TIMESTAMP_START'].str[-2:]=='00', ]

    # Write out to blob storage

100%|██████████| 7/7 [00:02<00:00,  2.75it/s]


In [43]:
site_hh.head()

Unnamed: 0,TIMESTAMP_START,TIMESTAMP_END,TA_F,TA_F_QC,TA_ERA,SW_IN_POT,SW_IN_F,SW_IN_F_QC,SW_IN_ERA,LW_IN_F,...,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen
0,200705250000,200705250030,19.191,0,16.063,0.0,0.0,2,0.0,382.254,...,0.27022,0.0367,0.3364,0.0225,0.0648,0.31,0.1848,0.0751,DBF,Cold
1,200705250030,200705250100,18.453,0,15.127,0.0,0.0,2,0.0,382.254,...,0.27022,0.0367,0.3364,0.0225,0.0648,0.31,0.1848,0.0751,DBF,Cold
2,200705250100,200705250130,17.292,0,14.192,0.0,0.0,2,0.0,373.377,...,0.27022,0.0367,0.3364,0.0225,0.0648,0.31,0.1848,0.0751,DBF,Cold
3,200705250130,200705250200,14.702,0,13.561,0.0,0.0,2,0.0,373.377,...,0.27022,0.0367,0.3364,0.0225,0.0648,0.31,0.1848,0.0751,DBF,Cold
4,200705250200,200705250230,13.206,0,12.931,0.0,0.0,2,0.0,373.377,...,0.27022,0.0367,0.3364,0.0225,0.0648,0.31,0.1848,0.0751,DBF,Cold


## Logic for 2-Hourly Data (if desired)

In [60]:
# Split date
def parse_timestamp(timestamp):
    year = timestamp[0:4]
    month = timestamp[4:6]
    day = timestamp[6:8]
    hour = timestamp[8:10]
    mins = timestamp[10:12]
    return year, month, day, hour, mins
site_hh[['START_YEAR', 'START_MONTH', 'START_DAY', 'START_HOUR', 'START_MINS']] = \
    site_hh['TIMESTAMP_START'].astype(str).apply(parse_timestamp).apply(pd.Series)

# Subset to the even hour start times
site_hh['START_HOUR'] = site_hh['START_HOUR'].astype(int)
site_2hr = site_hh.loc[(site_hh['START_MINS']=='00') & (site_hh['START_HOUR']%2==0), ]

Unnamed: 0,TIMESTAMP_START,TIMESTAMP_END,TA_F,TA_F_QC,TA_ERA,SW_IN_POT,SW_IN_F,SW_IN_F_QC,SW_IN_ERA,LW_IN_F,...,b7,IGBP,koppen,START_YEAR,START_MONTH,START_DAY,START_HALFHOUR,START_TIME,START_HOUR,START_MINS
0,200705250000,200705250030,19.191,0,16.063,0.0,0.0,2,0.0,382.254,...,0.0751,DBF,Cold,2007,5,25,0,0,0,0
1,200705250030,200705250100,18.453,0,15.127,0.0,0.0,2,0.0,382.254,...,0.0751,DBF,Cold,2007,5,25,30,30,0,30
2,200705250100,200705250130,17.292,0,14.192,0.0,0.0,2,0.0,373.377,...,0.0751,DBF,Cold,2007,5,25,100,100,1,0
3,200705250130,200705250200,14.702,0,13.561,0.0,0.0,2,0.0,373.377,...,0.0751,DBF,Cold,2007,5,25,130,130,1,30
4,200705250200,200705250230,13.206,0,12.931,0.0,0.0,2,0.0,373.377,...,0.0751,DBF,Cold,2007,5,25,200,200,2,0
