# DOWNLOAD RAW DATA
Download raw data between a time range

-------
**DATA**:
- INPUT: none (the data is queried directly from the datalake)
- OUTPUT: "data_raw_bigquery.pkl"

## Set the root folder and read env variables

In [1]:
import os
# Make the parent folder of the current working directory the new working
# directory, so that relative paths such as 'config/...' and 'artifacts/...'
# are resolved from there.
# os.path.dirname is used instead of splitting on '\\' so the cell also works
# on Linux/macOS, where the separator is '/' (splitting on '\\' there yields an
# empty path and os.chdir('') fails).
# NOTE: every execution of this cell moves one level up, so run it only once
# per kernel session.
actual_path = os.path.abspath(os.getcwd())
root_path = os.path.dirname(actual_path)
os.chdir(root_path)
print('root path: ', root_path)

root path:  D:\github-mi-repo\Optimization-Industrial-Process


In [2]:
import os
from dotenv import find_dotenv, load_dotenv  # package used in jupyter notebook to read the variables in file .env

# Load the variables declared in the .env file into the environment
load_dotenv(find_dotenv())

# GCP project, read from the environment (empty string when the variable is not set)
PROJECT_GCP = os.getenv("PROJECT_GCP", "")

# Dataset and table of BigQuery where the data is located (empty string when not set)
URI_TABLE_DATALAKE = os.getenv("URI_TABLE_DATALAKE", "")

## RUN

In [3]:
import pandas as pd
import numpy as np
from google.cloud import bigquery
import gcsfs
import pickle
import json

In [4]:
### development

# Use the GCP project read from the environment as the project id of this notebook
PROJECT_ID = PROJECT_GCP
# IPython shell escape: sets the 'project' property of the local gcloud CLI configuration
! gcloud config set project $PROJECT_ID

Updated property [core/project].


### 1. Read the parameters used to download the data

In [5]:
# Read the JSON parameters file (path relative to the working directory).
# The encoding is given explicitly: JSON files are UTF-8, whereas open() would
# otherwise use the platform default (e.g. cp1252 on Windows) and could garble
# non-ASCII characters.
json_params = 'config/params.json'
with open(json_params, 'r', encoding='utf-8') as file:
    params = json.load(file)

In [6]:
# Start and end date of the training data, taken from the 'data_train' section
# of the parameters
train_params = params['blanqueo_santafe_all']['data_train']
start_date = train_params['start_date_train']
end_date = train_params['end_date_train']

print('start_date: ', start_date)
print('end_date: ', end_date)

start_date:  2021-01-01
end_date:  2023-01-01


### 2. Read table master tags
This table contains the list of all features (tags) to retrieve from BigQuery.

In [7]:
# Excel file with the master table of tags
path_master_tags = 'config/MaestroTagsGlobal.xlsx'
master_tags = pd.read_excel(io=path_master_tags)
master_tags.head(n=5)  # preview of the first rows

Unnamed: 0,TAG,TAG_DESCRIPTION,DESCRIPCION,ETAPA,CLASIFICACION,var_calc
0,240FI020A.PNT,prod_total,Producción Total,A,NC,
1,calc_prod_d0,calc_prod_d0,Producción entrada D0 (prod entrada A dezplazada),D0,NC,True
2,240FI020B.PNT,prod_eop,Prod entrada EOP,EOP,NC,
3,240FI108A.PNT,prod_bypass,Producción by pass,D1,NC,
4,calc_prod_d1,calc_prod_d1,Producción entrada D1 (prod entrada A desplazada),D1,NC,True


In [8]:
# Every value of the TAG column, as a plain Python list: these are the features to download
list_features = master_tags.loc[:, 'TAG'].tolist()

### 3. Download the data from the datalake

In [9]:
# define query
# The IN (...) list is built explicitly instead of interpolating
# tuple(list_features): the tuple repr gives "('x',)" for a single tag, "()"
# for no tags and a bare nan for empty cells, none of which is a valid list
# of SQL string literals.
# Every non-null tag is rendered as a single-quoted string literal, with
# backslashes and single quotes escaped; duplicates are dropped (order kept).
tags_to_query = list(dict.fromkeys(str(tag) for tag in list_features if pd.notna(tag)))
if not tags_to_query:
    raise ValueError('list_features is empty: there are no tags to query in the datalake')
tags_sql = ', '.join(
    "'" + tag.replace('\\', '\\\\').replace("'", "\\'") + "'"
    for tag in tags_to_query
)

# Rows of the selected tags between start_date and end_date (both included),
# ordered by timestamp
query = f'''
SELECT DISTINCT
tag_name AS Tag,
tag_value AS PV,
CAST( CONCAT(tag_date, 'T', tag_tm) AS DATETIME ) AS datetime
FROM `{URI_TABLE_DATALAKE}`
where tag_date >= "{start_date}"
and tag_date <= "{end_date}"
and tag_name in ({tags_sql})
ORDER BY datetime asc
'''

In [10]:
# query datalake
def query_bq_to_dataframe(Project,InputQuery):
    '''
    Run a query on BigQuery and return its output as a dataframe.

    Args:
        Project: project passed to bigquery.Client
        InputQuery: text of the query to run

    Returns:
        The dataframe built from the result of the query job.
    '''
    bq_client = bigquery.Client(project=Project)
    # submit the query, take its result and convert it to a dataframe
    return bq_client.query(InputQuery).result().to_dataframe()

# Run the query in the configured project and keep the raw output
data = query_bq_to_dataframe(Project=PROJECT_ID, InputQuery=query)
data

Unnamed: 0,Tag,PV,datetime
0,240FI020A_HRS_EOP.C,46.999200,2021-01-01 00:00:00
1,240FI020A_HRS_DO.C,11.483770,2021-01-01 00:00:00
2,230AIT446.PNT,11.556540,2021-01-01 00:00:00
3,240FIC440.MEAS,0.023512,2021-01-01 00:00:00
4,240FIC236.MEAS,0.000117,2021-01-01 00:00:00
...,...,...,...
13352524,240FI020A_HRS_DO.C,10.446980,2023-01-01 23:55:02
13352525,S276PER002,11.210000,2023-01-01 23:55:02
13352526,S2MAQUINAT07,3.977046,2023-01-01 23:55:02
13352527,240FIC110.MEAS,0.091055,2023-01-01 23:55:02


In [11]:
# save locally
# Create the output folder first: open() does not create missing directories,
# so a missing folder would make the save fail after the download has finished.
output_path = 'artifacts/data/data_raw_bigquery.pkl'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'wb') as handle:
    pickle.dump(data, handle)