This is a code test.

In [1]:
import pandas as pd
import numpy as np
import sqlalchemy
import psycopg2
import datetime 
import matplotlib.pyplot as pl
from psycopg2 import sql

from sql_lib import create_conn_and_cur

In [2]:
# Names of the PostgreSQL database and of the table queried below.
database_name = 'US_ELEC'
fname = 'ELEC'
table_name = fname

# SQLAlchemy engine for the same local database.
engine = sqlalchemy.create_engine(
    "postgresql+psycopg2://localhost/{}".format(database_name)
)

# Plain psycopg2 connection in autocommit mode, plus the cursor used for queries.
conn = psycopg2.connect(host='localhost', dbname=database_name)
conn.set_session(autocommit=True)
cur = conn.cursor()


The following cell defines helper functions for building SQL queries and loading the desired columns into a pandas DataFrame.

In [3]:
#Build safe SQL queries that select the desired list of columns in "out_columns".
#Assume we are searching the "name" column for entries with the desired series type, for a particular state,
#as well as generation type.
def safe_sql_query(table_name,
                   out_columns,
                   series_type='Net generation',
                   state='Oregon',
                   gen_type='all',
                   freq='M'):
    """Compose a SELECT statement using psycopg2's ``sql`` building blocks.

    Identifiers (table and column names) and literal values are wrapped in
    ``sql.Identifier`` / ``sql.Literal`` so that quoting is done by psycopg2
    rather than by string concatenation.

    The generated statement selects ``out_columns`` from ``table_name`` for
    rows where the ``name`` column matches all three LIKE patterns below and
    the ``f`` column matches ``freq``.

    Parameters
    ----------
    table_name : str
        Table to select from.
    out_columns : iterable of str
        Column names to return, in order.
    series_type : str
        Used as the pattern ``'<series_type> :%'`` (name starts with it).
    state : str
        Used as the pattern ``'%: <state> :%'``.
    gen_type : str
        Used as the pattern ``'%<gen_type>%'``.
    freq : str
        Value compared against column ``f`` with LIKE.

    Returns
    -------
    The composed query object produced by ``sql.SQL(...).format(...)``;
    it can be passed directly to ``cursor.execute``.

    Notes
    -----
    LIKE wildcards (``%``, ``_``) inside the arguments are not escaped, so
    they act as wildcards in the match.
    """
    # One LIKE pattern per component of the series name.
    name_patterns = [
        sql.Literal(series_type + ' :%'),
        sql.Literal('%: ' + state + ' :%'),
        sql.Literal('%' + gen_type + '%'),
    ]
    # Joined so that, after the leading "name LIKE" in the template below,
    # every pattern becomes its own "name LIKE <pattern>" clause.
    like_query = sql.SQL(' AND name LIKE ').join(name_patterns)

    # Full query: selected columns, table, name patterns and frequency filter.
    q1 = sql.SQL("SELECT {} FROM {} WHERE (name LIKE {} AND f LIKE {}) ").format(
        sql.SQL(' ,').join(map(sql.Identifier, out_columns)),
        sql.Identifier(table_name),
        like_query,
        sql.Literal(freq))

    return q1

#Get a dataframe from SQL database for given psycopg2 cursor,
#with desired output columns.     
#Must select data based on series type, state, and type of generation.
def get_dataframe(cur,
                  out_columns,
                  table="ELEC",
                  series_type='Net Generation',
                  state='Oregon',
                  gen_type='solar',
                  freq='M'):
    """Run the series query on ``cur`` and return the rows as a DataFrame.

    The query is built by ``safe_sql_query`` from the table, the requested
    columns, and the series type / state / generation type / frequency
    filters. The resulting DataFrame has one column per entry of
    ``out_columns``, in the same order.

    NOTE(review): the default ``series_type`` here ('Net Generation') differs
    in capitalisation from the default in ``safe_sql_query``
    ('Net generation'); confirm which spelling the stored names use.
    """
    query = safe_sql_query(table, out_columns, series_type, state, gen_type, freq)
    cur.execute(query)
    rows = cur.fetchall()
    return pd.DataFrame(rows, columns=out_columns)


In [4]:
# q=safe_sql_query(table_name="ELEC",out_columns=('name','series_id'),gen_type='solar')
# print(q.as_string(conn))
# cur.execute(q)

#Try to select data from fields with names such as net generation and desired state.
#Possibly also by electricity source.  Nuclear/Solar/Wind/Gas/Coal etc.  

#Useful fields: Net Generation : state : type
#Also interesting: Average retail price of electricity.  
#Net generation
#Retail sales of electricity
#Revenue

#Can identify useful tags by splitting the name at colons (":").
# Columns to fetch for every matching series.
out_col=('name','data','start','end','f')
# Load the rows matching this series type, state and generation type
# (frequency left at get_dataframe's default) into a DataFrame.
df=get_dataframe(cur,out_col,series_type='Net generation',state='Oregon',gen_type='solar');


In [5]:
#Need to convert dates to date/time
def make_timeindex(df):
    """Build a DatetimeIndex spanning a series' first to last period, inclusive.

    Parameters
    ----------
    df : mapping-like (for example a single DataFrame row)
        Must provide:
        - 'f': frequency code, one of 'M' (monthly), 'Q' (quarterly),
          'A' (annual);
        - 'start', 'end': first and last period as strings, formatted
          'YYYYMM' for 'M', 'YYYYQn' for 'Q' and 'YYYY' for 'A'.

    Returns
    -------
    pandas.DatetimeIndex
        One timestamp per period, each at the first day of that period.

    Raises
    ------
    ValueError
        If 'f' is not one of 'M', 'Q', 'A'.
    """
    interval = df['f']

    # For each code: the start/end strings, their strptime format, and a
    # *start-anchored* range frequency. The end-anchored codes ('M', 'Q', 'A')
    # would drop the final period, because the parsed end timestamp is the
    # first day of that period and so precedes its period-end.
    if interval == 'M':
        start_str = df['start']
        end_str = df['end']
        date_format = '%Y%m'
        range_freq = 'MS'
    elif interval == 'Q':
        # Quarter labels are first rewritten as the quarter's starting month.
        start_str = quarter_to_month(df['start'])
        end_str = quarter_to_month(df['end'])
        date_format = '%Y%m'
        range_freq = 'QS'
    elif interval == 'A':
        start_str = df['start']
        end_str = df['end']
        date_format = '%Y'
        range_freq = 'YS'
    else:
        raise ValueError('Unsupported frequency code: ' + repr(interval))

    start_date = pd.to_datetime(start_str, format=date_format)
    end_date = pd.to_datetime(end_str, format=date_format)
    date_indx = pd.date_range(start_date, end_date, freq=range_freq)

    return date_indx

def quarter_to_month(YearQ):
    """Convert a 'YYYYQn' quarter label into 'YYYYMM' for the quarter's first month.

    Parameters
    ----------
    YearQ : str
        Label whose first four characters are the year and whose last
        character is the quarter number (1-4), e.g. '2001Q3'.

    Returns
    -------
    str
        Year followed by the zero-padded starting month, e.g. '200107'.

    Raises
    ------
    ValueError
        If the last character is not a digit, or the quarter is not in 1-4.
    """
    quarter = int(YearQ[-1])
    # Validate the quarter itself (not the derived month), using boolean
    # `or`: the bitwise `|` binds tighter than the comparisons.
    if quarter < 1 or quarter > 4:
        raise ValueError('Quarter is outside range:' + str(quarter))
    # Quarters 1..4 start in months 1, 4, 7, 10.
    q_start = (quarter - 1) * 3 + 1
    YearM = YearQ[0:4] + str(q_start).zfill(2)
    return YearM

#Convert a list of lists (first sublist entry is the time, second is the data)
#into a pandas timeseries.  Extract the interval from the geoset ID, and use it to construct
#the Period Index.
# def make_df_datetimeindex(series,series_start,series_stop,interval):
#   #make empty series
#   series2=np.asarray(eval(series));
#   dat2=series2[:,1];
#   s=pd.Series(dat2,index=make_timeindex(df))
#   return s

In [23]:
#Initial readin of SQL dataframes returns 'data' as a string of a list of lists.  
#This function goes row by row, converting that 'data' column
#into a new series, with datetimeindex in 'data2'

def convert_df(df):
    """Turn each row's 'data' string into a Series with a DatetimeIndex.

    For every row, the 'data' string is evaluated into a list of
    ``[period, value]`` pairs, the values (second column) are extracted, and
    they are paired with the index returned by ``make_timeindex`` for that
    row. Rows whose value count differs from the index length are reported
    on stdout and skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows are looked up with ``df.loc[i, ...]`` for ``i`` in
        ``range(len(df))``; each row must provide 'data' plus whatever
        ``make_timeindex`` reads from it.

    Returns
    -------
    list of pandas.Series
        One Series per successfully converted row (empty list for an empty
        frame).
    """
    data_array = []
    for i in range(len(df)):
        print('Making', i, 'dataset')
        # The stored value is the string form of a list of lists.
        # NOTE(review): eval() executes whatever the string contains; only
        # safe while the database content is trusted.
        init_series = np.asarray(eval(df.loc[i, 'data']))
        dat2 = init_series[:, 1]
        timeindex = make_timeindex(df.loc[i])
        # NOTE(review): values are paired with the index positionally, which
        # assumes 'data' is ordered from 'start' to 'end' - TODO confirm.
        if len(dat2) != len(timeindex):
            print('Unequal lengths')
            print(len(dat2), len(timeindex))
        else:
            data_array.append(pd.Series(dat2, index=timeindex))

    # Return only after every row has been processed.
    return data_array

    
# Make a Period Index - really, really easy.
#But plotting is limited with "Periods".  It seems only
#"DateTimeIndices" allow easy combinations.  
def convert_df_period(df):
    """Turn each row's 'data' string into a Series with a PeriodIndex.

    For every row the 'data' string is evaluated into a 2-column array; the
    first column supplies the period labels and the second the values. The
    row's 'f' entry is used as the PeriodIndex frequency. A progress line is
    printed per row.

    Returns a list with one Series per row, in row order.
    """
    series_list = []
    for row in range(len(df)):
        print('Making', row, 'dataset')
        # The stored value is the string form of a list of lists.
        # NOTE(review): eval() executes the string; only safe for trusted data.
        parsed = np.asarray(eval(df.loc[row, 'data']))
        labels, values = parsed[:, 0], parsed[:, 1]
        index = pd.PeriodIndex(labels, freq=df.loc[row, 'f'])
        series_list.append(pd.Series(values, index=index))

    return series_list

In [17]:

# Scratch check on the first row: parse its 'data' string into an array, then
# build a PeriodIndex from the first column using that row's frequency code.
# NOTE(review): eval() executes the stored string; only safe for trusted data.
s0=np.asarray(eval(df.loc[0,'data']))
i0=pd.PeriodIndex(s0[:,0],freq=df.loc[0,'f'])

In [24]:
# Convert every row of df into a Series indexed by a PeriodIndex.
d0=convert_df_period(df)

Making 0 dataset
Making 1 dataset
Making 2 dataset
Making 3 dataset
Making 4 dataset
Making 5 dataset
Making 6 dataset
Making 7 dataset
Making 8 dataset
Making 9 dataset
Making 10 dataset


In [None]:
# Shape of the row of df selected by index label 0.
df.loc[0].shape