In [1]:
import string
import re
import pandas as pd
import os

from utils import basic
from db.get_db_data import GetTableData
from db.config import Config
# Project configuration object; its `queries` lookup supplies the query file paths used below.
_Config = Config()

# setup db connection
# NOTE(review): conn_cur_list presumably holds (connection, cursor) — confirm in utils.basic.
conn_cur_list = basic.setup_db_connection()
# Read the text of the query registered under the "get_sta_tables" key.
query_path = str(_Config.queries["get_sta_tables"])
query_string = basic.read_query_file(query_path)

# Create the table-data helper from the first two entries of conn_cur_list,
# then run the query and keep the result (a `table_name` column) as a DataFrame.
_GetTableData = GetTableData(conn_cur_list[0], conn_cur_list[1])
sta_tables_df = _GetTableData.create_pandas_table(query_string)

print(sta_tables_df.head())

                       table_name
0   sta_euro_dollar_exchange_rate
1                sta_copper_price
2              sta_usa_bond_2year
3  sta_1_year_treasury_rate_yield
4        sta_capacity_utilization


In [2]:
#test_table = sta_tables_df.iloc[1]

In [3]:
# Rebind query_path/query_string (previously the table-list query) to the
# "select_table_sample" template. Its __tablename__ and __limit__ placeholders
# are substituted later by get_sta_dateformats.
query_path = str(_Config.queries["select_table_sample"])
query_string = basic.read_query_file(query_path)
query_string

'select * from __tablename__ order by RANDOM() limit __limit__;'

In [4]:
def get_sta_dateformats(sta_table, limit=1000):
    """Return the most frequent date format in a random sample of a table.

    The module-level ``query_string`` template is filled in with the table
    name and the sample size, the query is run through
    ``_GetTableData.create_pandas_table``, and every value in the first
    column of the sample is classified with ``basic.string_to_sql_type``.

    ``query_string`` is read at call time, so it must hold the
    ``select_table_sample`` template (with its ``__tablename__`` and
    ``__limit__`` placeholders) when this function is called.

    Parameters
    ----------
    sta_table : str
        Table name substituted for ``__tablename__``.
    limit : int, optional
        Number of rows to sample, substituted for ``__limit__``.
        Defaults to 1000.

    Returns
    -------
    The classification label that occurs most often in the sample, or
    ``None`` when the sample is empty or no label could be counted.
    """
    # NOTE(review): the query is assembled by plain string substitution, so
    # `sta_table` must come from a trusted source — confirm for any new caller.
    query = (
        query_string
        .replace("__tablename__", sta_table)
        .replace("__limit__", str(limit))
    )
    sample_df = _GetTableData.create_pandas_table(query)

    # Nothing to classify: return None instead of failing with IndexError below.
    if sample_df.empty:
        return None

    # determine the date format of every cell in the first column of the sample
    datetypes = sample_df.iloc[:, 0].apply(
        lambda x: basic.string_to_sql_type(str(x))
    )

    # value_counts() sorts by descending frequency, so the first index entry
    # is the most common label; it is empty when every label is missing.
    counts = datetypes.value_counts()
    if counts.empty:
        return None
    return counts.index[0]

In [5]:
# Detect the majority date format of every listed table (one sample query per table).
sta_tables_df["datetype"] = sta_tables_df["table_name"].apply(get_sta_dateformats)

In [6]:
# Display each table name next to the date format detected for it.
sta_tables_df

Unnamed: 0,table_name,datetype
0,sta_euro_dollar_exchange_rate,date__M/D/YYYY
1,sta_copper_price,date__M/D/YYYY
2,sta_usa_bond_2year,"date__Mon DD, YYYY"
3,sta_1_year_treasury_rate_yield,date__M/D/YYYY
4,sta_capacity_utilization,date__M/D/YYYY
5,sta_futures_vix,"date__Mon DD, YYYY"
6,sta_heating_oil_prices,date__M/D/YYYY
7,sta_industrial_production,date__M/D/YYYY
8,sta_palladium_prices,date__M/D/YYYY
9,sta_sp500,date__YYYY-MM-DD


In [27]:
# retrieve one M/D/YYYY table as pd dataframe

# NOTE(review): this import sits mid-notebook rather than with the imports in the first cell.
from db.get_dbtable_data import get_dbtable_data

# Load "sta_sugar_prices"; the displayed result has `date` and `value` columns.
test_dm_table = get_dbtable_data("sta_sugar_prices")
test_dm_table

Unnamed: 0,date,value
0,11/29/1962,0.0389
1,11/30/1962,0.0384
2,12/3/1962,0.0387
3,12/4/1962,0.038
4,12/5/1962,0.0375
...,...,...
14531,12/28/2020,
14532,12/29/2020,
14533,12/30/2020,
14534,12/31/2020,


In [28]:
# Split the first column (the date strings) on "/" into a new data frame with
# one column per component: 0 = month, 1 = day, 2 = year.
new = test_dm_table.iloc[:, 0].str.split("/", n=3, expand=True)
new



Unnamed: 0,0,1,2
0,11,29,1962
1,11,30,1962
2,12,3,1962
3,12,4,1962
4,12,5,1962
...,...,...,...
14531,12,28,2020
14532,12,29,2020
14533,12,30,2020
14534,12,31,2020


In [29]:
# Copy the split components out of `new` into named columns on test_dm_table:
# column 0 -> month, column 1 -> day, column 2 -> year.
for part_position, part_name in enumerate(("month", "day", "year")):
    test_dm_table[part_name] = new[part_position]
test_dm_table

Unnamed: 0,date,value,month,day,year
0,11/29/1962,0.0389,11,29,1962
1,11/30/1962,0.0384,11,30,1962
2,12/3/1962,0.0387,12,3,1962
3,12/4/1962,0.038,12,4,1962
4,12/5/1962,0.0375,12,5,1962
...,...,...,...,...,...
14531,12/28/2020,,12,28,2020
14532,12/29/2020,,12,29,2020
14533,12/30/2020,,12,30,2020
14534,12/31/2020,,12,31,2020


In [30]:
def fill_datecol_with_zeros(day_o_month):
    """Left-pad a one-character day or month string with a single '0'.

    Strings of any other length are returned unchanged.
    """
    is_single_char = len(day_o_month) == 1
    return '0' + day_o_month if is_single_char else day_o_month

In [31]:
# Zero-pad single-character month and day strings into helper columns.
test_dm_table['month_o'] = test_dm_table['month'].apply(fill_datecol_with_zeros)
test_dm_table['day_o'] = test_dm_table['day'].apply(fill_datecol_with_zeros)

test_dm_table

Unnamed: 0,date,value,month,day,year,month_o,day_o
0,11/29/1962,0.0389,11,29,1962,11,29
1,11/30/1962,0.0384,11,30,1962,11,30
2,12/3/1962,0.0387,12,3,1962,12,03
3,12/4/1962,0.038,12,4,1962,12,04
4,12/5/1962,0.0375,12,5,1962,12,05
...,...,...,...,...,...,...,...
14531,12/28/2020,,12,28,2020,12,28
14532,12/29/2020,,12,29,2020,12,29
14533,12/30/2020,,12,30,2020,12,30
14534,12/31/2020,,12,31,2020,12,31


In [32]:
# Reassemble the date from the zero-padded month/day and the year, then remove
# the helper columns so only the original columns remain.
test_dm_table['date'] = (
    test_dm_table['month_o']
    + '/' + test_dm_table['day_o']
    + '/' + test_dm_table['year']
)
test_dm_table.drop(columns=['month', 'month_o', 'day_o', 'year', 'day'], inplace=True)
test_dm_table

Unnamed: 0,date,value
0,11/29/1962,0.0389
1,11/30/1962,0.0384
2,12/03/1962,0.0387
3,12/04/1962,0.038
4,12/05/1962,0.0375
...,...,...
14531,12/28/2020,
14532,12/29/2020,
14533,12/30/2020,
14534,12/31/2020,
