In [1]:
import json
import os
import sqlite3 as db
from sqlite3 import Error

import pandas as pd
import requests

%run C:\Users\Joshua\Jupyter_Notebook_Folders\APIkeys.py

### Functions for working with SQLite

In [13]:
def create_database(db_name):
    """
    Create an SQLite database by opening (and immediately closing) a
    connection to db_name.

    db_name: path of the database file to create
    return: None; a connection error is printed rather than raised
    """
    conn = None
    try:
        conn = db.connect(db_name)
    except Error as e:
        print(e)
    finally:
        # conn is still None when connect() failed, so only close a real
        # connection; otherwise the cleanup itself raises AttributeError.
        if conn is not None:
            conn.close()
            
def create_connection(db_name):
    """
    Open a connection to the SQLite database specified by db_name.

    db_name: path of the database file
    return: Connection object, or None when the connection attempt
        raised an sqlite3 Error (the error is printed)
    """
    try:
        connection = db.connect(db_name)
    except Error as err:
        print(err)
        connection = None

    return connection


def check_table_exists(tbl_name, db_name):
    """
    Check if table tbl_name exists in the database db_name.

    tbl_name: name of the table to look for
    db_name: path of the SQLite database file
    return: 1 if the table exists, otherwise 0
    """
    conn = create_connection(db_name)
    exists = 0
    try:
        c = conn.cursor()
        c.execute("SELECT COUNT(name) FROM sqlite_master WHERE type = 'table'")
        if c.fetchone()[0] == 0:
            print("No tables in DB.")
        else:
            try:
                # Look up the table passed in as tbl_name (not a notebook
                # global) and bind it as a parameter instead of splicing it
                # into the SQL text.
                c.execute(
                    "SELECT COUNT(name) FROM sqlite_master "
                    "WHERE type = 'table' AND name = ?;",
                    (tbl_name,),
                )
                # Fetch inside the try so a failed query is reported once
                # instead of being followed by a fetch on a failed cursor.
                if c.fetchone()[0] == 1:
                    exists = 1
            except Error as e:
                print(e)
    finally:
        conn.close()

    return exists
    
    
def create_table(input_df, tbl_name, db_name):
    """
    Create table tbl_name in database db_name from input_df, unless a table
    with that name already exists.

    input_df: DataFrame whose contents become the new table
    tbl_name: name of the table to create
    db_name: path of the SQLite database file
    return: None; a ValueError raised by to_sql is printed rather than raised
    """
    # Nothing to do (and no connection needed) when the table is already there.
    if check_table_exists(tbl_name, db_name):
        return

    conn = create_connection(db_name)
    try:
        input_df.to_sql(tbl_name, conn)
        print("Created ", tbl_name, " table.")
    except ValueError as e:
        print(e)
        print("Table was not added to the DB")
    finally:
        # Close the connection even when to_sql raises something other
        # than ValueError.
        conn.close()
    
    
def validate_df(df, tbl_name, db_name):
    """
    Compare the rows of the API dataframe df with table tbl_name and split
    them by whether their date is already stored.

    df: DataFrame with at least the columns 'date' and 'value'
    tbl_name: name of the table to compare against
    db_name: path of the SQLite database file
    return: tuple of three DataFrames with the same columns as df
        df_not_dup_data: rows whose date is not in the table, sorted by date
        df_dup_data: rows whose date is in the table with the same value
        df_inconsistent_data: rows whose date is in the table with a
            different value
    """
    if check_table_exists(tbl_name, db_name) == 0:
        # No table yet: every row is new.
        df_not_dup_data = df.copy()
        df_dup_data = df.iloc[0:0].copy()
        df_inconsistent_data = df.iloc[0:0].copy()
    else:
        # Double-quote the identifier so the table name cannot alter the
        # statement; the date itself is bound as a parameter.
        quoted_name = '"' + tbl_name.replace('"', '""') + '"'
        query = "SELECT date, value FROM " + quoted_name + " WHERE date = ?;"

        # Collect row positions and slice once at the end instead of growing
        # three DataFrames with concat on every iteration.
        new_rows, dup_rows, inconsistent_rows = [], [], []

        conn = create_connection(db_name)
        try:
            c = conn.cursor()
            for pos, (date, value) in enumerate(zip(df["date"], df["value"])):
                c.execute(query, (date,))
                stored = c.fetchone()
                if stored is None:
                    new_rows.append(pos)
                elif stored[1] == value:
                    dup_rows.append(pos)
                else:
                    inconsistent_rows.append(pos)
        finally:
            conn.close()

        df_not_dup_data = df.iloc[new_rows]
        df_dup_data = df.iloc[dup_rows]
        df_inconsistent_data = df.iloc[inconsistent_rows]

    df_not_dup_data = df_not_dup_data.sort_values(by=['date'])

    return df_not_dup_data, df_dup_data, df_inconsistent_data


def add_to_database(df_EntryData, tbl_name, db_name, append_replace):
    """
    Write dataframe df_EntryData to
    database: db_name
    table: tbl_name

    append_replace: passed to DataFrame.to_sql as if_exists
        (for example 'append' or 'replace')
    return: None; a ValueError raised by to_sql is printed rather than raised
    """
    conn = create_connection(db_name)
    try:
        # Write to the table named by the tbl_name argument rather than a
        # notebook-level global.
        df_EntryData.to_sql(tbl_name, con=conn, if_exists=append_replace)
        print("Data added to DB: \n", df_EntryData)
    except ValueError as e:
        print(e)
        print("Data was not added to the DB")
    finally:
        # Close the connection even when to_sql raises something other
        # than ValueError.
        conn.close()
    
    
def get_max_index(tbl_name, db_name):
    '''
    Get the maximum value of the [index] column of table tbl_name.

    tbl_name: name of the table to query
    db_name: path of the SQLite database file
    return: the maximum index value; None when the table has no rows or
        when the query raised an sqlite3 Error (the error is printed)
    '''
    # Query the table passed in as tbl_name; double-quote the identifier so
    # unusual table names cannot change the statement.
    quoted_name = '"' + tbl_name.replace('"', '""') + '"'
    conn = create_connection(db_name)
    try:
        c = conn.cursor()
        c.execute("SELECT MAX([index]) FROM " + quoted_name)
        max_index = c.fetchone()[0]
        return max_index
    except Error as e:
        print(e)
    finally:
        conn.close()
        
def get_max_date(tbl_name, db_name):
    '''
    Get the date stored in the row of table tbl_name that has the maximum
    [index] value.

    tbl_name: name of the table to query
    db_name: path of the SQLite database file
    return: the date value of that row; None when the table has no rows or
        when the query raised an sqlite3 Error (the error is printed)
    '''
    # Query the table passed in as tbl_name; double-quote the identifier so
    # unusual table names cannot change the statement.
    quoted_name = '"' + tbl_name.replace('"', '""') + '"'
    conn = create_connection(db_name)
    try:
        c = conn.cursor()
        c.execute(
            "SELECT date FROM " + quoted_name
            + " WHERE [index] = (SELECT MAX([index]) FROM " + quoted_name + ")"
        )
        row = c.fetchone()
        # An empty table yields no row at all; return None instead of
        # failing on row[0].
        return row[0] if row is not None else None
    except Error as e:
        print(e)
    finally:
        conn.close()

def table_to_df(tbl_name, db_name):
    """
    Read every row of table tbl_name into a DataFrame.

    tbl_name: name of the table to read
    db_name: path of the SQLite database file
    return: DataFrame indexed by the table's 'index' column
    raises: whatever pandas raises when the query fails (for example when
        the table does not exist); the connection is closed first
    """
    # Double-quote the identifier so unusual table names cannot change the
    # statement.
    quoted_name = '"' + tbl_name.replace('"', '""') + '"'
    conn = db.connect(db_name)
    try:
        # A failed read propagates to the caller: there is no DataFrame to
        # return in that case, and try/finally still releases the connection.
        return pd.read_sql_query("SELECT * FROM " + quoted_name + ";", conn, index_col="index")
    finally:
        conn.close()

### Get data from BLS

In [3]:
def get_BLS_data(seriesID, startyear, endyear):
    """
    Request one time series from the BLS public API (v2).

    seriesID: BLS series identifier
    startyear: first year to request (converted to str for the payload)
    endyear: last year to request (converted to str for the payload)
    return: tuple (message, json_data)
        message: "" when the response carries no messages, otherwise a
            summary built from the last four characters of each message
        json_data: the decoded JSON response
    raises: KeyError when the BLS_API_key environment variable is not set;
        requests exceptions on timeout, connection failure or HTTP error
    """
    base_url = 'https://api.bls.gov/publicAPI/v2/timeseries/data/'  # this will not change
    headers = {'Content-type': 'application/json'}  # this will not change

    parameters = {
        "seriesid": [seriesID],
        "startyear": str(startyear),
        "endyear": str(endyear),
        "catalog": True,
        "calculations": False,
        "annualaverage": False,
        "aspects": False,
        "registrationkey": os.environ['BLS_API_key']
    }

    data = json.dumps(parameters)  # Converts the Python dictionary to JSON

    # Without a timeout a stalled connection would block the notebook forever.
    p = requests.post(base_url, data=data, headers=headers, timeout=30)
    # Fail with the HTTP status rather than a confusing JSON decode error.
    p.raise_for_status()
    json_data = json.loads(p.text)

    message = ""
    # Tolerate a response without a 'message' key.
    api_messages = json_data.get('message') or []
    if api_messages:
        # Assumes each API message ends with the four-digit year it refers
        # to -- TODO confirm against the BLS API documentation.
        years = ", ".join(api_message[-4:] for api_message in api_messages)
        message = "For series " + seriesID + ", no data for years: " + years

    return message, json_data

In [4]:
# Testing aid only: lift the row limit so displayed dataframes show every row.
pd.options.display.max_rows = None

### Create and Format Dataframe

In [5]:
def format_df_from_json(json_data):
    """
    Build a DataFrame from the first series of a BLS API response.

    json_data: decoded API response; reads
        json_data['Results']['series'][0]['data'] and
        json_data['Results']['series'][0]['catalog']
    return: DataFrame sorted by 'date', with
        - a 'date' column made of year + period with 'M' replaced by '-'
        - 'value' converted to float
        - the 'footnotes' column removed
        - a 'description' column built from the catalog's survey name and
          series title
        - an index named 'index' that counts the rows from 1
    """
    series = json_data['Results']['series'][0]
    catalog = series['catalog']

    frame = pd.DataFrame(series['data'])

    # Date label: year and period joined, with the period's 'M' turned into '-'
    date_labels = (frame['year'] + frame['period']).str.replace('M', '-').astype('string')

    frame = (
        frame
        .assign(date=date_labels)
        .sort_values(by=['date'], ignore_index=True)
        .drop(columns=['footnotes'])
    )

    # Number the sorted rows 1..n and use that numbering as the index
    frame.index = pd.Index(list(range(1, len(frame.index) + 1)), name='index')

    frame['value'] = frame['value'].astype(float)
    frame['description'] = catalog['survey_name'] + ', ' + catalog['series_title']

    return frame

In [6]:
def align_df_db_indexes(df, seriesID, db_name):
    """
    Merge df with the rows already stored in table seriesID and renumber
    the combined rows in date order.

    df: DataFrame of new rows; must contain a 'date' column
    seriesID: name of the table to read the stored rows from
    db_name: path of the SQLite database file
    return: DataFrame holding the rows of df and of the table, sorted by
        'date', with an index named 'index' that counts the rows from 1
    """
    # Concat the data from the database table and the dataframe, then sort by date
    df_db = table_to_df(seriesID, db_name)
    merged = pd.concat([df, df_db]).sort_values(by=['date'], ignore_index=True)

    # Both inputs normally carry 'index' as their index, not as a column, so
    # an unconditional drop raises KeyError; only drop it when it is present.
    merged = merged.drop(columns=['index'], errors='ignore')

    # Renumber from 1 so the merged table keeps the same 1-based 'index'
    # numbering that format_df_from_json produces for a fresh download.
    merged.index = pd.RangeIndex(start=1, stop=len(merged.index) + 1, name='index')
    return merged

In [7]:
# Notebook-level settings: the BLS series to download and the SQLite file to
# store it in. Later cells pass both names to the helper functions.
seriesID = "CUUR0000SA0"
db_name = "MacroData.sqlite3"
# Request the series for the years 1913 through 1932.
message, json_data = get_BLS_data(seriesID, 1913, 1932)
# A non-empty message summarises what the API reported in its 'message' field.
if message:
    print(message)

In [8]:
# Turn the API response into a formatted dataframe and display it.
df = format_df_from_json(json_data)
df

Unnamed: 0_level_0,year,period,periodName,value,date,description
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1913,M01,January,9.8,1913-01,"CPI for All Urban Consumers (CPI-U), All items..."
2,1913,M02,February,9.8,1913-02,"CPI for All Urban Consumers (CPI-U), All items..."
3,1913,M03,March,9.8,1913-03,"CPI for All Urban Consumers (CPI-U), All items..."
4,1913,M04,April,9.8,1913-04,"CPI for All Urban Consumers (CPI-U), All items..."
5,1913,M05,May,9.7,1913-05,"CPI for All Urban Consumers (CPI-U), All items..."
6,1913,M06,June,9.8,1913-06,"CPI for All Urban Consumers (CPI-U), All items..."
7,1913,M07,July,9.9,1913-07,"CPI for All Urban Consumers (CPI-U), All items..."
8,1913,M08,August,9.9,1913-08,"CPI for All Urban Consumers (CPI-U), All items..."
9,1913,M09,September,10.0,1913-09,"CPI for All Urban Consumers (CPI-U), All items..."
10,1913,M10,October,10.0,1913-10,"CPI for All Urban Consumers (CPI-U), All items..."


In [9]:
# Split the API rows by whether their date is already stored in the table.
# `df` is rebound to the rows that are not in the table yet.
df, df_dup_data, df_inconsistent_data = validate_df(df, seriesID, db_name)
df

No tables in DB.


Unnamed: 0_level_0,year,period,periodName,value,date,description
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1913,M01,January,9.8,1913-01,"CPI for All Urban Consumers (CPI-U), All items..."
2,1913,M02,February,9.8,1913-02,"CPI for All Urban Consumers (CPI-U), All items..."
3,1913,M03,March,9.8,1913-03,"CPI for All Urban Consumers (CPI-U), All items..."
4,1913,M04,April,9.8,1913-04,"CPI for All Urban Consumers (CPI-U), All items..."
5,1913,M05,May,9.7,1913-05,"CPI for All Urban Consumers (CPI-U), All items..."
6,1913,M06,June,9.8,1913-06,"CPI for All Urban Consumers (CPI-U), All items..."
7,1913,M07,July,9.9,1913-07,"CPI for All Urban Consumers (CPI-U), All items..."
8,1913,M08,August,9.9,1913-08,"CPI for All Urban Consumers (CPI-U), All items..."
9,1913,M09,September,10.0,1913-09,"CPI for All Urban Consumers (CPI-U), All items..."
10,1913,M10,October,10.0,1913-10,"CPI for All Urban Consumers (CPI-U), All items..."


In [11]:
# Deal with duplicate data: offer to show rows that are already stored with
# the same value. The dataframes are kept (not deleted) when empty so this
# cell can be re-run without a NameError.
if not df_dup_data.empty:
    show_dups = input("Do you want to see the duplicate data? Y/N").strip().upper()
    if show_dups == "Y":
        print(df_dup_data)

# Deal with inconsistent data: offer to show rows whose stored value differs
# from the value returned by the API.
if not df_inconsistent_data.empty:
    show_incons = input("Do you want to see the inconsistent data? Y/N").strip().upper()
    if show_incons == "Y":
        print(df_inconsistent_data)

# When the table already exists, merge the new rows with the stored ones and
# rewrite the whole table; otherwise create it from the new rows.
if check_table_exists(seriesID, db_name) != 0:
    df = align_df_db_indexes(df, seriesID, db_name)
    add_to_database(df, seriesID, db_name, 'replace')
else:
    add_to_database(df, seriesID, db_name, 'append')

No tables in DB.
Data added to DB: 
        year period periodName  value     date  \
index                                           
1      1913    M01    January    9.8  1913-01   
2      1913    M02   February    9.8  1913-02   
3      1913    M03      March    9.8  1913-03   
4      1913    M04      April    9.8  1913-04   
5      1913    M05        May    9.7  1913-05   
6      1913    M06       June    9.8  1913-06   
7      1913    M07       July    9.9  1913-07   
8      1913    M08     August    9.9  1913-08   
9      1913    M09  September   10.0  1913-09   
10     1913    M10    October   10.0  1913-10   
11     1913    M11   November   10.1  1913-11   
12     1913    M12   December   10.0  1913-12   
13     1914    M01    January   10.0  1914-01   
14     1914    M02   February    9.9  1914-02   
15     1914    M03      March    9.9  1914-03   
16     1914    M04      April    9.8  1914-04   
17     1914    M05        May    9.9  1914-05   
18     1914    M06       June   

In [13]:
#add_to_database(df, seriesID, db_name, 'replace')

In [14]:
# Read the stored table back from the database and display it.
df_cpi_not_seasonal = table_to_df(seriesID, db_name)
df_cpi_not_seasonal

Unnamed: 0_level_0,year,period,periodName,value,date,description
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1913,M01,January,9.8,1913-01,"CPI for All Urban Consumers (CPI-U), All items..."
2,1913,M02,February,9.8,1913-02,"CPI for All Urban Consumers (CPI-U), All items..."
3,1913,M03,March,9.8,1913-03,"CPI for All Urban Consumers (CPI-U), All items..."
4,1913,M04,April,9.8,1913-04,"CPI for All Urban Consumers (CPI-U), All items..."
5,1913,M05,May,9.7,1913-05,"CPI for All Urban Consumers (CPI-U), All items..."
6,1913,M06,June,9.8,1913-06,"CPI for All Urban Consumers (CPI-U), All items..."
7,1913,M07,July,9.9,1913-07,"CPI for All Urban Consumers (CPI-U), All items..."
8,1913,M08,August,9.9,1913-08,"CPI for All Urban Consumers (CPI-U), All items..."
9,1913,M09,September,10.0,1913-09,"CPI for All Urban Consumers (CPI-U), All items..."
10,1913,M10,October,10.0,1913-10,"CPI for All Urban Consumers (CPI-U), All items..."


In [15]:
# Clean-up: remove the series table from the database used by the cells above.
con = db.connect(db_name)
try:
    c = con.cursor()
    # IF EXISTS keeps the cell re-runnable once the table is gone; the
    # identifier is double-quoted so the name cannot change the statement.
    c.execute('DROP TABLE IF EXISTS "' + seriesID.replace('"', '""') + '"')
    con.commit()
finally:
    # Release the connection even when the statement fails.
    con.close()

In [None]:
    # NOTE(review): this cell has no execution count and its lines are indented
    # as if lifted from a function body; `db_index_start` is not defined in the
    # visible cells -- confirm whether this fragment is still needed.
    # Set the index using the value from the max index in the db table
    df["index"] = range(db_index_start, db_index_start + len(df.index))
    df = df.set_index(df["index"])
