In [None]:
from sqlalchemy import create_engine
import pandas as pd
import os
import json

In [None]:
# Database connection setup.
# Settings come from a .env file so that credentials are never written in the notebook.
from urllib.parse import quote

from dotenv import load_dotenv

# Load key-value pairs from the .env file into the OS environment.
load_dotenv()

# Read the connection settings from the OS environment.
user = os.getenv("DB_USER")
password = os.getenv("DB_PASS")
db_hostname = os.getenv("DB_HOST")
db_name = os.getenv("DB_DATABASE")

# Fail early with a clear message: an unset variable would otherwise be
# interpolated into the URL as the literal text "None" and only surface later
# as a confusing connection/authentication error.
missing_vars = [
    name
    for name, value in (
        ("DB_USER", user),
        ("DB_PASS", password),
        ("DB_HOST", db_hostname),
        ("DB_DATABASE", db_name),
    )
    if value is None
]
if missing_vars:
    raise RuntimeError(
        f"Missing environment variable(s): {', '.join(missing_vars)}"
    )

# Percent-encode the credentials before placing them in the URL. Characters
# such as '@', ':', '/' or '#' in a password would otherwise be parsed as URL
# delimiters. `safe=""` also encodes '/', and spaces become %20 (not '+').
# NOTE(review): this assumes the .env file stores the raw (not already
# percent-encoded) user name and password - confirm.
conn_string = (
    f"mysql+mysqldb://{quote(user, safe='')}:{quote(password, safe='')}"
    f"@{db_hostname}/{db_name}?charset=utf8mb4"
)
engine = create_engine(conn_string)

In [31]:
# Inspect the schema of table 'flat_data' (one output row per table column).
with engine.connect() as connection:
    result = pd.read_sql(sql="DESC flat_data;", con=connection)

# The frame is fully loaded at this point, so the connection is no longer needed.
print(result)

                      Field          Type Null Key Default Extra
0   application_title_short  varchar(255)   NO        None      
1         application_title  varchar(255)   NO        None      
2                     pdfId           int   NO        None      
3                  filingId           int   NO        None      
4                      date      datetime   NO        None      
5            application_id           int   NO        None      
6                 submitter  varchar(255)   NO        None      
7                   company  varchar(255)  YES        None      
8                   tableId   varchar(36)   NO        None      
9                   pdfName  varchar(255)   NO        None      
10                     page           int   NO        None      
11               tableTitle    mediumtext   NO        None      
12            table_content          json  YES        None      
13                  eil_one    varchar(4)   NO                  
14              eil_sever

In [38]:
# Load every row of 'flat_data' into a DataFrame; the connection is closed
# automatically when the `with` block exits.
query = "SELECT * FROM flat_data;"

with engine.connect() as connection:
    df = pd.read_sql(sql=query, con=connection)

from collections import Counter

# Count how often each column header occurs across the first (header) row of
# every table stored in `df["table_content"]`.
# `dic` maps header -> number of occurrences (a Counter is a dict subclass).
dic = Counter()
skipped_rows = 0  # rows that had no header row to count

for raw_table in df["table_content"]:
    # `table_content` is a nullable JSON column, so NULL cells arrive as
    # None/NaN; only text can be decoded, and json.loads(None) would raise.
    if not isinstance(raw_table, (str, bytes, bytearray)):
        skipped_rows += 1
        continue

    # Decode the JSON text; it is expected to be a list of rows, each row a
    # list of strings, with the column headers in the first row.
    table = json.loads(raw_table)

    # An empty table (or JSON null) has no header row; table[0] would raise.
    if not table:
        skipped_rows += 1
        continue

    for header in table[0]:
        dic[header] += 1

# (header, count) pairs, most frequent first; ties keep first-seen order.
my_list = dic.most_common()
total_headers = sum(dic.values())

print("COUNT\tHEADER")
for header, count in my_list:
    print(f"{count}\t{header}")
print()
print(f'Total headers: {total_headers}; unique headers: {len(my_list)}')

# Only reported when something was skipped, so the usual output is unchanged.
if skipped_rows:
    print(f"Skipped {skipped_rows} row(s) without table content")

COUNT	HEADER
71	VEC
71	GIS
71	Topic
67	Status
66	Land Use
64	Location
47	Legal Location
44	Environmental Issues
41	Notes
40	Recommendation(s)
22	KP
21	Cover Type
20	Cover Ranking Assignment Low
20	Cover Ranking Assignment Moderate
20	Cover Ranking Assignment High
19	Current Status 2011
18	Environmental Issue/ Concern
18	Current Status 2012
17	Recommendations
16	1<s>st</s> Year Issue/ Condition
16	Legal Location (W4M)
16	Current Status 2013
15	Issue(s)
15	Current Status 2010
14	Environmental Issue(s)
13	2nd Year (2014) PCRM Assessment and Work Completed
13	3rd Year (2015) PCRM Assessment and Work Completed
13	Surface Drainage (ponding)
13	Condition/ Issue Rating
13	Legal Land Description
13	Recommended Measure(s) to Resolve Issue(s) and Schedule
13	Approximate KP
12	4th Year (2016) PCRM Assessment and Work Completed
12	Potential Adverse Environmental Effect
12	1<s>st</s> Year (2014) Issue(s)
12	3<s>rd</s> Year (2016) PCRM Assessment and Work Completed
12	Legal Location (W6M)
12	Current 