In [286]:
from sqlalchemy import create_engine
import pandas as pd
import os
import json
from pathlib import Path
import spacy
import en_core_web_sm
import nltk
import re
# Load the small English spaCy pipeline once; it is shared by the cells below.
nlp = en_core_web_sm.load()
# Set the pipeline's maximum accepted text length (in characters) to six million.
nlp.max_length = 6_000_000

In [277]:
# python-dotenv copies the key/value pairs of a local .env file into os.environ.
from dotenv import load_dotenv

load_dotenv()

# Pull the four database settings out of the environment in one step.
# Any variable that is not set comes back as None.
user, password, db_hostname, db_name = (
    os.getenv(key) for key in ("DB_USER", "DB_PASS", "DB_HOST", "DB_DATABASE")
)

# Assemble the SQLAlchemy URL (MySQL through the mysqldb driver, utf8mb4 charset)
# and create the engine used by the query cells.
conn_string = f"mysql+mysqldb://{user}:{password}@{db_hostname}/{db_name}?charset=utf8mb4"
engine = create_engine(conn_string)

In [289]:
def lemmaspacy(my_new_str):
    """Return ``my_new_str`` with every token replaced by its spaCy lemma.

    Runs of consecutive spaces are collapsed to a single space first, then the
    text is passed through the module-level ``nlp`` pipeline. The lemmas of all
    resulting tokens are joined back together with single spaces.
    """
    collapsed = re.sub(' +', ' ', my_new_str)
    lemmas = (token.lemma_ for token in nlp(collapsed))
    return " ".join(lemmas)

In [290]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
# Fetch the WordNet corpus (skipped by NLTK when already present) and build
# the lemmatizer instance shared by lemmawordnet().
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()


def lemmawordnet(my_new_str):
    """Return ``my_new_str`` with each word token lemmatized as a noun.

    The text is split with ``nltk.word_tokenize``; every token is run through
    the module-level WordNet ``lemmatizer`` with ``pos='n'`` and the results
    are joined with single spaces.
    """
    tokens = nltk.word_tokenize(my_new_str)
    return ' '.join(lemmatizer.lemmatize(tok, pos='n') for tok in tokens)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\t1nipun\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [291]:
def clean_dataframe(dataframe, column_name):
    """Normalise the text in one column of ``dataframe`` and return the frame.

    The column is modified in place (callers may ignore the return value):

    1. every value is lower-cased;
    2. the literal tags ``<s>`` and ``</s>`` are removed;
    3. the optional-plural marker ``(s)`` is replaced by a plain ``s``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column to clean.
    column_name : str
        Name of a string column in ``dataframe``.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object that was passed in.
    """
    cleaned = dataframe[column_name].str.lower()
    # regex=True must be explicit: since pandas 2.0 str.replace treats the
    # pattern literally by default, which would make both replacements no-ops.
    cleaned = cleaned.str.replace(r'</?s>', '', regex=True)
    # Raw string so the escaped parentheses are not an invalid escape sequence.
    # reference: https://stackoverflow.com/questions/51440233/how-to-remove-the-values-which-are-in-parentheses-in-pandas
    cleaned = cleaned.str.replace(r'\(s\)', 's', regex=True)
    dataframe[column_name] = cleaned
    return dataframe

In [295]:
query = "SELECT * FROM flat_data WHERE rating_coding = '';"

# Read every matching row into a DataFrame; the connection is closed on exit.
with engine.connect() as conn:
    df = pd.read_sql(query, conn)

# Tally how often each normalised column header occurs across all tables.
dic = {}
for row in df.itertuples():
    # table_content is a JSON string holding a list of rows (lists of strings);
    # the first row carries the column headers.
    table = json.loads(row.table_content)
    headers = table[0]
    for raw_header in headers:
        lemmatized = lemmaspacy(raw_header)
        # Strip whitespace that ends up inside parentheses
        # (reference: https://stackoverflow.com/questions/34088489/how-to-remove-whitespace-inside-bracket)
        header = re.sub(r'\(.*?\)', lambda x: ''.join(x.group(0).split()), lemmatized)
        dic[header] = dic.get(header, 0) + 1

# (header, count) pairs, most frequent first; ties keep first-seen order.
my_list = sorted(dic.items(), key=lambda tup: tup[1], reverse=True)
total_headers = sum(dic.values())

print(f"COUNT\tHEADER")
for header, count in my_list:
    print(f"{count}\t{header}")
print()
print(f'Total headers: {total_headers}; unique headers: {len(my_list)}')

COUNT	HEADER
73	status
72	land use
71	vec
71	gis
71	topic
67	location
61	environmental issue
59	recommendation
47	legal location
46	note
23	2nd year (2014) pcrm assessment and work complete
22	kp
20	issue
19	1st year issue/ condition
19	current status 2011
18	environmental issue/ concern
18	current status 2012
17	1st year (2013) issue
16	legal location (w4m)
16	current status 2013
15	current status 2010
15	recommend measure to resolve issue and schedule
14	3rd year (2015) pcrm assessment and work complete
14	1st year (2014) issue
13	4th year (2016) pcrm assessment and work complete
13	potential adverse environmental effect
13	3rd year (2016) pcrm assessment and work complete
13	legal land description
13	legal location (w6m)
13	approximate kp
13	alignment sheet
12	outstanding issue
12	observation in 2010
12	current status 2014
11	issue / condition
11	5th year (2017) pcrm assessment and work complete
11	2015 pcrm assessment and work complete
10	propose schedule
10	biophysical element
9	p

In [None]:
# Print the tableId of every table whose first header cell contains "SPREAD"
# and whose second header cell contains "QUARTER SECTION".
lst_element1 = "SPREAD"
lst_element2 = "QUARTER SECTION"

for row in df.itertuples():
    table = json.loads(row.table_content)
    # Tables with no rows, or with fewer than two header cells, cannot match
    # and would otherwise raise IndexError on the lookups below.
    if not table or len(table[0]) < 2:
        continue
    if lst_element1 in table[0][0] and lst_element2 in table[0][1]:
        print(row.tableId)

In [299]:
# For the first seven tables, make sure the header row ends with the
# VEC / GIS / Topic columns and that the data rows are padded to match.
for row in df.head(7).itertuples():
    table = json.loads(row.table_content)
    # The original `if` branch held only comments, which is a syntax error;
    # the test is inverted so that only the branch doing real work remains.
    if table[0][-1] != "Topic":
        table[0].extend(['VEC', 'GIS', 'Topic'])
        # Pad every data row, including the last one.
        # NOTE(review): the original range(1, len(table)-1) stopped one row
        # early, leaving the final row shorter than the header - confirm that
        # skipping it was not intentional.
        for data_row in table[1:]:
            data_row.extend(['', '', ''])
    # CSV export (currently disabled); applies to both padded and untouched tables:
    # data = pd.DataFrame(table)
    # data.to_csv(r"C:\Users\t1nipun\Desktop\PCMR\human-robot\Data_Analysis\csvs\\" + row.tableId + '.csv', encoding = 'utf-8-sig', index = False, header = None)