In [1]:
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz 
import os
import json
from pathlib import Path
import re

In [2]:
# Build the SQLAlchemy engine from settings kept in the .env file / OS environment.
from urllib.parse import quote_plus

# Importing environmental variables library that reads from the .env file
from dotenv import load_dotenv

# Loading key-value pairs from the .env file into the OS environment
load_dotenv()

# Reading the key-value pairs from the OS environment
user = os.getenv("DB_USER")
password = os.getenv("DB_PASS")
db_hostname = os.getenv("DB_HOST")
db_name = os.getenv("DB_DATABASE")

# Fail fast with a readable message: otherwise a missing variable is silently
# formatted into the URL as the literal text "None" and only fails on connect.
_db_settings = {
    "DB_USER": user,
    "DB_PASS": password,
    "DB_HOST": db_hostname,
    "DB_DATABASE": db_name,
}
_missing_settings = [key for key, value in _db_settings.items() if value is None]
if _missing_settings:
    raise RuntimeError(
        "Missing database settings in the environment/.env file: "
        + ", ".join(_missing_settings)
    )

# User name and password are URL-escaped so that characters such as '@', ':'
# or '/' inside the credentials cannot be mis-parsed as URL delimiters.
conn_string = (
    f"mysql+mysqldb://{quote_plus(user)}:{quote_plus(password)}"
    f"@{db_hostname}/{db_name}?charset=utf8mb4"
)
engine = create_engine(conn_string)

In [194]:
# Rows of `issues` with a non-null status and no sub_vec value (WHERE clause below),
# left-joined to their word2vec labels.
query = "SELECT distinct i.tableId, i.rowIndex , i.vec_pri, i.vec_sec, i.issue_pri, i.issue_sec, w.word2vec_vec FROM issues i LEFT JOIN issues_parsed ip ON i.tableId = ip.tableId AND i.rowIndex = ip.rowIndex LEFT JOIN sub_vecs s ON i.tableId = s.tableId AND i.rowIndex = s.rowIndex LEFT JOIN word2vec w ON i.tableId = w.tableId AND i.rowIndex = w.rowIndex WHERE s.sub_vec IS NULL AND i.status is not null;"
with engine.connect() as conn:
    df = pd.read_sql(query, conn)

# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form is deprecated and does not update
# `df` under pandas Copy-on-Write, which would leave NaN values for ', '.join.
df['word2vec_vec'] = df['word2vec_vec'].fillna('')

# Concatenate all word2vec labels of one (tableId, rowIndex) into a single string.
aggr_vec_df = df.groupby(['tableId','rowIndex'], as_index = False).agg({'word2vec_vec': ', '.join}) #https://stackoverflow.com/questions/27298178/concatenate-strings-from-several-rows-using-pandas-groupby

# Re-attach the remaining issue columns and keep one (the last) row per key.
final_issues_df = pd.merge(aggr_vec_df, df, how = 'left', on = ['tableId', 'rowIndex'])
final_issues_df = final_issues_df.drop_duplicates(subset = ['tableId', 'rowIndex'], keep = 'last').reset_index(drop = True)

# The merge suffixes the clashing column: *_x is the aggregated string, *_y the per-row label.
final_issues_df = (
    final_issues_df
    .drop(columns = ['word2vec_vec_y'])
    .rename(columns = {'word2vec_vec_x': 'VECs'})
)
print(len(final_issues_df))
#final_issues_df.to_csv('issues.csv', encoding = 'utf-8-sig')

1263


In [195]:
# Rows of `issues_parsed` with no sub_vec value (WHERE clause below),
# left-joined to their word2vec labels on (tableId, rowIndex, rowCounter).
query = "SELECT distinct ip.tableId, ip.rowIndex , ip.rowCounter, ip.issue_parsed, w.word2vec_vec FROM issues_parsed ip LEFT JOIN sub_vecs s ON ip.tableId = s.tableId AND ip.rowIndex = s.rowIndex AND ip.rowCounter = s.rowCounter LEFT JOIN word2vec w ON ip.tableId = w.tableId AND ip.rowIndex = w.rowIndex AND ip.rowCounter = w.rowCounter WHERE s.sub_vec IS NULL;"
with engine.connect() as conn:
    df = pd.read_sql(query, conn)

# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form is deprecated and does not update
# `df` under pandas Copy-on-Write, which would leave NaN values for ', '.join.
df['word2vec_vec'] = df['word2vec_vec'].fillna('')

# Concatenate all word2vec labels of one (tableId, rowIndex, rowCounter) into a single string.
aggr_vec_df = df.groupby(['tableId','rowIndex', 'rowCounter'], as_index = False).agg({'word2vec_vec': ', '.join}) #https://stackoverflow.com/questions/27298178/concatenate-strings-from-several-rows-using-pandas-groupby

# Re-attach the remaining columns and keep one (the last) row per key.
final_issues_df = pd.merge(aggr_vec_df, df, how = 'left', on = ['tableId', 'rowIndex', 'rowCounter'])
final_issues_df = final_issues_df.drop_duplicates(subset = ['tableId', 'rowIndex', 'rowCounter'], keep = 'last').reset_index(drop = True)

# The merge suffixes the clashing column: *_x is the aggregated string, *_y the per-row label.
final_issues_df = (
    final_issues_df
    .drop(columns = ['word2vec_vec_y'])
    .rename(columns = {'word2vec_vec_x': 'VECs'})
)
print(len(final_issues_df))
#final_issues_df.to_csv('issues_parsed.csv', encoding = 'utf-8-sig')

5390


In [253]:
# Read 'issue_parsed_clean.csv' (decoded as cp1252), then report its columns and row count.
df1 = pd.read_csv(
    'issue_parsed_clean.csv',
    encoding='cp1252',
)
print(df1.columns, df1.shape[0])

Index(['tableId', 'rowIndex', 'count', 'status_txt', 'issues'], dtype='object') 74


In [259]:
# Tables whose rows are left out of df2.
excluded_table_ids = [
    "3e9e6cdb-f812-4832-b69c-b8ec0396d585",
    "44a33e5f-d99e-48ef-ad56-bbb516ec8796",
    "bfafbfd0-8bb5-4283-8f5e-dd7cbcec480c",
]

# Read 'v_new_parsed_issues.csv' and keep only rows whose tableId is not excluded.
df2 = pd.read_csv('v_new_parsed_issues.csv', encoding='utf-8-sig')
keep_mask = ~df2['tableId'].isin(excluded_table_ids)
df2 = df2.loc[keep_mask]
print(df2.columns, len(df2))

Index(['tableId', 'rowIndex', 'status_bin', 'status_txt', 'count', 'issues'], dtype='object') 9650


In [255]:
# Combined number of rows in df1 and df2.
total_len = sum(len(frame) for frame in (df1, df2))
total_len

9724

In [256]:
# Display df2 for visual inspection (notebook rich output).
df2

Unnamed: 0,tableId,rowIndex,status_bin,status_txt,count,issues
0,02db9f91-572a-44af-9858-4add101353c1,1,,"June 22, 2016 – Intermittent weeds observed, r...",1,"June 22, 2016 – Intermittent weeds observed, r..."
1,02db9f91-572a-44af-9858-4add101353c1,1,,"June 22, 2016 – Intermittent weeds observed, r...",2,"July 13, 2016 - Vegetation continues to establ..."
2,02db9f91-572a-44af-9858-4add101353c1,1,,"June 22, 2016 – Intermittent weeds observed, r...",3,"July 13, 2016 - Cross drains are vegetated an..."
3,02db9f91-572a-44af-9858-4add101353c1,1,,"June 22, 2016 – Intermittent weeds observed, r...",4,"July 13, 2016 - Wetlands stable and consiste..."
4,02db9f91-572a-44af-9858-4add101353c1,1,,"June 22, 2016 – Intermittent weeds observed, r...",5,"July 13, 2016 - No erosion or subsidence issues."
...,...,...,...,...,...,...
9786,fdb3d057-943a-4fab-99ac-1f4eed471512,55,,"June 28, 2017 – Intermittent perennial sow thi...",2,"Aug 18, 2017 - Vegetation established at 65-80..."
9787,fdb3d057-943a-4fab-99ac-1f4eed471512,55,,"June 28, 2017 – Intermittent perennial sow thi...",3,"Aug 18, 2017 – Intermittent perennial sow thi..."
9788,fdb3d057-943a-4fab-99ac-1f4eed471512,55,,"June 28, 2017 – Intermittent perennial sow thi...",4,"Aug 23, 2017 – Intermittent herbicide applicat..."
9789,fdb3d057-943a-4fab-99ac-1f4eed471512,56,,,1,.


In [260]:
# Stack df1 on top of df2. ignore_index=True gives the result a fresh 0..n-1
# index; without it the row labels of both inputs are kept and repeat, which
# makes label-based lookups on `df` ambiguous. Columns present in only one
# input (df2 has 'status_bin', df1 does not) are filled with NaN for the other.
df = pd.concat([df1, df2], ignore_index=True)

In [261]:
# Write the combined frame to disk without the index column.
df.to_csv(
    'issue_parsed_clean1.csv',
    encoding='utf-8-sig',
    index=False,
)

In [None]:
def clean_dataframe(dataframe: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """Normalise the text of one column in place and return the same frame.

    The column is lower-cased, the markup tokens ``<s>`` and ``</s>`` are
    removed, and the plural marker ``(s)`` is rewritten to ``s``
    (e.g. ``"Weed(s)"`` -> ``"weeds"``). Missing values stay missing.

    Parameters
    ----------
    dataframe : frame holding the column; it is modified in place.
    column_name : name of a string column of ``dataframe``.

    Returns
    -------
    The same ``dataframe`` object, for call chaining.
    """
    text = dataframe[column_name].str.lower()
    # regex must be requested explicitly: since pandas 2.0 str.replace treats the
    # pattern as a literal string by default, so the alternation never matched.
    text = text.str.replace(r'<s>|</s>', '', regex=True)
    # Plain literal replacement; avoids the invalid '\(' escape of a non-raw
    # regex string. reference: https://stackoverflow.com/questions/51440233/how-to-remove-the-values-which-are-in-parentheses-in-pandas
    text = text.str.replace('(s)', 's', regex=False)
    dataframe[column_name] = text
    return dataframe

In [None]:
# Count how often each column header occurs in the stored tables of one tableId.
query = "SELECT * FROM issues WHERE tableId = '3e1c53b4-5c01-46e2-bd72-5a338b5852f9';"

with engine.connect() as conn:
    df = pd.read_sql(query, conn)
    #clean_dataframe(df, 'table_content')

# header text -> number of occurrences
dic = {}
for row in df.itertuples():
    # table_content is a JSON string: a list of rows, each a list of strings
    table = json.loads(row.table_content)
    headers = table[0]  # the first row carries the column headers
    for header in headers:
        # Optional normalisation steps, currently disabled:
        #header = lemmawordnet(header)
        #header = clean_header(header)
        #header = lemmaspacy(header)
        #header = re.sub(r'\(.*?\)', lambda x: ''.join(x.group(0).split()), header) # removing whitespace between parentheses (reference: https://stackoverflow.com/questions/34088489/how-to-remove-whitespace-inside-bracket)
        header = " ".join(header.split())  # collapse runs of whitespace
        dic[header] = dic.get(header, 0) + 1

# (header, count) pairs, most frequent first; ties keep first-seen order
my_list = sorted(dic.items(), key=lambda pair: pair[1], reverse=True)

print(f"COUNT\tHEADER")
for item in my_list:
    print(f"{item[1]}\t{item[0]}")
total_headers = sum(pair[1] for pair in my_list)
print()
print(f'Total headers: {total_headers}; unique headers: {len(my_list)}')

In [None]:
# Save the (header, count) pairs as a two-column CSV without the index.
df3 = pd.DataFrame(data=my_list)
df3.to_csv('headers4.csv', index=False, encoding='utf-8-sig')

In [None]:
# Print the tableId of every table whose first header cell contains "SPREAD"
# and whose second header cell contains "QUARTER SECTION".
lst_element1 = "SPREAD"
lst_element2 = "QUARTER SECTION"

for row in df.itertuples():
    table = json.loads(row.table_content)
    header_row = table[0] if table else []
    # Tables with fewer than two header cells cannot match; skipping them
    # avoids the IndexError the unguarded table[0][1] lookup would raise.
    if len(header_row) < 2:
        continue
    first_cell, second_cell = header_row[0], header_row[1]
    # Substring tests are only meaningful on text cells (JSON null would raise TypeError).
    if not (isinstance(first_cell, str) and isinstance(second_cell, str)):
        continue
    if lst_element1 in first_cell and lst_element2 in second_cell:
        print(row.tableId)

In [None]:
# Export every stored table to "<tableId>.csv", making sure each one ends with
# the annotation columns VEC / GIS / Topic.
csv_dir = Path(r"C:\Users\t1nipun\Desktop\PCMR\human-robot\Data_Analysis\csvs")
annotation_headers = ['VEC', 'GIS', 'Topic']

for row in df.itertuples():
    table = json.loads(row.table_content)
    # A table whose last header is already "Topic" is written unchanged.
    if table[0][-1] != "Topic":
        table[0].extend(annotation_headers)
        # Pad every data row, including the last one: the previous
        # range(1, len(table)-1) stopped one row early and left it ragged.
        for data_row in table[1:]:
            data_row.extend([''] * len(annotation_headers))
    data = pd.DataFrame(table)
    # header=False: the header is already the first row of `table`.
    data.to_csv(csv_dir / f"{row.tableId}.csv", encoding = 'utf-8-sig', index = False, header = False)

In [None]:
!pip install opencv-python
!pip install camelot-py[all] 
# Please install ghostcript using this link: https://www.ghostscript.com/
import camelot
import tkinter
#tables = camelot.read_pdf('G:/Post Construction/george.pdf', pages = '7', line_scale=40, flag_size=True, copy_text=['v'],) # latice method
tables = camelot.read_pdf('G:/Post Construction/george.pdf', flavor = 'stream', edge_tol=500, pages = '6-38') # stream method
tables.export('scheduleA.csv', f='csv', compress=True)

In [None]:
# Export all extracted tables as compressed CSV; per the trailing note,
# `f` presumably also accepts json, excel and html - confirm in the camelot docs.
tables.export('scheduleA.csv', f='csv', compress=True) # json, excel, html
# Display the first extracted table object.
tables[0]

In [None]:
# Show the parsing report attribute of the first extracted table.
tables[0].parsing_report

In [None]:
# Save only the first extracted table to 'george1.csv' ...
tables[0].to_csv('george1.csv')
# ... and display its `.df` attribute (presumably the table as a pandas DataFrame - confirm).
tables[0].df

In [3]:
# Dump the whole pcmr.word2vec table to 'word2vec.csv' and display it.
query = "SELECT * FROM pcmr.word2vec;"
with engine.connect() as conn:
    df = pd.read_sql(sql=query, con=conn)
df.to_csv(
    'word2vec.csv',
    encoding='utf-8-sig',
)
df

Unnamed: 0,vec_id,tableId,rowIndex,rowCounter,word2vec_vec
0,1,0114daa6-c048-4081-b4cf-7a9d7461fa44,1,,water
1,2,0114daa6-c048-4081-b4cf-7a9d7461fa44,1,,physical
2,3,0114daa6-c048-4081-b4cf-7a9d7461fa44,2,,vegetation
3,4,0114daa6-c048-4081-b4cf-7a9d7461fa44,3,,physical
4,5,0114daa6-c048-4081-b4cf-7a9d7461fa44,4,,vegetation
...,...,...,...,...,...
20285,20286,fdb3d057-943a-4fab-99ac-1f4eed471512,55,1.0,vegetation
20286,20287,fdb3d057-943a-4fab-99ac-1f4eed471512,55,2.0,vegetation
20287,20288,fdb3d057-943a-4fab-99ac-1f4eed471512,55,2.0,physical
20288,20289,fdb3d057-943a-4fab-99ac-1f4eed471512,56,1.0,
