In [478]:
import json
import os
import re
from urllib.parse import quote

import pandas as pd
# Importing environmental variables library that reads from the .env file
from dotenv import load_dotenv
from sqlalchemy import create_engine, text

# Load the key-value pairs from the .env file into the OS environment so that
# the DB_* settings can be read with os.getenv() in the next cell
load_dotenv()

True

In [None]:
# Reading the key-value pairs from the OS environment
user = os.getenv("DB_USER")
password = os.getenv("DB_PASS")
db_hostname = os.getenv("DB_HOST")
db_name = os.getenv("DB_DATABASE")

# Fail early with a readable message: os.getenv() returns None for an unset
# variable, which would otherwise end up in the URL as the literal text "None"
missing_settings = [
    name
    for name, value in (
        ("DB_USER", user),
        ("DB_PASS", password),
        ("DB_HOST", db_hostname),
        ("DB_DATABASE", db_name),
    )
    if value is None
]
if missing_settings:
    raise RuntimeError(
        "Missing database settings in the environment/.env file: "
        + ", ".join(missing_settings)
    )

# Build the connection string with an f-string. The credentials are
# percent-encoded so that characters such as "@", ":" or "/" inside the user
# name or password cannot be mistaken for URL delimiters; plain alphanumeric
# credentials are left unchanged by quote().
conn_string = (
    f"mysql+mysqldb://{quote(user, safe='')}:{quote(password, safe='')}"
    f"@{db_hostname}/{db_name}?charset=utf8mb4"
)
engine = create_engine(conn_string)

In [None]:
def remove_between_delimiters(text):
    """Strip ``<...>`` markup and replace every non-alphanumeric character with a space.

    Two substitutions are applied in order:

    1. Each ``<...>`` span (the delimiters and everything between them) is
       replaced by a single space. A space is used rather than the empty
       string so that words separated only by a tag, e.g. ``erosion<br>weed``,
       are not fused into one token.
    2. Every remaining non-word character and every underscore is replaced by
       a space.

    Whitespace is not collapsed or trimmed here, so the result may contain
    runs of spaces.

    Parameters:
        text (str): the string to clean.

    Returns:
        str: the cleaned string.
    """
    text = re.sub(r'<[^>]+>', ' ', text)
    text = re.sub(r'[\W_]', ' ', text)
    return text

def remove_special_characters(text, remove_digits = False):
    """Replace every character that is not an ASCII letter, digit or whitespace with a space.

    Parameters:
        text (str): the string to clean.
        remove_digits (bool): when True, digits are replaced by a space as well.

    Returns:
        str: the cleaned string; each removed character becomes one space, so
        the length of the string is unchanged.
    """
    # The upper-case range must be A-Z: the range A-z also spans the ASCII
    # characters [ \ ] ^ _ and ` that sit between 'Z' and 'a', which would
    # then be kept instead of removed.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, ' ', text)

In [None]:
# The statement text is unchanged; it is only split over several lines so the
# list of excluded tableIds can be read and diffed.
query = (
    "SELECT i.tableId, i.rowIndex, i.issue_pri, i.issue_sec "
    "FROM issues i where tableId NOT IN ("
    "'02db9f91-572a-44af-9858-4add101353c1',"
    "'03bfc26a-c6d0-4761-b8f5-47acf2290d02',"
    "'082134c0-6a4b-425b-a4ae-e79acd7316cb',"
    "'0d10a967-88d6-42e5-9bd7-309f24022b5f',"
    "'333e1e53-8897-41fa-acbd-86be8afb31c7',"
    "'35bd2caf-562c-4d14-a5d6-373f168b4acb',"
    "'397db969-9996-4d9e-bb05-6df69d0fe4a4',"
    "'417546c4-dacf-4c12-ae75-4dc4e656e198',"
    "'491c36c1-82d4-46ae-a684-470915a5659b',"
    "'60b3993d-7075-4790-8519-ba8193579754',"
    "'64a7ba33-ceee-4593-87a3-8f08dd46c8f4',"
    "'67691780-af41-414b-a0c2-aa33a3442cdc',"
    "'6a2f1370-1cd5-4ebb-a4bd-a1fe9d5a516a',"
    "'6b437f67-967b-4ef6-bd28-5ac8d39138e4',"
    "'77cc0b8d-8244-4622-8d9d-a56daf6069e8',"
    "'8bb683d9-f7ee-4a54-ad3d-dddc61ccdfcf',"
    "'9476acc2-294a-4cd6-a952-8274aedb645a',"
    "'a6623233-9c9f-436b-ad11-0987ab3825e7',"
    "'c04807de-2df1-4d26-9352-70d3cb6cb10b',"
    "'cb197d7e-3ef6-4ee0-93d1-504c7286b580',"
    "'f143c6b8-cf77-41c1-88b2-e7c97ba657c1',"
    "'f2ebd484-4ec2-4481-907d-17334ca4657f',"
    "'f4db9fc5-3a73-499a-ab1e-ab643530ea99',"
    "'fdb3d057-943a-4fab-99ac-1f4eed471512',"
    "'44a33e5f-d99e-48ef-ad56-bbb516ec8796',"
    "'bfafbfd0-8bb5-4283-8f5e-dd7cbcec480c'"
    ");"
)

# Only the read needs the connection; the cleaning below runs after it has
# been released.
with engine.connect() as conn:
    df = pd.read_sql(query, conn)

# Trim surrounding whitespace from every string cell of every column.
# Series.map is applied column by column because DataFrame.applymap is
# deprecated in recent pandas releases.
for column in df.columns:
    df[column] = df[column].map(
        lambda value: value.strip() if isinstance(value, str) else value
    )

# Normalise both issue columns the same way. Results are assigned back to the
# frame instead of using `df[col].fillna(..., inplace=True)`, which acts on a
# temporary object and does not update `df` under pandas Copy-on-Write.
for column in ('issue_pri', 'issue_sec'):
    cleaned = (
        df[column]
        .str.lower()
        .fillna('')  # missing text becomes an empty string
        .apply(remove_between_delimiters)
        .apply(remove_special_characters, remove_digits=True)
        .str.strip()
        .str.replace(r'\s+', ' ', regex=True)  # collapse runs of whitespace
    )
    # Fold the plural forms into the singular spelling (plain substring
    # replacement, applied in this order)
    for plural, singular in (
        ('weeds', 'weed'),
        ('wetlands', 'wetland'),
        ('plants', 'plant'),
        ('fragments', 'fragment'),
    ):
        cleaned = cleaned.str.replace(plural, singular, regex=False)
    df[column] = cleaned

In [None]:
sub_vec_lst = [
    'Erosion',
    'Coarse Fragment',
    'Subsidence',
    'Topsoil Admixing',
    'Compaction',
    'Topsoil Loss',
    'Weed',
    'Rare Plant',
    'Invasive Plant',
    'Vegetation Establishment',
    'Vegetation Re establishment',
    'Wetland',
    'Riparian Vegetation Establishment',
    'Riparian Vegetation Re-establishment',
]

# One whole-word pattern per label, compiled once instead of on every row.
# The issue text searched below has already had every non-alphanumeric
# character replaced by a space, so each label is normalised the same way
# before it becomes a pattern; otherwise a hyphenated label such as
# 'Riparian Vegetation Re-establishment' could never match. The label itself
# (with its original spelling) is what gets recorded.
# After normalisation a label holds only word characters and spaces, so no
# regex escaping is needed.
sub_vec_patterns = [
    (label, re.compile(r'\b' + re.sub(r'[\W_]+', ' ', label.lower()).strip() + r'\b'))
    for label in sub_vec_lst
]

# sub_vec_keywords: per row, a list of label lists (shape depends on which
#   column produced the matches, see below).
# vec_keyword_count: per row, one 0/1 flag per entry of sub_vec_lst.
sub_vec_keywords = []
vec_keyword_count = []

for row in df.itertuples():
    # Primary issue text: one slot per label, [label] on a match, [] otherwise
    sub_vec_1 = [
        [label] if pattern.search(row.issue_pri) else []
        for label, pattern in sub_vec_patterns
    ]
    issue_keyword_count = [len(slot) for slot in sub_vec_1]

    # Fall back to the secondary issue text only when the primary text matched
    # nothing. In this case all matches are collected in a single inner list.
    if sum(issue_keyword_count) == 0:
        secondary_hits = [
            bool(pattern.search(row.issue_sec)) for _, pattern in sub_vec_patterns
        ]
        issue_keyword_count = [int(hit) for hit in secondary_hits]
        sub_vec_1 = [[
            label
            for (label, _), hit in zip(sub_vec_patterns, secondary_hits)
            if hit
        ]]

    vec_keyword_count.append(issue_keyword_count)
    sub_vec_keywords.append(sub_vec_1)

In [None]:
# Step 1: for every row, keep only the non-empty label lists
remove_empty = [
    [hits for hits in row_hits if hits != []]
    for row_hits in sub_vec_keywords
]
# Step 2: merge what is left into one flat list of labels per row
final_lst = [
    [label for hits in kept for label in hits]
    for kept in remove_empty
]

In [None]:
# Attach the flattened label lists as a new column; final_lst holds one list
# per row of df, in row order
df['sub_vec'] = final_lst

In [494]:
def populate_issues_subvec(data):
    """Write each row's ``sub_vec`` list to ``issues.subvec_std`` as a JSON string.

    Rows whose ``sub_vec`` list is empty are skipped. The target row is
    identified by ``tableId`` and ``rowIndex``. All updates are sent in one
    transaction on the module-level ``engine``: they are committed together,
    or rolled back together if any statement fails.

    Parameters:
        data (pandas.DataFrame): must provide the columns ``tableId``,
            ``rowIndex`` and ``sub_vec`` (a list per row).

    Returns:
        None. Prints "Done" when finished.
    """
    # text() with named parameters is accepted by Connection.execute() across
    # SQLAlchemy versions; passing a bare SQL string is not.
    update_issues_subvec_query = text(
        'UPDATE issues SET subvec_std = :subvec_std '
        'WHERE tableId = :tableId and rowIndex = :rowIndex;'
    )
    updates = [
        {
            'subvec_std': json.dumps(row.sub_vec),
            'tableId': row.tableId,
            'rowIndex': row.rowIndex,
        }
        for row in data.itertuples()
        if len(row.sub_vec) > 0
    ]
    # Skip the database entirely when there is nothing to write; executing the
    # statement with an empty parameter list would run it without bind values.
    if updates:
        # engine.begin() commits on success and rolls back on error, instead
        # of relying on implicit autocommit for each single UPDATE.
        with engine.begin() as conn:
            conn.execute(update_issues_subvec_query, updates)
    print("Done")

In [495]:
# Write the sub_vec lists of df back to the issues table
populate_issues_subvec(df)

Done


In [496]:
query = "SELECT * FROM issues_parsed;"

# Only the read needs the connection; the cleaning below runs after it has
# been released.
with engine.connect() as conn:
    df = pd.read_sql(query, conn)

# Trim surrounding whitespace from every string cell of every column.
# Series.map is applied column by column because DataFrame.applymap is
# deprecated in recent pandas releases.
for column in df.columns:
    df[column] = df[column].map(
        lambda value: value.strip() if isinstance(value, str) else value
    )

# Normalise the parsed issue text. The result is assigned back to the frame
# instead of using `df[col].fillna(..., inplace=True)`, which acts on a
# temporary object and does not update `df` under pandas Copy-on-Write.
issue_text = (
    df['issue_parsed']
    .str.lower()
    .fillna('')  # missing text becomes an empty string
    .apply(remove_between_delimiters)
    .apply(remove_special_characters, remove_digits=True)
    .str.strip()
    .str.replace(r'\s+', ' ', regex=True)  # collapse runs of whitespace
)
# Fold the plural forms into the singular spelling (plain substring
# replacement, applied in this order)
for plural, singular in (
    ('weeds', 'weed'),
    ('wetlands', 'wetland'),
    ('plants', 'plant'),
    ('fragments', 'fragment'),
):
    issue_text = issue_text.str.replace(plural, singular, regex=False)
df['issue_parsed'] = issue_text

In [497]:
# One whole-word pattern per label of sub_vec_lst, compiled once instead of on
# every row. The issue_parsed text has already had every non-alphanumeric
# character replaced by a space, so each label is normalised the same way
# before it becomes a pattern; otherwise a hyphenated label could never match.
# The label itself (with its original spelling) is what gets recorded.
parsed_sub_vec_patterns = [
    (label, re.compile(r'\b' + re.sub(r'[\W_]+', ' ', label.lower()).strip() + r'\b'))
    for label in sub_vec_lst
]

# Per row: the labels found in issue_parsed, in sub_vec_lst order
sub_vec_keywords1 = [
    [
        label
        for label, pattern in parsed_sub_vec_patterns
        if pattern.search(row.issue_parsed)
    ]
    for row in df.itertuples()
]

In [498]:
# Attach the matched labels as a new column; sub_vec_keywords1 holds one list
# per row of df, in row order
df['sub_vec'] = sub_vec_keywords1

In [500]:
def populate_issues_parsed_subvec(data):
    """Write each row's ``sub_vec`` list to ``issues_parsed.subvec_std`` as a JSON string.

    Rows whose ``sub_vec`` list is empty are skipped. The target row is
    identified by ``tableId``, ``rowIndex`` and ``rowCounter``. All updates
    are sent in one transaction on the module-level ``engine``: they are
    committed together, or rolled back together if any statement fails.

    Parameters:
        data (pandas.DataFrame): must provide the columns ``tableId``,
            ``rowIndex``, ``rowCounter`` and ``sub_vec`` (a list per row).

    Returns:
        None. Prints "Done" when finished.
    """
    # text() with named parameters is accepted by Connection.execute() across
    # SQLAlchemy versions; passing a bare SQL string is not.
    update_issues_subvec_query = text(
        'UPDATE issues_parsed SET subvec_std = :subvec_std '
        'WHERE tableId = :tableId AND rowIndex = :rowIndex AND rowCounter = :rowCounter;'
    )
    updates = [
        {
            'subvec_std': json.dumps(row.sub_vec),
            'tableId': row.tableId,
            'rowIndex': row.rowIndex,
            'rowCounter': row.rowCounter,
        }
        for row in data.itertuples()
        if len(row.sub_vec) > 0
    ]
    # Skip the database entirely when there is nothing to write; executing the
    # statement with an empty parameter list would run it without bind values.
    if updates:
        # engine.begin() commits on success and rolls back on error, instead
        # of relying on implicit autocommit for each single UPDATE.
        with engine.begin() as conn:
            conn.execute(update_issues_subvec_query, updates)
    print("Done")

In [501]:
# Write the sub_vec lists of df back to the issues_parsed table
populate_issues_parsed_subvec(df)

Done
