In [47]:
from sqlalchemy import create_engine
import pandas as pd
import os
import re
import json
from ast import literal_eval
import numpy

In [48]:
# Importing environmental variables library that reads from the .env file
from dotenv import load_dotenv

# Loading key-value pairs from the .env file into the OS environment
load_dotenv()
# Reading the key-value pairs from the OS environment
user = os.getenv("DB_USER")
password = os.getenv("DB_PASS")
db_hostname = os.getenv("DB_HOST")
db_name = os.getenv("DB_DATABASE")

# Using those variables in the connection string using "F" strings
conn_string = f"mysql+mysqldb://{user}:{password}@{db_hostname}/{db_name}?charset=utf8mb4"
engine = create_engine(conn_string)

In [49]:
def remove_between_delimiters(text):
    """Remove the text between two delimiters < and > and trim underscores from end of strings"""
    text = re.sub('<[^>]+>', '', text)
    text = re.sub(r'[\W_]', ' ', text)
    return text

def remove_special_characters(text, remove_digits = False):
    """Removing non-alphanumeric characters and symbols or even ocasionally numeric characters"""
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, ' ', text)
    return text

In [50]:
text = 'June 5, 2017 - Vegetation cover on the ROW is 80 to 100% compared to the off ROW control.'
text = remove_special_characters(text, remove_digits = True)
text = re.sub(' +', ' ', text)
text

'June Vegetation cover on the ROW is to compared to the off ROW control '

In [51]:
query = "SELECT i.tableId, i.rowIndex, i.issue_pri, i.issue_sec FROM issues i where tableId NOT IN ('02db9f91-572a-44af-9858-4add101353c1','03bfc26a-c6d0-4761-b8f5-47acf2290d02','082134c0-6a4b-425b-a4ae-e79acd7316cb','0d10a967-88d6-42e5-9bd7-309f24022b5f','333e1e53-8897-41fa-acbd-86be8afb31c7','35bd2caf-562c-4d14-a5d6-373f168b4acb','397db969-9996-4d9e-bb05-6df69d0fe4a4','417546c4-dacf-4c12-ae75-4dc4e656e198','491c36c1-82d4-46ae-a684-470915a5659b','60b3993d-7075-4790-8519-ba8193579754','64a7ba33-ceee-4593-87a3-8f08dd46c8f4','67691780-af41-414b-a0c2-aa33a3442cdc','6a2f1370-1cd5-4ebb-a4bd-a1fe9d5a516a','6b437f67-967b-4ef6-bd28-5ac8d39138e4','77cc0b8d-8244-4622-8d9d-a56daf6069e8','8bb683d9-f7ee-4a54-ad3d-dddc61ccdfcf','9476acc2-294a-4cd6-a952-8274aedb645a','a6623233-9c9f-436b-ad11-0987ab3825e7','c04807de-2df1-4d26-9352-70d3cb6cb10b','cb197d7e-3ef6-4ee0-93d1-504c7286b580','f143c6b8-cf77-41c1-88b2-e7c97ba657c1','f2ebd484-4ec2-4481-907d-17334ca4657f','f4db9fc5-3a73-499a-ab1e-ab643530ea99','fdb3d057-943a-4fab-99ac-1f4eed471512','44a33e5f-d99e-48ef-ad56-bbb516ec8796','bfafbfd0-8bb5-4283-8f5e-dd7cbcec480c', '3e9e6cdb-f812-4832-b69c-b8ec0396d585');"
with engine.connect() as conn:
    df = pd.read_sql(query, conn)
    df1 = df.copy()
    df.issue_pri = df.issue_pri.str.lower()
    df.issue_sec = df.issue_sec.str.lower()
    df['issue_pri'].fillna('', inplace = True)
    df['issue_sec'].fillna('', inplace = True)
    df['issue_pri'] = df['issue_pri'].apply(remove_between_delimiters)
    df['issue_sec'] = df['issue_sec'].apply(remove_between_delimiters)
    df['issue_pri'] = df['issue_pri'].apply(remove_special_characters, remove_digits = True)
    df['issue_sec'] = df['issue_sec'].apply(remove_special_characters, remove_digits = True)
    df = df.applymap(lambda x: x.strip() if type(x)==str else x) # delete whitespaces
    df.issue_pri = df.issue_pri.replace('\s+', ' ', regex=True) # delete extra space between text strings
    df.issue_sec = df.issue_sec.replace('\s+', ' ', regex=True)
    df.issue_pri = df.issue_pri.str.replace('weeds', 'weed')
    df.issue_pri = df.issue_pri.str.replace('plants', 'plant')
    df.issue_pri = df.issue_pri.str.replace('fragments', 'fragment')
    df.issue_sec = df.issue_sec.str.replace('weeds', 'weed')
    df.issue_sec = df.issue_sec.str.replace('plants', 'plant')
    df.issue_sec = df.issue_sec.str.replace('fragments', 'fragment')

In [52]:
subvec_dict = {'Erosion': ['Erosion'], 'Coarse Fragment' : ['Coarse Fragment'], 'Subsidence': ['Subsidence'], 'Topsoil Admixing': ['Topsoil Admixing'], 'Compaction': ['Compaction'],'Topsoil Loss': ['Topsoil Loss'],'Weed': ['Weed'] ,'Rare Plant': ['Rare Plant'],'Invasive Plant': ['Invasive Plant', 'Herbicide', 'Chamomile', 'Thistle', 'Toadflax', 'Hawksbeard', 'Blite', 'Tansy', 'Phragmites', 'Hawkweed', 'Buttercup', 'Daisy', 'Cockle'], 'Vegetation Establishment': ['Vegetation Establishment', 'Vegetation Re establishment', 'Vegetation Established', 'Willows', 'Spruce', 'Pine', 'Seeded', 'Vegetation Cover'], 'Riparian Vegetation Re-establishment': ['Riparian Vegetation Establishment','Riparian Vegetation Re establishment', 'Wetland Function', 'Wetland Functionality', 'Swamp', 'Bog', 'Fen']}

In [53]:
sub_vec_keywords = []
sub_vec_keyword_count = []

for index, row in enumerate(df.itertuples()):
    issue_keyword_count = []
    for key, value in subvec_dict.items():
        counter = 0
        keyword = []
        for sub_vec in value:
            if re.search(r'\b' + sub_vec.lower() + r'\b', row.issue_pri):
                keyword.append(sub_vec)
                counter += 1
        issue_keyword_count.append(counter)
        sub_vec_keywords.append(keyword)
        
    if sum(issue_keyword_count) == 0:
        issue_keyword_count = []
        keyword = []
        for key, value in subvec_dict.items():
            idx = 0        
            for sub_vec in value:
                if re.search(r'\b' + sub_vec.lower() + r'\b', row.issue_sec):
                    keyword.append(sub_vec)
                    idx += 1
            issue_keyword_count.append(idx)
        sub_vec_keywords.append(keyword)
            
    sub_vec_keyword_count.append(issue_keyword_count)

In [54]:
# Create the pandas DataFrame  
df2 = pd.DataFrame(sub_vec_keyword_count, columns = subvec_dict.keys()) 
df2['threshold'] = 0

In [55]:
s = np.where(df2.gt(df2['threshold'],0), ['Erosion, ', 'Coarse Fragment, ', 'Subsidence, ', 'Topsoil Admixing, ', 'Compaction, ', 'Topsoil Loss, ', 'Weed, ', 'Rare Plant, ','Invasive Plant, ', 'Vegetation Establishment, ', 'Riparian Vegetation Re-establishment, ', ''], '')
sub_vecs = pd.Series([''.join(x).strip(', ') for x in s], name = "Sub_VECs")
df3 = sub_vecs.to_frame()

In [56]:
df4 = pd.concat([df1, df2, df3], axis = 1)

In [58]:
df4 = df4.assign(Sub_VECs=df4.Sub_VECs.str.split(", ")).explode('Sub_VECs')
#df4.assign(Book=df.Book.str.split(",")).explode('Book')

In [63]:
query = "SELECT tableId, rowIndex, rowCounter, issue_parsed FROM issues_parsed;"
with engine.connect() as conn:
    issue_parsed_df = pd.read_sql(query, conn)
    issue_parsed_df_copy = issue_parsed_df.copy()
    issue_parsed_df.issue_parsed = issue_parsed_df.issue_parsed.str.lower()
    issue_parsed_df['issue_parsed'].fillna('', inplace = True)
    issue_parsed_df['issue_parsed'] = issue_parsed_df['issue_parsed'].apply(remove_between_delimiters)
    issue_parsed_df['issue_parsed'] = issue_parsed_df['issue_parsed'].apply(remove_special_characters, remove_digits = True)
    issue_parsed_df = issue_parsed_df.applymap(lambda x: x.strip() if type(x)==str else x) # delete whitespaces
    issue_parsed_df.issue_parsed = issue_parsed_df.issue_parsed.replace('\s+', ' ', regex=True) # delete extra space between text strings
    issue_parsed_df.issue_parsed = issue_parsed_df.issue_parsed.str.replace('weeds', 'weed')
    issue_parsed_df.issue_parsed = issue_parsed_df.issue_parsed.str.replace('plants', 'plant')
    issue_parsed_df.issue_parsed = issue_parsed_df.issue_parsed.str.replace('fragments', 'fragment')

In [65]:
sub_vec_keyword_count_ip = []
for index, row in enumerate(issue_parsed_df.itertuples()):
    issue_keyword_count_ip = []
    for key, value in subvec_dict.items():
        counter = 0
        for sub_vec in value:
            if re.search(r'\b' + sub_vec.lower() + r'\b', row.issue_parsed):
                counter += 1
        issue_keyword_count_ip.append(counter)           
    sub_vec_keyword_count_ip.append(issue_keyword_count_ip)

In [66]:
# Create the pandas DataFrame  
sub_vec_count_ip_df = pd.DataFrame(sub_vec_keyword_count_ip, columns = subvec_dict.keys()) 
sub_vec_count_ip_df['threshold'] = 0

In [67]:
s_issue_parsed = np.where(sub_vec_count_ip_df.gt(sub_vec_count_ip_df['threshold'],0), ['Erosion, ', 'Coarse Fragment, ', 'Subsidence, ', 'Topsoil Admixing, ', 'Compaction, ', 'Topsoil Loss, ', 'Weed, ', 'Rare Plant, ','Invasive Plant, ', 'Vegetation Establishment, ', 'Riparian Vegetation Re-establishment, ', ''], '')
sub_vec_issue_parsed = pd.Series([''.join(x).strip(', ') for x in s_issue_parsed], name = "Sub_VECs")
df3_issue_parsed = sub_vec_issue_parsed.to_frame()

In [70]:
df4_issue_parsed = pd.concat([issue_parsed_df_copy, sub_vec_count_ip_df, df3_issue_parsed], axis = 1)
df4_issue_parsed = df4_issue_parsed.assign(Sub_VECs=df4_issue_parsed.Sub_VECs.str.split(", ")).explode('Sub_VECs')

In [71]:
print(len(df4), len(df4_issue_parsed), len(df4) + len(df4_issue_parsed))

3367 11616 14983


In [79]:
def read_subvec_data():
    final_df = df4.append(df4_issue_parsed, ignore_index=True, sort=False)
    final_df['Sub_VECs'].replace('', np.nan, inplace = True)
    final_df.dropna(subset=['Sub_VECs'], inplace = True)
    return final_df.where(pd.notnull(final_df), None)

In [80]:
def populate_sub_vec_table():
    insert_sub_vec_query = 'INSERT INTO sub_vecs (tableId, rowIndex, rowCounter, sub_vec) VALUES (%s, %s, %s, %s);'
    data = read_subvec_data()
    with engine.connect() as conn:
        for row in data.itertuples():
            conn.execute(insert_sub_vec_query, (row.tableId, row.rowIndex, row.rowCounter, row.Sub_VECs))
    print("Done")

In [81]:
populate_sub_vec_table()

Done
