In [2]:
import os
import re
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine

In [None]:
# Load the parsed issues export; na_filter=False turns off missing-value
# detection, so empty cells stay as empty strings instead of NaN.
df = pd.read_csv('parsed_issues.csv', encoding='cp1252', na_filter=False)

# Reserve two blank helper columns at positions 2 and 3; later cells fill them.
for position, column_name in enumerate(('tableId_rowIndex', 'rowCounter'), start=2):
    df.insert(position, column_name, '')

In [None]:
# Build a per-row grouping key by appending the row index (as text) to the table id.
row_index_text = df['rowIndex'].astype(str)
df['tableId_rowIndex'] = df['tableId'] + row_index_text

In [None]:
# Number the rows that share the same tableId_rowIndex key, starting at 1,
# so each sub-issue split out of one original issue gets its own counter.
position_in_group = df.groupby('tableId_rowIndex').cumcount()
df['rowCounter'] = position_in_group + 1

In [None]:
# The grouping key was only needed to compute rowCounter; remove it again.
df = df.drop(columns=['tableId_rowIndex'])
print(df.shape[0])

In [None]:
# Replace NaN values with empty strings.
# NOTE(review): the CSV was read with na_filter=False, so there may be no NaN
# left at this point — confirm whether this step is still needed.
df = df.replace(np.nan, '', regex=True)

In [None]:
# Save the cleaned frame without the index, encoded as UTF-8 with a byte-order mark.
clean_output_path = 'issue_parsed_clean.csv'
df.to_csv(clean_output_path, encoding='utf-8-sig', index=False)

Preparing Test Data for Running BERT Model

In [3]:
# Load key-value pairs from the .env file into the OS environment.
load_dotenv()

# Read the connection settings and fail early when one is absent; otherwise
# os.getenv() returns None and the literal text "None" ends up in the URL.
db_settings = {key: os.getenv(key) for key in ("DB_USER", "DB_PASS", "DB_HOST", "DB_DATABASE")}
missing_settings = [key for key, value in db_settings.items() if value is None]
if missing_settings:
    raise RuntimeError(
        "Missing database settings in the environment/.env file: " + ", ".join(missing_settings)
    )

user = db_settings["DB_USER"]
password = db_settings["DB_PASS"]
db_hostname = db_settings["DB_HOST"]
db_name = db_settings["DB_DATABASE"]

# Percent-encode the credentials: characters such as '@', ':' or '/' in a
# user name or password would otherwise be parsed as URL delimiters.
conn_string = (
    f"mysql+mysqldb://{quote_plus(user)}:{quote_plus(password)}"
    f"@{db_hostname}/{db_name}?charset=utf8mb4"
)
engine = create_engine(conn_string)

In [5]:
# Fetch the parsed issues from the database; the connection is closed when
# the with-block exits. The frame is shown as the cell output.
query = "SELECT tableId, rowIndex, rowCounter, issue_parsed FROM pcmr.issues_parsed;"
with engine.connect() as db_connection:
    df = pd.read_sql(sql=query, con=db_connection)
df

Unnamed: 0,tableId,rowIndex,rowCounter,issue_parsed
0,02db9f91-572a-44af-9858-4add101353c1,1,1,"June 22, 2016 – Intermittent weeds observed, r..."
1,02db9f91-572a-44af-9858-4add101353c1,1,2,"July 13, 2016 - Vegetation continues to establ..."
2,02db9f91-572a-44af-9858-4add101353c1,1,3,"July 13, 2016 - Cross drains are vegetated an..."
3,02db9f91-572a-44af-9858-4add101353c1,1,4,"July 13, 2016 - Wetlands stable and consiste..."
4,02db9f91-572a-44af-9858-4add101353c1,1,5,"July 13, 2016 - No erosion or subsidence issues."
...,...,...,...,...
9719,fdb3d057-943a-4fab-99ac-1f4eed471512,55,2,"Aug 18, 2017 - Vegetation established at 65-80..."
9720,fdb3d057-943a-4fab-99ac-1f4eed471512,55,3,"Aug 18, 2017 – Intermittent perennial sow thi..."
9721,fdb3d057-943a-4fab-99ac-1f4eed471512,55,4,"Aug 23, 2017 – Intermittent herbicide applicat..."
9722,fdb3d057-943a-4fab-99ac-1f4eed471512,56,1,.


In [6]:
# Rename the text column to 'status_txt' and preview the first five rows.
df = df.rename(columns={'issue_parsed': 'status_txt'})
df.head()

Unnamed: 0,tableId,rowIndex,rowCounter,status_txt
0,02db9f91-572a-44af-9858-4add101353c1,1,1,"June 22, 2016 – Intermittent weeds observed, r..."
1,02db9f91-572a-44af-9858-4add101353c1,1,2,"July 13, 2016 - Vegetation continues to establ..."
2,02db9f91-572a-44af-9858-4add101353c1,1,3,"July 13, 2016 - Cross drains are vegetated an..."
3,02db9f91-572a-44af-9858-4add101353c1,1,4,"July 13, 2016 - Wetlands stable and consiste..."
4,02db9f91-572a-44af-9858-4add101353c1,1,5,"July 13, 2016 - No erosion or subsidence issues."


In [7]:
# Export the renamed frame without the index, encoded as UTF-8 with a byte-order mark.
bert_input_path = r'G:\Post Construction\BERT_workshop_full\BERT_workshop\demonstrate\data\testv1.csv'
df.to_csv(bert_input_path, encoding='utf-8-sig', index=False)

In [None]:
# Load the existing test set from disk.
existing_test_path = r'C:\Users\t1nipun\Desktop\BERT_workshop_full\BERT_workshop\demonstrate\data\test.csv'
test_df = pd.read_csv(existing_test_path, encoding='utf-8-sig')

In [None]:
# Stack the existing test rows and the rows in df; sort=False keeps the
# column order as encountered. Index labels are not reset by this call.
frames_to_stack = [test_df, df]
test_data = pd.concat(frames_to_stack, sort=False)

In [None]:
# Write the combined test set without the index, as UTF-8 with a byte-order mark.
# This is the same test.csv path that test_df was read from, so the file is overwritten.
combined_test_path = r'C:\Users\t1nipun\Desktop\BERT_workshop_full\BERT_workshop\demonstrate\data\test.csv'
test_data.to_csv(combined_test_path, encoding='utf-8-sig', index=False)

Preparing Data to Populate the SQL Database

In [2]:
# Load the BERT prediction results and show them as the cell output.
bert_results_path = r'C:\Users\t1nipun\Desktop\BERT_workshop_full\BERT_workshop\demonstrate\test_data_BERT_results.csv'
test_df_statusLabels = pd.read_csv(bert_results_path, encoding='utf-8-sig')
test_df_statusLabels

Unnamed: 0,sentence,pred_prob,pred_status_code,pred_status_label,unresolved_prob,resolved_prob
0,Repair and monitor in 2018.,[0.9929403 0.00705972],0,Unresolved,0.992940,0.007060
1,Repair and monitor in 2018.,[0.98791146 0.01208852],0,Unresolved,0.987911,0.012089
2,Repair and monitor in 2018.,[0.99382186 0.00617815],0,Unresolved,0.993822,0.006178
3,Monitor in 2018 and repair erosion,[0.99010336 0.00989671],0,Unresolved,0.990103,0.009897
4,<s>None </s>–<s> pipeline was </s>directionall...,[0.00375306 0.9962469 ],1,Resolved,0.003753,0.996247
...,...,...,...,...,...,...
9713,No issues noted.,[0.00101157 0.99898845],1,Resolved,0.001012,0.998988
9714,"June 28, 2017 – Intermittent perennial sow thi...",[0.25778517 0.74221486],1,Resolved,0.257785,0.742215
9715,"Aug 18, 2017 - Vegetation established at 65-80...",[0.04734606 0.952654 ],1,Resolved,0.047346,0.952654
9716,.,[0.01824657 0.98175347],1,Resolved,0.018247,0.981753


In [4]:
# Reload the test set from disk.
test_set_path = r'C:\Users\t1nipun\Desktop\BERT_workshop_full\BERT_workshop\demonstrate\data\test.csv'
test_data = pd.read_csv(test_set_path, encoding='utf-8-sig')

In [5]:
# Copy the predicted label and both class probabilities onto the test set.
# Column assignment from a Series aligns on index labels, so rows are paired
# by their index value.
# NOTE(review): confirm both files hold the same number of rows in the same
# order — any test_data row without a matching prediction index gets NaN.
prediction_columns = (
    ('status', 'pred_status_label'),
    ('unresolved_prob', 'unresolved_prob'),
    ('resolved_prob', 'resolved_prob'),
)
for target_column, source_column in prediction_columns:
    test_data[target_column] = test_df_statusLabels[source_column]

In [6]:
# Report how many rows the labelled test set has.
print(test_data.shape[0])

9718


In [10]:
# Load the training set and report its row count.
train_set_path = r'C:\Users\t1nipun\Desktop\BERT_workshop_full\BERT_workshop\demonstrate\data\train.csv'
train_data = pd.read_csv(train_set_path, encoding='utf-8-sig')
print(train_data.shape[0])

1783


In [12]:
# Load the predicted validation set (this file is read as cp1252, unlike the
# other inputs) and report its row count.
validation_set_path = r'C:\Users\t1nipun\Desktop\BERT_workshop_full\BERT_workshop\demonstrate\predicted_valSet.csv'
val_data = pd.read_csv(validation_set_path, encoding='cp1252')
print(val_data.shape[0])

203


In [13]:
# Stack test, train and validation rows into one frame (column order is not
# sorted) and report the combined row count.
labelled_frames = [test_data, train_data, val_data]
final_data = pd.concat(labelled_frames, sort=False)
print(final_data.shape[0])

11704


In [15]:
# Save the combined labelled data without the index, as UTF-8 with a byte-order mark.
status_labels_path = 'status_labels.csv'
final_data.to_csv(status_labels_path, encoding='utf-8-sig', index=False)