In [None]:
import os
import re
from urllib.parse import quote

import numpy as np
import pandas as pd
from sqlalchemy import create_engine

In [None]:
# Load the parsed issues; na_filter=False keeps empty cells as '' instead of NaN.
df = pd.read_csv(
    'parsed_issues.csv',
    encoding='cp1252',
    na_filter=False,
)
# Reserve two placeholder columns (filled in by later cells) at positions 2 and 3.
df.insert(loc=2, column='tableId_rowIndex', value='')
df.insert(loc=3, column='rowCounter', value='')

In [None]:
# Composite key: tableId immediately followed by the row index rendered as text.
row_index_text = df['rowIndex'].astype(str)
df['tableId_rowIndex'] = df['tableId'] + row_index_text

In [None]:
# Number the rows that share a composite key 1, 2, 3, ... in order of appearance.
rows_by_key = df.groupby('tableId_rowIndex')
df['rowCounter'] = rows_by_key.cumcount().add(1)

In [None]:
# The composite key was only needed to compute rowCounter; remove it again.
df = df.drop(columns=['tableId_rowIndex'])
print(df.shape[0])

In [None]:
# Swap any NaN cells for empty strings before exporting.
df = df.replace(to_replace=np.nan, value='', regex=True)

In [None]:
# Persist the cleaned frame (no index column) as UTF-8 with a BOM.
df.to_csv(
    path_or_buf='issue_parsed_clean.csv',
    encoding='utf-8-sig',
    index=False,
)

Preparing Test Data for Running BERT Model

In [None]:
# Importing environmental variables library that reads from the .env file
from dotenv import load_dotenv

# Loading key-value pairs from the .env file into the OS environment
load_dotenv()

# Reading the key-value pairs from the OS environment
user = os.getenv("DB_USER")
password = os.getenv("DB_PASS")
db_hostname = os.getenv("DB_HOST")
db_name = os.getenv("DB_DATABASE")

# Fail fast with a clear message: os.getenv returns None for an unset variable,
# which the f-string below would otherwise render as the literal text "None".
_db_settings = {
    "DB_USER": user,
    "DB_PASS": password,
    "DB_HOST": db_hostname,
    "DB_DATABASE": db_name,
}
_missing_settings = [name for name, value in _db_settings.items() if value is None]
if _missing_settings:
    raise RuntimeError(
        "Missing database settings in the environment/.env file: "
        + ", ".join(_missing_settings)
    )

# Percent-encode the credentials so characters such as '@', ':' or '/' inside
# the user name or password cannot be misread as URL delimiters. Values made of
# letters, digits and '_.-~' are left unchanged by quote().
conn_string = (
    f"mysql+mysqldb://{quote(user, safe='')}:{quote(password, safe='')}"
    f"@{db_hostname}/{db_name}?charset=utf8mb4"
)
engine = create_engine(conn_string)

In [None]:
# Pull the parsed issues (with their row counters) back out of the database.
query = "SELECT tableId, rowIndex, rowCounter, issue_parsed FROM pcmr.issues_parsed;"
with engine.connect() as conn:
    df = pd.read_sql(sql=query, con=conn)
df.head(n=5)

In [None]:
# Rename the text column to 'status_txt' (modifies df in place).
column_renames = {'issue_parsed': 'status_txt'}
df.rename(columns=column_renames, inplace=True)
df.head(n=5)

In [None]:
# Existing BERT test set (absolute, machine-specific path).
test_csv_path = r'C:\Users\t1nipun\Desktop\BERT_workshop_full\BERT_workshop\demonstrate\data\test.csv'
test_df = pd.read_csv(test_csv_path, encoding='utf-8-sig')

In [None]:
# Append the database rows to the existing test set.
# ignore_index=True gives the result a fresh 0..N-1 index. Without it both
# inputs keep their own labels (0..n-1 and 0..m-1), and a later column
# assignment from another frame -- which aligns on index labels, not on row
# position -- would hand the appended rows the values of the first m rows.
test_data = pd.concat([test_df, df], sort=False, ignore_index=True)

In [None]:
# Write the combined test set back to disk (no index column, UTF-8 with BOM).
# NOTE(review): this is the same file that was read into test_df, so each run
# of the notebook appends the database rows to it again -- confirm intended.
test_csv_path = r'C:\Users\t1nipun\Desktop\BERT_workshop_full\BERT_workshop\demonstrate\data\test.csv'
test_data.to_csv(test_csv_path, encoding='utf-8-sig', index=False)

Preparing Data to Populate the SQL Database

In [None]:
# Load the BERT output file, which carries the 'pred_status_label' column used below.
bert_results_path = r'C:\Users\t1nipun\Desktop\BERT_workshop_full\BERT_workshop\demonstrate\test_data_BERT_results.csv'
test_df_statusLabels = pd.read_csv(bert_results_path, encoding='utf-8-sig')
test_df_statusLabels

In [None]:
# Attach the predicted label to each test row.
# NOTE(review): this assignment aligns on index labels, not on row position, so
# it is only correct if test_data carries a unique 0..N-1 index in the same row
# order as the BERT results file -- verify.
predicted_labels = test_df_statusLabels['pred_status_label']
test_data['status'] = predicted_labels

In [None]:
# Row count of the labelled test set.
print(test_data.shape[0])

In [None]:
# Training split (absolute, machine-specific path) and its row count.
train_csv_path = r'C:\Users\t1nipun\Desktop\BERT_workshop_full\BERT_workshop\demonstrate\data\train.csv'
train_data = pd.read_csv(train_csv_path, encoding='utf-8-sig')
print(train_data.shape[0])

In [None]:
# Validation (dev) split (absolute, machine-specific path) and its row count.
dev_csv_path = r'C:\Users\t1nipun\Desktop\BERT_workshop_full\BERT_workshop\demonstrate\data\dev.csv'
val_data = pd.read_csv(dev_csv_path, encoding='utf-8-sig')
print(val_data.shape[0])

In [None]:
# Stack the test, train and validation rows into one frame and report its size.
labelled_frames = [test_data, train_data, val_data]
final_data = pd.concat(labelled_frames, sort=False)
print(final_data.shape[0])

In [None]:
# Persist the combined rows (no index column) as UTF-8 with a BOM.
final_data.to_csv(
    path_or_buf='status_labels.csv',
    encoding='utf-8-sig',
    index=False,
)

Populating Status Labels in the Database

In [None]:
def read_status_labels():
    """Return the contents of 'status_labels.csv' (read from the working directory) as a DataFrame."""
    return pd.read_csv('status_labels.csv', encoding='utf-8-sig')
    
def insert_status_labels():
    """Write each status from status_labels.csv onto its row of the issues table.

    Loads the labels with read_status_labels() and, for every CSV row, runs
    the UPDATE below with (status, tableId, rowIndex) on a connection obtained
    from the notebook-level ``engine``. Prints one line per row and 'Done'
    at the end.
    """
    set_query = 'UPDATE issues SET status = %s WHERE tableId = %s and rowIndex = %s;'
    labels = read_status_labels()
    with engine.connect() as conn:
        for label in labels.itertuples(index=False):
            # Parameter order must follow the three placeholders in set_query.
            conn.execute(set_query, (label.status, label.tableId, label.rowIndex))
            print(f"Set status {label.status} for {label.tableId} row {label.rowIndex}")
    print('Done')

In [None]:
# Reload the combined label file written earlier in the notebook.
df = pd.read_csv(
    filepath_or_buffer='status_labels.csv',
    encoding='utf-8-sig',
)

In [303]:
# All issues except those belonging to the tableIds listed in the NOT IN clause.
issues_query = "SELECT * FROM pcmr.issues where tableId NOT IN ('02db9f91-572a-44af-9858-4add101353c1','03bfc26a-c6d0-4761-b8f5-47acf2290d02', '082134c0-6a4b-425b-a4ae-e79acd7316cb', '0d10a967-88d6-42e5-9bd7-309f24022b5f', '333e1e53-8897-41fa-acbd-86be8afb31c7', '35bd2caf-562c-4d14-a5d6-373f168b4acb', '397db969-9996-4d9e-bb05-6df69d0fe4a4', '417546c4-dacf-4c12-ae75-4dc4e656e198', '491c36c1-82d4-46ae-a684-470915a5659b', '60b3993d-7075-4790-8519-ba8193579754', '64a7ba33-ceee-4593-87a3-8f08dd46c8f4', '67691780-af41-414b-a0c2-aa33a3442cdc', '6a2f1370-1cd5-4ebb-a4bd-a1fe9d5a516a', '6b437f67-967b-4ef6-bd28-5ac8d39138e4', '77cc0b8d-8244-4622-8d9d-a56daf6069e8', '8bb683d9-f7ee-4a54-ad3d-dddc61ccdfcf', '9476acc2-294a-4cd6-a952-8274aedb645a', 'a6623233-9c9f-436b-ad11-0987ab3825e7', 'c04807de-2df1-4d26-9352-70d3cb6cb10b', 'cb197d7e-3ef6-4ee0-93d1-504c7286b580', 'f143c6b8-cf77-41c1-88b2-e7c97ba657c1', 'f2ebd484-4ec2-4481-907d-17334ca4657f', 'f4db9fc5-3a73-499a-ab1e-ab643530ea99', 'fdb3d057-943a-4fab-99ac-1f4eed471512', '44a33e5f-d99e-48ef-ad56-bbb516ec8796', 'bfafbfd0-8bb5-4283-8f5e-dd7cbcec480c')"
with engine.connect() as conn:
    # NOTE(review): the [:1] slice keeps only the first returned row, so the
    # count below can be at most 1 -- looks like a debugging limit; confirm.
    df = pd.read_sql(issues_query, conn)[:1]

# List and count the fetched issues whose status is missing.
counter = 0
for row in df.itertuples():
    #if row.status == 'Resolved' or row.status == 'Unresolved':
    if not pd.isna(row.status):
        continue
    counter += 1
    print(row.tableId, row.rowIndex, 'Yay!')
print(counter)
    
    #                    item['status'] = 'Unresolved'
    # update_status_query = 'UPDATE issues SET status = %s WHERE tableId = %s and rowIndex = %s;'
    # with engine.connect() as conn:
    #     for item in data:
    #         conn.execute(update_status_query, (item['status'], item['tableId'], item['rowIndex']))
    # print("Done")


0114daa6-c048-4081-b4cf-7a9d7461fa44 1 Yay!
1


In [308]:
def read_bert_status_labels():
    """Return the contents of 'status_labels.csv' (read from the working directory) as a DataFrame."""
    return pd.read_csv('status_labels.csv', encoding='utf-8-sig')

def populate_bert_status_labels():
    """Apply the labels from status_labels.csv to the issues table.

    Fetches the issues whose tableId is not in the exclusion list, keeps only
    the first returned row, and -- if that row's status is missing -- loads the
    label file and executes one UPDATE per label row with
    (status, tableId, rowIndex). Prints "Done" when finished.

    NOTE(review): the [:1] slice means only the first issue is ever inspected,
    and a single missing status triggers an update for every row of the label
    file; confirm this is the intended behaviour.
    """
    read_issues_query = "SELECT * FROM pcmr.issues where tableId NOT IN ('02db9f91-572a-44af-9858-4add101353c1','03bfc26a-c6d0-4761-b8f5-47acf2290d02', '082134c0-6a4b-425b-a4ae-e79acd7316cb', '0d10a967-88d6-42e5-9bd7-309f24022b5f', '333e1e53-8897-41fa-acbd-86be8afb31c7', '35bd2caf-562c-4d14-a5d6-373f168b4acb', '397db969-9996-4d9e-bb05-6df69d0fe4a4', '417546c4-dacf-4c12-ae75-4dc4e656e198', '491c36c1-82d4-46ae-a684-470915a5659b', '60b3993d-7075-4790-8519-ba8193579754', '64a7ba33-ceee-4593-87a3-8f08dd46c8f4', '67691780-af41-414b-a0c2-aa33a3442cdc', '6a2f1370-1cd5-4ebb-a4bd-a1fe9d5a516a', '6b437f67-967b-4ef6-bd28-5ac8d39138e4', '77cc0b8d-8244-4622-8d9d-a56daf6069e8', '8bb683d9-f7ee-4a54-ad3d-dddc61ccdfcf', '9476acc2-294a-4cd6-a952-8274aedb645a', 'a6623233-9c9f-436b-ad11-0987ab3825e7', 'c04807de-2df1-4d26-9352-70d3cb6cb10b', 'cb197d7e-3ef6-4ee0-93d1-504c7286b580', 'f143c6b8-cf77-41c1-88b2-e7c97ba657c1', 'f2ebd484-4ec2-4481-907d-17334ca4657f', 'f4db9fc5-3a73-499a-ab1e-ab643530ea99', 'fdb3d057-943a-4fab-99ac-1f4eed471512', '44a33e5f-d99e-48ef-ad56-bbb516ec8796', 'bfafbfd0-8bb5-4283-8f5e-dd7cbcec480c')"
    update_bert_status_query = 'UPDATE issues SET status = %s WHERE tableId = %s and rowIndex = %s;'
    with engine.connect() as conn:
        issues_df = pd.read_sql(read_issues_query, conn)[:1]
        for issue_row in issues_df.itertuples():
            # Rows that already have a status trigger nothing.
            if not pd.isna(issue_row.status):
                continue
            labels = read_bert_status_labels()
            for label in labels.itertuples():
                update_params = (label.status, label.tableId, label.rowIndex)
                conn.execute(update_bert_status_query, update_params)
    print("Done")

In [309]:
# Run the update: this executes UPDATE statements against the issues table
# through the notebook-level engine and prints "Done" when it finishes.
populate_bert_status_labels()

Done
