In [1]:
from sqlalchemy import create_engine
import pandas as pd
import numpy as np 
import os
import json
from pathlib import Path
import re

In [10]:
# Database connection setup: credentials come from a local .env file so that
# nothing secret is hardcoded in the notebook.
from urllib.parse import quote

from dotenv import load_dotenv

# Load key-value pairs from the .env file into the process environment
load_dotenv()

# Read the connection settings from the environment
# (os.getenv returns None for any variable that is not set)
user = os.getenv("DB_USER")
password = os.getenv("DB_PASS")
db_hostname = os.getenv("DB_HOST")
db_name = os.getenv("DB_DATABASE")

# The user name and password are percent-encoded before being placed in the
# URL: reserved characters such as '@', ':', '/' or '%' would otherwise be read
# as URL delimiters and the credentials would be parsed incorrectly.
# safe="" makes quote() escape '/' as well. str() keeps the previous rendering
# ("None") when a variable is missing.
conn_string = (
    f"mysql+mysqldb://{quote(str(user), safe='')}:{quote(str(password), safe='')}"
    f"@{db_hostname}/{db_name}?charset=utf8mb4"
)
engine = create_engine(conn_string)

In [29]:
def clean_dataframe(dataframe, column_name):
    """Remove ``<s>``/``</s>`` tags and non-breaking spaces from a text column.

    The column is overwritten on ``dataframe`` itself; nothing is returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column to clean. Modified in place.
    column_name : str
        Label of a string column in ``dataframe``.
    """
    # Alternation of the two tag spellings; this must be applied as a regex.
    pattern = '|'.join(['<s>', '</s>'])
    # regex is passed explicitly: the implicit default of str.replace differs
    # between pandas versions, and with regex=False the '|' alternation would
    # be searched for literally and no tag would ever be removed.
    without_tags = dataframe[column_name].str.replace(pattern, '', regex=True)
    # Plain character swap: non-breaking space (U+00A0) -> ordinary space.
    dataframe[column_name] = without_tags.str.replace('\xa0', ' ', regex=False)

In [91]:
# Draw up to 150 random issue rows whose content does not mention 'VEC'.
# NOTE: ORDER BY RAND() returns a different sample on every execution.
# The doubled '%%' is kept from the original query (percent escaping for the
# DB driver's parameter style).
query = "SELECT * FROM pcmr.issues WHERE content NOT LIKE '%%VEC%%' ORDER BY RAND() LIMIT 150;"

with engine.connect() as conn:
    df = pd.read_sql(query, conn)

# Cleaning needs no database access, so it runs after the connection is released.
clean_dataframe(df, 'content')

# Rows are collected in a plain list and turned into a DataFrame once at the
# end. DataFrame.append no longer exists in pandas 2.x, and growing a frame
# inside a loop copies it on every iteration.
sheet_rows = []
for row in df.itertuples():
    # converting JSON string to a list of lists of strings
    # (the code below treats element 0 and element 1 as two parallel rows)
    table_row = json.loads(row.content)
    for cells in table_row[:2]:
        # Tag both rows with the identifiers of the record they came from.
        cells.append(row.tableId)
        cells.append(row.rowIndex)
    sheet_rows.extend(table_row)
    # An empty list becomes an all-missing row: a blank separator between records.
    sheet_rows.append([])

# Rows of different lengths are padded with missing values by the constructor.
data = pd.DataFrame(sheet_rows)

In [33]:
# Load the sample CSV; the file is read using the ISO-8859-1 encoding.
df = pd.read_csv(
    'wrangle_sample.csv',
    encoding='ISO-8859-1',
)

In [34]:
clean_dataframe(df, 'content')

# Rows are collected in a plain list and turned into a DataFrame once at the
# end. DataFrame.append no longer exists in pandas 2.x, and growing a frame
# inside a loop copies it on every iteration.
sheet_rows = []
for row in df.itertuples():
    # converting JSON string to a list of lists of strings
    # (the code below treats element 0 and element 1 as two parallel rows)
    table_row = json.loads(row.content)
    first_row, second_row = table_row[0], table_row[1]

    # Drop the 'VEC' cell, and the cell at the same position in the second
    # row, so that column is not part of the exported sheet.
    if 'VEC' in first_row:
        vec_position = first_row.index('VEC')
        del second_row[vec_position]
        del first_row[vec_position]

    # The same five fields are placed at the front of both rows.
    leading_cells = [
        row.tableId,
        row.rowIndex,
        row.VECassigned,
        row.Company_label,
        row.Rare,
    ]
    first_row[:0] = leading_cells
    second_row[:0] = leading_cells

    sheet_rows.extend(table_row)
    # An empty list becomes an all-missing row: a blank separator between records.
    sheet_rows.append([])

# Rows of different lengths are padded with missing values by the constructor.
data = pd.DataFrame(sheet_rows)

In [35]:
# Write the assembled sheet without index or header row; 'utf-8-sig' prepends
# a byte-order mark to the file.
data.to_csv(
    'VEC_validation.csv',
    encoding='utf-8-sig',
    index=False,
    header=None,
)

In [36]:
# Display the column labels of the frame currently bound to `df`.
df.columns

Index(['tableId', 'rowIndex', 'content', 'issue_pri', 'issue_sec', 'land_use',
       'loc_coord', 'loc_kp', 'loc_mer', 'loc_other', 'loc_tract', 'loc_utm',
       'status_bin', 'status_txt', 'vec_pri', 'vec_sec', 'location', 'status',
       'vec_simple', 'subvec_simple', 'VECassigned', 'soil', 'vegetation',
       'water', 'fish', 'wildlife', 'species', 'air', 'heritage', 'physical',
       'wetlands', 'acoustic', 'navigation', 'Company_label', 'Rare'],
      dtype='object')