In [None]:
from sqlalchemy import create_engine
import pandas as pd
import numpy as np 
import os
import json
from pathlib import Path
import re

In [None]:
# Configuration cell: read the database credentials from the environment
# (populated from a local .env file) and build the SQLAlchemy engine.
from urllib.parse import quote

# Importing environmental variables library that reads from the .env file
from dotenv import load_dotenv

# Loading key-value pairs from the .env file into the OS environment
load_dotenv()

# Reading the key-value pairs from the OS environment
user = os.getenv("DB_USER")
password = os.getenv("DB_PASS")
db_hostname = os.getenv("DB_HOST")
db_name = os.getenv("DB_DATABASE")

# os.getenv returns None for an unset variable; interpolating that into the
# URL would silently produce the literal text "None". Fail early instead.
_missing_vars = [
    name
    for name, value in (
        ("DB_USER", user),
        ("DB_PASS", password),
        ("DB_HOST", db_hostname),
        ("DB_DATABASE", db_name),
    )
    if value is None
]
if _missing_vars:
    raise RuntimeError(
        "Missing database settings in the environment/.env file: "
        + ", ".join(_missing_vars)
    )

# The user name and password are percent-encoded so that characters such as
# '@', ':', '/' or '%' cannot be mistaken for URL delimiters.
conn_string = (
    f"mysql+mysqldb://{quote(user, safe='')}:{quote(password, safe='')}"
    f"@{db_hostname}/{db_name}?charset=utf8mb4"
)
engine = create_engine(conn_string)

In [None]:
def clean_dataframe(dataframe, column_name):
    """Strip strike-through tags and non-breaking spaces from a text column.

    The column is modified in place on ``dataframe``; nothing is returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column to clean.
    column_name : str
        Name of a string column in ``dataframe``.
    """
    # regex=True is required: since pandas 2.0 ``str.replace`` defaults to a
    # literal match, which would look for the text "<s>|</s>" and strip nothing.
    pattern = '|'.join(['<s>', '</s>'])
    without_tags = dataframe[column_name].str.replace(pattern, '', regex=True)
    # Plain literal substitution of the non-breaking space by a normal space.
    dataframe[column_name] = without_tags.str.replace('\xa0', ' ', regex=False)

In [None]:
# Draw a random sample of 150 issue rows whose content does not mention 'VEC'.
query = "SELECT * FROM pcmr.issues WHERE content NOT LIKE '%%VEC%%' ORDER BY RAND() LIMIT 150;"

with engine.connect() as conn:
    df = pd.read_sql(query, conn)

clean_dataframe(df, 'content')

# Rows are collected in a plain list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
sample_rows = []
for row in df.itertuples():
    # converting JSON string to a list of lists of strings
    table_row = json.loads(row.content)
    # Tag the first two rows of the table with the identifiers of their issue.
    for position in (0, 1):
        table_row[position].append(row.tableId)
        table_row[position].append(row.rowIndex)
    sample_rows.extend(table_row)
    # An empty row becomes an all-missing line separating consecutive tables.
    sample_rows.append([])

# Shorter rows are padded with missing values up to the widest row.
data = pd.DataFrame(sample_rows)

In [None]:
# Load the sample file from the working directory. The file is decoded as
# ISO-8859-1 (Latin-1) rather than pandas' default UTF-8.
df = pd.read_csv(
    'wrangle_sample.csv',
    encoding='ISO-8859-1',
)

In [None]:
# Remove markup artefacts from the JSON text before parsing it.
clean_dataframe(df, 'content')

# Rows are collected in a plain list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
validation_rows = []
for row in df.itertuples():
    # converting JSON string to a list of lists of strings
    table_row = json.loads(row.content)
    # Drop an existing 'VEC' column, located via the first row; the matching
    # cell of the second row must go first, while the index is still valid.
    if 'VEC' in table_row[0]:
        vec_position = table_row[0].index('VEC')
        del table_row[1][vec_position]
        del table_row[0][vec_position]
    # Prefix both rows with the identifying and labelling fields of the issue.
    leading_cells = [row.tableId, row.rowIndex, row.VECassigned, row.Company_label, row.Rare]
    table_row[0][:0] = leading_cells
    table_row[1][:0] = leading_cells
    validation_rows.extend(table_row)
    # An empty row becomes an all-missing line separating consecutive tables.
    validation_rows.append([])

# Shorter rows are padded with missing values up to the widest row.
data = pd.DataFrame(validation_rows)

In [None]:
# Write the stacked tables without the index or the integer column labels.
# header=False is the documented way to suppress the header line
# (header=None only worked because None is falsy). 'utf-8-sig' prepends a BOM.
data.to_csv('VEC_validation.csv', encoding='utf-8-sig', index=False, header=False)

In [None]:
# Fetch the issues whose land_use value is one of the listed free-text labels.
query = "SELECT * FROM pcmr.issues WHERE land_use IN ('Rare Plant (KP 1+500)','Rare Plant (KP 8+500)','Rare Plant <s>(KP 8+500)</s>','Rare Plant (KP 20+500)','Erosion (KP 20+630)','Steep Slope Low/Moderate Vegetation Establishment','Rare Plant (Cranberry Hydrostatic Site)','Rare Plant (snakeskin liverwort)','Rare Plant (northern moonwort)','Rare Plant (ascending grape fern)','Rare Plant (leather grape fern)', 'Rare Plant (Macloskey''s violet)', 'Rare plants – golden saxifrage','Road Allowance','Organic','Facility Site/ Disturbed','Rare Plant','Airstrip','Bishop Property','Road Allow ance','Stev enson Property','<s>--</s>','Disturbed Land');"

with engine.connect() as conn:
    df = pd.read_sql(query, conn)

clean_dataframe(df, 'content')

# Rows are collected in a plain list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
land_use_rows = []
for row in df.itertuples():
    # converting JSON string to a list of lists of strings
    table_row = json.loads(row.content)
    # The first row receives the column labels, the second the actual ids.
    table_row[0][:0] = ['tableId', 'rowIndex']
    table_row[1][:0] = [row.tableId, row.rowIndex]
    land_use_rows.extend(table_row)
    # An empty row becomes an all-missing line separating consecutive tables.
    land_use_rows.append([])

# Shorter rows are padded with missing values up to the widest row.
data = pd.DataFrame(land_use_rows)

In [None]:
# Write the stacked tables without the index or the integer column labels.
# header=False is the documented way to suppress the header line
# (header=None only worked because None is falsy). 'utf-8-sig' prepends a BOM.
data.to_csv('land_use1.csv', encoding='utf-8-sig', index=False, header=False)

In [158]:
# Join every issue to its locations, their land-use records and the NRCan
# description mapped to each record. LEFT JOINs keep issues with no match.
query = (
    "SELECT i.tableId, i.rowIndex,lu.locNo, i.land_use, i.land_use_standardized, lm.landUse_nrcan_description "
    "FROM issues i "
    "LEFT JOIN locations l ON i.tableId = l.tableId and i.rowIndex = l.rowIndex "
    "LEFT JOIN landuse lu ON l.tableId = lu.tableId and l.rowIndex = lu.rowIndex and l.locNo = lu.locNo "
    "LEFT JOIN landuse_mapping lm ON lu.landuseId = lm.landuseId;"
)

with engine.connect() as connection:
    df = pd.read_sql(query, connection)

# Report how many joined rows came back.
row_count = len(df)
print(row_count)

12062


In [159]:
# One-hot encode the NRCan land-use description, then count per
# (tableId, rowIndex) how many joined rows fall into each category.
nrcan_prefix = 'landUse_nrcan_description'
nrcan_columns = [
    f'{nrcan_prefix}_{category}'
    for category in (
        'Barren land',
        'Cropland',
        'Mixed Forest',
        'Sub-polar taiga needleleaf forest',
        'Temperate or sub-polar broadleaf deciduous forest',
        'Temperate or sub-polar grassland',
        'Temperate or sub-polar needleleaf forest',
        'Temperate or sub-polar shrubland',
        'Urban and Built-up',
        'Water',
        'Wetland',
    )
]

df = pd.get_dummies(data=df, columns=[nrcan_prefix])

# get_dummies only creates columns for categories present in the data, and
# pivot_table raises KeyError for a requested value column that is missing.
# Zero-fill absent categories so the output always has the same columns.
for column in nrcan_columns:
    if column not in df.columns:
        df[column] = 0

# The string alias 'sum' gives the same totals as np.sum without the
# FutureWarning that recent pandas versions emit for NumPy callables.
df1 = pd.pivot_table(df, index=['tableId', 'rowIndex'], values=nrcan_columns, aggfunc='sum')

In [160]:
# Replace every positive count with the name of the column it sits in;
# zeros and missing values are left untouched.
# A regex-based DataFrame.replace cannot do this: regular expressions are only
# applied to string cells, so on numeric columns it changes nothing.
# pd.to_numeric(..., errors='coerce') also lets this cell be re-run safely:
# labels already written are coerced to NaN and therefore kept as they are.
df1 = df1.apply(
    lambda counts: counts.astype(object).mask(
        pd.to_numeric(counts, errors='coerce') > 0, counts.name
    )
)

In [161]:
# Persist the flattened land-use table. The index is written as well
# (to_csv default), and 'utf-8-sig' prepends a byte-order mark.
df1.to_csv(
    'land_use_flat.csv',
    encoding='utf-8-sig',
)