# Cleanup script for BC Indigenous Listings data 

In [2]:
import pandas as pd
import numpy as np

In [10]:
# Step 1: Read the raw business-listings CSV into a DataFrame
RAW_LISTINGS_CSV = "../bcindigenousbusinesslistings3.csv"  # resolved against the current working directory
df = pd.read_csv(RAW_LISTINGS_CSV)

In [13]:
# Step 2: Inspect the data
# DataFrame.info() writes its summary directly to stdout and returns None,
# so it is called bare; wrapping it in print() emits an extra "None" line.
df.info()
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   business_name        1259 non-null   object 
 1   description          1135 non-null   object 
 2   web_site             699 non-null    object 
 3   city                 1258 non-null   object 
 4   latitude             1258 non-null   float64
 5   longitude            1258 non-null   float64
 6   keywords             1257 non-null   object 
 7   region               1259 non-null   object 
 8   type                 1123 non-null   object 
 9   industry_sector      1222 non-null   object 
 10  year_formed          648 non-null    float64
 11  number_of_employees  572 non-null    object 
dtypes: float64(3), object(9)
memory usage: 118.2+ KB
None
                                       business_name  \
0                                Ellipsis Energy Inc   
1  Indigenous Communit

In [12]:
# Step 3: Normalise column names: trim surrounding whitespace, lowercase,
# turn spaces into underscores, then drop every character that is not an
# ASCII letter, digit, or underscore.
normalized_columns = df.columns.str.strip().str.lower()
normalized_columns = normalized_columns.str.replace(' ', '_')
df.columns = normalized_columns.str.replace('[^0-9a-zA-Z_]', '', regex=True)

In [14]:
# Step 4: Drop rows that exactly repeat an earlier row (the first occurrence is kept)
# NOTE(review): this runs before the whitespace trimming and lowercasing in
# Steps 5-6, so rows that differ only in those respects are not treated as
# duplicates here.
is_repeat = df.duplicated()
df = df[~is_repeat]

In [15]:
# Step 5: Trim leading/trailing whitespace in text columns
def _strip_text_column(column):
    """Return ``column`` with surrounding whitespace removed from its string values.

    Columns that do not hold text are returned unchanged.
    """
    if isinstance(column.dtype, pd.StringDtype):
        # Dedicated string dtype: every non-missing value is a str, so the
        # vectorised accessor is safe. The previous `dtype == "object"` test
        # skipped these columns entirely.
        return column.str.strip()
    if column.dtype == "object":
        # Object columns may mix strings with other Python objects;
        # `.str.strip()` would turn every non-string value into NaN, so strip
        # element-wise and pass anything that is not a str through untouched.
        return column.map(lambda value: value.strip() if isinstance(value, str) else value)
    return column


df = df.apply(_strip_text_column)

In [16]:
# Step 6: Standardize text: lowercase the business names when that column exists
name_column = 'business_name'
if name_column in df.columns:
    lowered_names = df[name_column].str.lower()
    df[name_column] = lowered_names

In [17]:
# Step 7: Keep only rows whose business name is present and not an empty string
if 'business_name' in df.columns:
    has_name = df['business_name'].notna()
    is_nonblank = df['business_name'] != ''
    df = df[has_name & is_nonblank]

In [18]:
# Step 8: Write the cleaned table to CSV, leaving the row index out of the file
CLEANED_CSV = "cleaned_indigenous_businesses.csv"
df.to_csv(CLEANED_CSV, index=False)

In [19]:
# Validation: reload the saved CSV and preview its first five rows
cleaned_path = "cleaned_indigenous_businesses.csv"
clean = pd.read_csv(cleaned_path)
clean.head(n=5)

Unnamed: 0,business_name,description,web_site,city,latitude,longitude,keywords,region,type,industry_sector,year_formed,number_of_employees
0,ellipsis energy inc,Ellipsis Energy Inc is an Aboriginal owned com...,http://www.ellipsisenergy.ca,Moberly Lake,55.81937,-121.834602,"Ellipsis Energy Inc 21 – Mining, quarrying, an...",Northeast,Private Company,"21 – Mining, quarrying, and oil and gas extrac...",2012.0,5 to 9
1,indigenous community development & prosperity ...,ICDPRO works together with Indigenous communit...,https://indigenouscommunitydevelopment.com/,Enderby,50.551498,-119.133546,Indigenous Community Development & Prosperity ...,Thompson / Okanagan,Private Company,81 – Other services (except public administrat...,2020.0,1 to 4
2,formline construction ltd.,With over combined 30 years of experience in t...,https://www.flcon.ca/,Burnaby,49.26605,123.00584,Formline Construction Ltd. 23 – Construction,Lower Mainland / Southwest,Private Company,23 – Construction,2021.0,1 to 4
3,quilakwa investments ltd.,Quilakwa Investments Ltd. oversees several Ind...,http://www.splatsindc.com,Enderby,50.537507,-119.141955,Quilakwa Investments Ltd.,Thompson / Okanagan,Community Owned Company,72 – Accommodation and food services,1984.0,20 to 49
4,quilakwa esso,Quilakwa Esso is owned by the Splatsin Indian ...,,Enderby,50.537507,-119.141955,Quilakwa Esso 44-45 - Retail trade,Thompson / Okanagan,Community Owned Company,44-45 - Retail trade,1984.0,10 to 19
