In [1]:
# Automatic File Naming Utility
from output_util import OutputUtil

# Shared path helper used by every cell below: generate_output_filepath(...) is
# called before each intermediate CSV write, get_curr_filepath() before each
# read-back, and get_final_filepath() for the last write.
# NOTE(review): presumably get_curr_filepath() returns the most recently
# generated path — confirm against output_util.
outputUtil = OutputUtil()

# Population DataFrame Retrieval

In [2]:
import io
import pandas as pd
from globals import POPULATION_PATH

# Layout of the table inside POPULATION_PATH, expressed as 0-based line numbers.
start_line = 10     # first line of the region of interest
end_line = 29       # last line of the region of interest (inclusive)
skip_lines = [11]   # lines inside the region that are discarded
lines_to_read = []  # data lines collected from the region
header_line = 10    # line holding the CSV column header
header = ""

# Read exactly end_line + 1 lines: lines before start_line and those listed in
# skip_lines are consumed and dropped, the header line is kept separately, and
# every other line in [start_line, end_line] is collected as data.
with open(POPULATION_PATH, 'r') as population_file:
    for i in range(end_line + 1):
        line = next(population_file, None)
        if line is None:
            # A bare next() would surface here as an unexplained StopIteration.
            raise ValueError(
                f"{POPULATION_PATH} ended at line {i}; "
                f"expected at least {end_line + 1} lines"
            )
        if i == header_line:
            header = line
        elif i not in skip_lines and start_line <= i <= end_line:
            lines_to_read.append(line)

# The `with` block has already closed the file, so no explicit close() is needed.

# Re-assemble header + data lines into an in-memory CSV and let pandas parse it.
data = header + ''.join(lines_to_read)
df = pd.read_csv(io.StringIO(data))

df.to_csv(outputUtil.generate_output_filepath(desc="raw"), index=False)

# Population Preprocessing

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

# Reload the CSV at outputUtil's current path (presumably the "raw" file written
# by the previous cell — confirm against output_util). thousands=',' tells pandas
# to treat commas as digit-group separators, so values like "1,234" parse as numbers.
pop_df = pd.read_csv(outputUtil.get_curr_filepath(), thousands=',')


class PopulationTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that tidies the region column of the population table.

    It strips the literal suffix ", Nova Scotia" from the "Geography" column and
    renames that column to "Region". Nothing is learned in ``fit``.
    """

    def __init__(self):
        pass

    def fit(self, x, y=None):
        """Do nothing and return ``self``; present so the class works in a Pipeline."""
        return self

    def transform(self, x: pd.DataFrame, y=None) -> pd.DataFrame:
        """Return a cleaned copy of ``x``; the input frame is left untouched.

        Args:
            x: Frame containing a string column named "Geography".
            y: Ignored; accepted for scikit-learn API compatibility.

        Returns:
            A new DataFrame with the same columns in the same order, except that
            "Geography" is renamed to "Region" and has ", Nova Scotia" removed.

        Raises:
            KeyError: If ``x`` has no "Geography" column.
        """
        # regex=False: the pattern is a plain substring, and being explicit keeps
        # the result independent of the pandas version's default for `regex`.
        regions = x["Geography"].str.replace(', Nova Scotia', '', regex=False)

        # assign/rename build a new frame instead of mutating the caller's data.
        return x.assign(Geography=regions).rename(columns={"Geography": "Region"})


# Single-step pipeline wrapping the population clean-up transformer.
population_step = ('population', PopulationTransformer())
pipe = Pipeline(steps=[population_step])

# Run the pipeline on the reloaded frame, then persist the result as the
# "pipeline" intermediate CSV.
pop_df = pipe.fit_transform(pop_df)
pop_df.to_csv(
    outputUtil.generate_output_filepath("pipeline"),
    index=False,
)

# Melting the DataFrame

In [4]:
# Wide -> long: "Region" stays as the identifier; every other column becomes a
# row whose "Year" holds the former column name and whose "Population" holds
# the cell value.
pop_df = (
    pd.read_csv(outputUtil.get_curr_filepath())
    .melt(id_vars=["Region"], var_name='Year', value_name='Population')
)

# Persist the long-format table as the "melted" intermediate CSV.
pop_df.to_csv(
    outputUtil.generate_output_filepath("melted"),
    index=False,
)

# Population Growth Rate

In [5]:
pop_df = pd.read_csv(outputUtil.get_curr_filepath())

# Order rows so that, within each region, consecutive rows are consecutive years.
pop_df = pop_df.sort_values(by=['Region', 'Year'])

# All "previous row" lookups go through this per-region view so that values
# never leak from the last row of one region into the first row of the next.
population_by_region = pop_df.groupby('Region')['Population']

# Absolute change versus the previous row of the same region; the first row of
# each region has no predecessor and is reported as 0.
pop_df['Population Change'] = population_by_region.diff().fillna(0).astype(int)

# Relative change, in percent, against the previous row of the SAME region.
# The first row of each region divides by NaN and is then reported as 0.
previous_population = population_by_region.shift(1)
pop_df['Population Change Percentage'] = (
    (pop_df['Population Change'] / previous_population) * 100
).fillna(0).round(2)

pop_df.to_csv(outputUtil.generate_output_filepath("growth_rates"), index=False)

# Drop Years

In [6]:
pop_df = pd.read_csv(outputUtil.get_curr_filepath())

# Keep only rows whose Year lies in 2013..2020; between() includes both ends.
year_in_range = pop_df["Year"].between(2013, 2020)
pop_df = pop_df[year_in_range]

# Persist the filtered table as the "drop_years" intermediate CSV.
pop_df.to_csv(
    outputUtil.generate_output_filepath("drop_years"),
    index=False,
)

# Final Output of the Notebook

In [7]:
# Copy the CSV at outputUtil's current path to the notebook's final output
# path, round-tripping it through pandas.
df = pd.read_csv(filepath_or_buffer=outputUtil.get_curr_filepath())
df.to_csv(
    path_or_buf=outputUtil.get_final_filepath(),
    index=False,
)