# Land

In [1]:
# Automatic File Naming Utility
from output_util import OutputUtil

# Single OutputUtil instance shared by every cell below; it supplies the paths
# this notebook reads from and writes to (generate_output_filepath,
# get_curr_filepath, get_final_filepath).
outputUtil = OutputUtil()

# Land Dataframe Retrieval

In [2]:
import io
import pandas as pd
from globals import LAND_PATH

# Zero-based line positions inside the raw file at LAND_PATH.
start_line = 9    # first line of the window of interest (also the header line)
end_line = 28     # last line of the window, inclusive
skip_lines = [10]  # positions inside the window that are dropped
header_line = 9   # line holding the CSV column names

# Read only the first end_line + 1 lines. zip() pulls from range() first, so it
# stops as soon as the range is exhausted and never reads past end_line.
# The context manager closes the file; no explicit close() is needed.
with open(LAND_PATH, 'r') as file:
    leading_lines = [line for _, line in zip(range(end_line + 1), file)]

# Fail with a descriptive message instead of a bare StopIteration when the
# file is shorter than the window we expect to extract.
if len(leading_lines) <= end_line:
    raise ValueError(
        f"{LAND_PATH} has only {len(leading_lines)} line(s); "
        f"expected at least {end_line + 1}"
    )

header = leading_lines[header_line]

# Keep every line of the window except the header itself and the skipped ones.
lines_to_read = [
    line
    for i, line in enumerate(leading_lines)
    if i != header_line and i not in skip_lines and start_line <= i <= end_line
]

# Re-assemble the selected lines into an in-memory CSV and let pandas parse it.
data = header + ''.join(lines_to_read)
df = pd.read_csv(io.StringIO(data))

# Persist the extracted table; the target path comes from outputUtil.
df.to_csv(outputUtil.generate_output_filepath(desc="land_raw"), index=False)

# Land Preprocessing

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

# Load the CSV at the path reported by outputUtil.get_curr_filepath()
# (presumably the "land_raw" file written by the previous cell — confirm
# against OutputUtil). thousands="," makes pandas treat commas as digit-group
# separators so values such as "1,234" are parsed as numbers, not strings.
pop_df = pd.read_csv(outputUtil.get_curr_filepath(), thousands=",")


class LandTransformer(BaseEstimator, TransformerMixin):
    """Reduce the raw land table to a two-column Region / Land Area frame.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    only selects, renames and casts columns.
    """

    def __init__(self):
        pass

    def fit(self, x, y=None):
        """Return ``self`` unchanged; there is nothing to learn from ``x``."""
        return self

    def transform(self, x: pd.DataFrame, y=None) -> pd.DataFrame:
        """Return a new frame with the region name and its land area.

        Args:
            x: Frame containing the columns "Geographic name" and
                "Land area in square kilometres, 2021".
            y: Ignored; accepted for scikit-learn API compatibility.

        Returns:
            A frame with columns "Region" and "Land Area" (in that order),
            where "Land Area" has dtype int64.
        """
        # Source column -> output column; insertion order fixes column order.
        renamed_columns = {
            "Geographic name": "Region",
            "Land area in square kilometres, 2021": "Land Area",
        }
        # NOTE(review): if the area column holds fractional values, the int64
        # cast discards the fraction — confirm this is intended.
        return (
            x[list(renamed_columns)]
            .rename(columns=renamed_columns)
            .astype({"Land Area": "int64"})
        )


# Single-step pipeline wrapping LandTransformer under the step name 'land'.
pipe = Pipeline([
    ('land', LandTransformer()),
])

# Run the pipeline on the loaded frame (rebinding pop_df to the transformed
# result) and write it to a new path generated by outputUtil.
pop_df = pipe.fit_transform(pop_df)
pop_df.to_csv(outputUtil.generate_output_filepath("landpipeline"), index=False)

# Final Output of the Notebook

In [4]:
# Read back the CSV at outputUtil.get_curr_filepath() (presumably the
# "landpipeline" output written above — confirm against OutputUtil) and copy
# it, without the index column, to the path given by get_final_filepath().
df = pd.read_csv(outputUtil.get_curr_filepath())
df.to_csv(outputUtil.get_final_filepath(), index=False)