In [1]:
import os

import pandas as pd

# Input and output locations. The input file is read from the weather_api data
# directory, while the generated CSV files are written to ../data/csv.
input_csv_path = '../weather_api/data/csv/br_final.csv'  # Using br_final.csv for both weather and harvest data
output_dir = '../data/csv'

# DataFrame.to_csv does not create missing directories, so make sure the
# output directory exists before anything is written to it.
os.makedirs(output_dir, exist_ok=True)

# Read the CSV file and drop every row that has a missing value in any column.
data = pd.read_csv(input_csv_path)
data = data.dropna()

# Convert the relevant columns to numeric values. Anything that cannot be
# parsed is coerced to NaN (not a number), and the rows that end up with a NaN
# after the conversion are dropped.
numeric_columns = ['million_60kgs_bag', 'nonbear_mill_trees', 'bear_mill_trees', 'avg_unemp_perc']
data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric, errors='coerce')
data = data.dropna()

# Calculate the per-year median of each numeric column and save the result to
# yearly_medians.csv.
yearly_medians = data.groupby('year')[numeric_columns].median().reset_index()
yearly_medians.to_csv(os.path.join(output_dir, 'yearly_medians.csv'), index=False)
print("The yearly medians data has been saved to yearly_medians.csv.")

# Extract the per-row harvest columns from the cleaned data. `data` has no
# missing values left at this point, so no further dropna is needed; the copy
# keeps harvest_data independent of `data`.
harvest_data = data[['year'] + numeric_columns].copy()

# Merge the yearly medians with the per-row harvest data on 'year'. Both frames
# carry the same numeric column names, so the merged file holds each of them
# twice: the '_x' columns are the yearly medians and the '_y' columns are the
# original per-row values. These are the pandas default suffixes, spelled out
# here so the column names in yearly_medians_with_harvest.csv are not a surprise.
yearly_medians_with_harvest = pd.merge(
    yearly_medians,
    harvest_data,
    on='year',
    how='left',
    suffixes=('_x', '_y'),
)
yearly_medians_with_harvest.to_csv(
    os.path.join(output_dir, 'yearly_medians_with_harvest.csv'),
    index=False,
)
print("The merged yearly medians with harvest data has been saved to yearly_medians_with_harvest.csv.")

The yearly medians data has been saved to yearly_medians.csv.
The merged yearly medians with harvest data has been saved to yearly_medians_with_harvest.csv.
