In [None]:
from google.colab import drive
# Make Google Drive available under /content/drive; the project paths used
# further down in this notebook live below this mount point.
drive_mount_point = "/content/drive"
drive.mount(drive_mount_point)


In [None]:
import os
import pandas as pd

# Combine the per-year USDA county-level CSV files of each crop into a single
# CSV per crop, replacing the state/county ANSI columns with one FIPS column.

# Base directory for raw data
base_dir = "/content/drive/MyDrive/MlOps_Project/USDA Crop Dataset/data/"

# Output folder for combined crops
output_folder = "/content/drive/MyDrive/MlOps_Project/"

# Target crops and years
crops = ["Corn", "Soybean", "Cotton", "WinterWheat"]
years = ["2017", "2018", "2019", "2020", "2021", "2022"]

# Crops whose folder name differs from the crop name used in the file names.
folder_name_overrides = {"Soybean": "Soybeans"}

# Descriptive columns that are not carried into the combined files.
columns_to_drop = [
    "reference_period_desc",
    "agg_level_desc",
    "source_desc",
    "domain_desc",
    "county_name",
    "state_name",
    "asd_desc",
]


def build_fips(state_ansi, county_ansi):
    """Build 5-character FIPS codes from state and county ANSI codes.

    pandas reads an integer column as float64 as soon as one value in it is
    missing, so converting the raw columns with ``astype(str)`` would produce
    pieces such as ``"1.0"`` or ``"nan"`` and corrupt the code. The codes are
    therefore parsed as numbers first and formatted as zero-padded integers.

    Args:
        state_ansi: Series of state codes (ints, floats or numeric strings).
        county_ansi: Series of county codes, matching ``state_ansi`` row by
            row (both are expected to come from the same DataFrame).

    Returns:
        An object-dtype Series indexed like ``state_ansi``. Each value is the
        state code padded to 2 digits followed by the county code padded to
        3 digits, or missing when either part is missing or not numeric.
    """
    state_codes = pd.to_numeric(state_ansi, errors="coerce")
    county_codes = pd.to_numeric(county_ansi, errors="coerce")

    fips_codes = [
        f"{int(state):02d}{int(county):03d}"
        if pd.notna(state) and pd.notna(county)
        else None
        for state, county in zip(state_codes, county_codes)
    ]
    return pd.Series(fips_codes, index=state_codes.index, dtype=object)


def load_crop_year(file_path):
    """Read one raw USDA CSV file and return it in the combined-file layout.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A DataFrame with cleaned column names, without the columns listed in
        ``columns_to_drop``, and with ``state_ansi``/``county_ansi`` replaced
        by a single ``fips`` column.

    Raises:
        KeyError: If the file has no ``state_ansi`` or ``county_ansi`` column.
        Exception: Whatever ``pd.read_csv`` raises for an unreadable file.
    """
    frame = pd.read_csv(file_path)

    # Replace commas in column names with underscores and trim whitespace.
    frame.columns = frame.columns.str.replace(",", "_", regex=False).str.strip()

    frame = frame.drop(columns=columns_to_drop, errors="ignore")

    frame["fips"] = build_fips(frame["state_ansi"], frame["county_ansi"])

    # The separate ANSI columns are redundant once the FIPS code exists.
    return frame.drop(columns=["state_ansi", "county_ansi"], errors="ignore")


for crop in crops:
    # Data frames of this crop, one per year that could be loaded
    crop_data = []

    # The folder name does not depend on the year, so resolve it once.
    folder_crop = folder_name_overrides.get(crop, crop)

    for year in years:
        # The file name uses the crop name itself; only the folder name is
        # overridden.
        filename = f"USDA_{crop}_County_{year}.csv"
        file_path = os.path.join(base_dir, folder_crop, str(year), filename)

        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            continue

        print(f"Processing: {file_path}")

        try:
            data = load_crop_year(file_path)
        except Exception as e:
            # Deliberately broad: a single unreadable or malformed file is
            # reported and skipped so the remaining years are still combined.
            print(f"Error processing {file_path}: {e}")
        else:
            crop_data.append(data)

    # After all years are done for this crop
    if crop_data:
        final_crop_df = pd.concat(crop_data, ignore_index=True)
        # Save to a separate CSV per crop.
        # NOTE: read these files back with dtype={"fips": str}; otherwise
        # pandas parses the column as numbers and drops the leading zeros.
        crop_output_path = os.path.join(output_folder, f"USDA_{crop}_combined.csv")
        final_crop_df.to_csv(crop_output_path, index=False)
        print(f"Final combined file for {crop} saved at: {crop_output_path}")
    else:
        print(f"No data found for {crop}.")
