In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Read the three cleaned source tables from /content/sample_data.
# The generator is consumed in order by the unpacking, so df1/df2/df3 line up
# with the numeric prefixes of the file names.
df1, df2, df3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/sample_data/1.crop_production_cleanedfinal.csv",
        "/content/sample_data/2.Crop Recommendation using Soil Properties and Weather Prediction_cleaned.csv",
        "/content/sample_data/3.agriculture_dataset_cleaned.csv",
    )
)

# Shared preprocessing helpers. Both are used through fit_transform, so they
# only ever hold the state of their most recent fit.
scaler = MinMaxScaler()
le = LabelEncoder()

# -------------------------------
# Dataset 1: Crop Production
# -------------------------------
# Feature: yield per hectare. The small epsilon avoids a zero denominator
# when Area is 0.
df1['Yield_per_Hectare'] = df1['Production'] / (df1['Area'] + 1e-5)
# Feature: decade bucket of the crop year (e.g. 1997 -> 1990).
df1['Year_group'] = (df1["Crop_Year"] // 10) * 10
# Feature: encode categorical columns as integer labels.
# Values are cast to str first, the same way the Dataset 2 and Dataset 3
# columns are encoded, so a missing value becomes the label 'nan' instead of
# reaching LabelEncoder as a float mixed with strings (which some
# scikit-learn versions reject with a TypeError).
# NOTE(review): 'State_me' is kept exactly as written; if the CSV column is
# actually named differently (e.g. 'State_Name') it is silently skipped by
# the membership check below -- confirm against the file header.
for col in ['Crop', 'State_me', 'Season']:
    if col in df1.columns:
        df1[col] = le.fit_transform(df1[col].astype(str))

# -------------------------------
# Dataset 2: Crop Recommendation
# -------------------------------
# Soil fertility index: row-wise mean of the N, P and K columns.
npk_cols = ['N', 'P', 'K']
df2['Soil_Fertility_Index'] = df2[npk_cols].mean(axis=1)

# Ratio of (N + P + K) to (Zn + S); the +1 offset avoids a zero denominator
# when Zn and S are both 0.
macro_total = df2['N'] + df2['P'] + df2['K']
micro_total = df2['Zn'] + df2['S'] + 1
df2['Macro_to_Micro'] = macro_total.div(micro_total)

# -------------------------------
# Seasonal Features
# -------------------------------
# Column groups, one entry per season, reused by the averages below.
humidity_cols = [
    'Humidity_Winter', 'Humidity_Spring', 'Humidity_Summer', 'Humidity_Autumn',
]
rainfall_cols = [
    'Rainfall_Winter', 'Rainfall_Spring', 'Rainfall_Summer', 'Rainfall_Autumn',
]
temp_max_cols = [
    'TempMax_Winter', 'TempMax_Spring', 'TempMax_Summer', 'TempMax_Autumn',
]
temp_min_cols = [
    'TempMin_Winter', 'TempMin_Spring', 'TempMin_Summer', 'TempMin_Autumn',
]

# Row-wise seasonal means of humidity and rainfall.
df2['Humidity_Avg'] = df2[humidity_cols].mean(axis=1)
df2['Rainfall_Avg'] = df2[rainfall_cols].mean(axis=1)

# Per-season temperature range (max minus min).
for season in ('Winter', 'Spring', 'Summer', 'Autumn'):
    season_max = df2[f'TempMax_{season}']
    season_min = df2[f'TempMin_{season}']
    df2[f'TempRange_{season}'] = season_max.sub(season_min)

# Row-wise means of the seasonal max / min temperatures.
df2['TempMax_Avg'] = df2[temp_max_cols].mean(axis=1)
df2['TempMin_Avg'] = df2[temp_min_cols].mean(axis=1)

# Average rainfall relative to average max temperature (offset by 1).
df2['Rainfall_Temp_Ratio'] = df2['Rainfall_Avg'].div(df2['TempMax_Avg'] + 1)

# -------------------------------
# Encode categorical
# -------------------------------
# Label-encode each of these columns when it is present. Values are cast to
# str before encoding, and the shared encoder is re-fitted for every column.
for label_col in ('Soilcolor', 'Crop_Label'):
    if label_col not in df2.columns:
        continue
    df2[label_col] = le.fit_transform(df2[label_col].astype(str))

# -------------------------------
# Scale numerical features
# -------------------------------
# Candidates: every float64/int64 column except the ones listed here.
skip_scaling2 = ['Crop_Label']  # keep target untouched
num_cols2 = [
    name
    for name in df2.select_dtypes(include=['float64', 'int64']).columns
    if name not in skip_scaling2
]

# Min-max scale one column at a time; the scaler is re-fitted per column.
for col in num_cols2:
    column = df2[col]
    if column.isnull().all():  # skip all-NaN columns
        continue
    scaled = scaler.fit_transform(column.values.reshape(-1, 1))
    df2[col] = scaled.flatten()

# -------------------------------
# Dataset 3: Agriculture dataset
# -------------------------------
# Source columns that several derived features below share.
yield_tons = df3['Yield(tons)']
farm_acres = df3['Farm_Area(Acres)']
fertilizer_tons = df3['Fertilizer_Used(tons)']
pesticide_kg = df3['Pesticide_Used(kg)']
water_m3 = df3['Water_Usage(cubic meters)']

# Efficiency features: yield divided by each input (denominators are offset
# so that a zero input does not produce a zero denominator).
df3['Yield_per_Acre'] = yield_tons / (farm_acres + 1e-5)
df3['Fertilizer_Efficiency_New'] = yield_tons / (fertilizer_tons + 1)
df3['Pesticide_Efficiency'] = yield_tons / (pesticide_kg + 1)
df3['Water_Efficiency'] = yield_tons / (water_m3 + 1)

# -------------------------------
# Resource Ratios
# -------------------------------
# Each input divided by (farm area + 1).
df3['Fertilizer_per_Acre'] = fertilizer_tons / (farm_acres + 1)
df3['Pesticide_per_Acre'] = pesticide_kg / (farm_acres + 1)
df3['Water_per_Acre'] = water_m3 / (farm_acres + 1)

# -------------------------------
# Profitability
# -------------------------------
# Only computed when both price columns exist:
# (selling price - cost price) / (cost price + epsilon).
if {'SellingPrice', 'CostPrice'}.issubset(df3.columns):
    cost_price = df3['CostPrice']
    price_margin = df3['SellingPrice'] - cost_price
    df3['Profit_Loss_Ratio'] = price_margin / (cost_price + 1e-5)

# -------------------------------
# Encode categorical variables
# -------------------------------
cat_cols3 = ['Crop_Type', 'Irrigation_Type', 'Soil_Type', 'Season']
# Restrict to the listed columns that actually exist in df3, then replace
# each with integer labels (values are cast to str before encoding).
present_cat_cols3 = [name for name in cat_cols3 if name in df3.columns]
for col in present_cat_cols3:
    as_text = df3[col].astype(str)
    df3[col] = le.fit_transform(as_text)
# -------------------------------
# Scale numerical features
# -------------------------------
# Every float64/int64 column of df3 is a scaling candidate.
# NOTE(review): columns label-encoded earlier are integer-typed, so they are
# presumably picked up here and rescaled as well -- confirm that is intended.
num_cols3 = df3.select_dtypes(include=['float64', 'int64']).columns

# Min-max scale column by column (the scaler is re-fitted each time);
# columns with no non-null value at all are left as they are.
for col in num_cols3:
    if df3[col].notna().any():
        column_2d = df3[col].values.reshape(-1, 1)
        df3[col] = scaler.fit_transform(column_2d).flatten()


# -------------------------------
# Save transformed datasets
# -------------------------------
# Write each frame to the working directory without the index column, in
# dataset order.
for frame, out_name in (
    (df1, "1.crop_production_features.csv"),
    (df2, "2.crop_recommendation_features.csv"),
    (df3, "3.agriculture_dataset_features.csv"),
):
    frame.to_csv(out_name, index=False)

# Report the files that were written, one per line.
print(
    "Feature engineering completed.New files saved:",
    "- 1.crop_production_features.csv",
    "- 2.crop_recommendation_features.csv",
    "- 3.agriculture_dataset_features.csv",
    sep="\n",
)


Feature engineering completed.New files saved:
- 1.crop_production_features.csv
- 2.crop_recommendation_features.csv
- 3.agriculture_dataset_features.csv
