In [55]:
import pandas as pd
import numpy as np

df = pd.read_csv("final_district_dataset.csv")
df.head()


Unnamed: 0,District,Year,Avg_Rainfall,Area_Sown,Wheat_Percentage,Avg_Temperature,Crop_Yield,Irrigation_Area
0,PK203,1981,3.423858,,,20.206408,,
1,PK203,1982,6.698594,,,19.510198,,
2,PK203,1983,3.331681,,,19.611718,,
3,PK203,1984,2.419928,,,19.920437,,
4,PK203,1985,2.637431,,,20.242908,,


In [56]:
# Drop meaningless columns
df = df.drop(columns=["Area_Sown", "Wheat_Percentage"])

# Fill rainfall and temperature using district-wise averages
df["Avg_Rainfall"] = df.groupby("District")["Avg_Rainfall"].transform(
    lambda x: x.fillna(x.mean())
)

df["Avg_Temperature"] = df.groupby("District")["Avg_Temperature"].transform(
    lambda x: x.fillna(x.mean())
)

# Crop_Yield: Fill with district median
df["Crop_Yield"] = df.groupby("District")["Crop_Yield"].transform(
    lambda x: x.fillna(x.median())
)

# Irrigation_Area: DO NOT FILL - never modify target
# Drop rows where target missing
df = df.dropna(subset=["Irrigation_Area"])

In [57]:
df.duplicated(subset=["District", "Year"]).sum()

df = df.groupby(["District","Year"], as_index=False).agg({
    "Avg_Rainfall": "mean",
    "Avg_Temperature": "mean",
    "Crop_Yield": "mean",
    "Irrigation_Area": "mean"
})

In [58]:
# Encode district
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df["District_Code"] = le.fit_transform(df["District"])

In [59]:
# Train/Test Split
train_df = df[df["Year"] <= 2008]
test_df  = df[df["Year"] > 2008]

In [60]:
from sklearn.preprocessing import StandardScaler
num_cols = ["Avg_Rainfall", "Avg_Temperature", "Crop_Yield"]

scaler = StandardScaler()
scaler.fit(train_df[num_cols])

train_df[num_cols] = scaler.transform(train_df[num_cols])
test_df[num_cols]  = scaler.transform(test_df[num_cols])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[num_cols] = scaler.transform(train_df[num_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[num_cols]  = scaler.transform(test_df[num_cols])


In [61]:
# Feature selection
FEATURES = ["District_Code", "Year",
            "Avg_Rainfall", "Avg_Temperature", "Crop_Yield"]

TARGET = "Irrigation_Area"

X_train = train_df[FEATURES]
y_train = train_df[TARGET]

X_test  = test_df[FEATURES]
y_test  = test_df[TARGET]