In [None]:
# Mount Google Drive at /content/drive so the CSV files read later in this notebook
# (all under /content/drive/MyDrive) are accessible. Only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import torch
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import os

**Load and Preprocess Training Data**

In [None]:
# Read the labelled crop-sample table from Drive and preview its first rows.
train_csv_path = '/content/drive/MyDrive/Sentinel2_CropSamples_FullYear_Corrected - Sentinel2_CropSamples_FullYear_Corrected.csv'
train_df = pd.read_csv(train_csv_path)
train_df.head()


Unnamed: 0,system:index,blue,crop_type,green,nir,nir08,red,rededge1,rededge2,rededge3,swir16,swir22,time,unique_id,x,y,.geo
0,20210103T101411_20210103T102025_T31NBH_0000000...,534,cocoa,636,1217,1455,626,813,1125,1255,1222,845,2021-01-03,PIXEL_0001,0.957526,6.899852,"{""type"":""MultiPoint"",""coordinates"":[]}"
1,20210103T101411_20210103T102025_T31NBH_0000000...,534,cocoa,636,1217,1455,626,813,1125,1255,1222,845,2021-01-03,PIXEL_0001,0.957526,6.899852,"{""type"":""MultiPoint"",""coordinates"":[]}"
2,20210103T101411_20210103T102025_T31NBH_0000000...,534,cocoa,636,1217,1455,626,813,1125,1255,1222,845,2021-01-03,PIXEL_0001,0.957526,6.899852,"{""type"":""MultiPoint"",""coordinates"":[]}"
3,20210103T101411_20210103T102025_T31NBH_0000000...,534,cocoa,636,1217,1455,626,813,1125,1255,1222,845,2021-01-03,PIXEL_0001,0.957526,6.899852,"{""type"":""MultiPoint"",""coordinates"":[]}"
4,20210103T101411_20210103T102025_T31NBH_0000000...,534,cocoa,636,1217,1455,626,813,1125,1255,1222,845,2021-01-03,PIXEL_0001,0.957526,6.899852,"{""type"":""MultiPoint"",""coordinates"":[]}"


In [None]:
# Read the unlabelled test observations from Drive and preview the first rows.
test_csv_path = '/content/drive/MyDrive/test (6).csv'
test_df = pd.read_csv(test_csv_path)
test_df.head()

Unnamed: 0,unique_id,time,x,y,red,nir,swir16,swir22,blue,green,rededge1,rededge2,rededge3,nir08
0,ID_01FHV4,2018-01-03 10:59:22.851,-296455.0,846395.0,0.292,0.3686,0.4173,0.3869,0.2488,0.2708,0.3211,0.3555,0.3752,0.3862
1,ID_01FHV4,2018-01-03 10:59:22.851,-296455.0,846395.0,0.292,0.3686,0.4173,0.3869,0.2488,0.2708,0.3211,0.3555,0.3752,0.3862
2,ID_01FHV4,2018-02-12 10:59:25.232,-296455.0,846395.0,0.351,0.3426,0.4817,0.4577,0.2538,0.2914,0.3684,0.3484,0.3588,0.3628
3,ID_01FHV4,2018-02-12 10:59:25.232,-296455.0,846395.0,0.351,0.3426,0.4817,0.4577,0.2538,0.2914,0.3684,0.3484,0.3588,0.3628
4,ID_01FHV4,2018-03-14 10:59:24.436,-296455.0,846395.0,0.5312,0.6296,0.6643,0.5882,0.5244,0.5308,0.6016,0.6217,0.6401,0.6404


In [None]:
# Count raw observation rows per crop label (before any per-pixel aggregation).
train_df['crop_type'].value_counts()

Unnamed: 0_level_0,count
crop_type,Unnamed: 1_level_1
cocoa,105270
rubber,104268
oil,97465


In [None]:
# Parse the 'time' column into pandas datetimes, overwriting the original string column.
train_df['time'] = pd.to_datetime(train_df['time'])

In [None]:
# NDVI (Normalized Difference Vegetation Index) = (NIR - RED) / (NIR + RED).
# A small epsilon (1e-6) is added to the denominator so that rows where both
# bands are zero do not trigger a division by zero.
nir_band = train_df['nir']
red_band = train_df['red']
train_df['ndvi'] = (nir_band - red_band) / (nir_band + red_band + 1e-6)

In [None]:
# Columns that get mean-pooled per pixel: the ten spectral bands plus NDVI.
# 'system:index', 'x', 'y' and '.geo' are left out on purpose; they are not used
# as classification features after aggregation.
features_to_aggregate = [
    'blue',
    'green',
    'nir',
    'nir08',
    'red',
    'rededge1',
    'rededge2',
    'rededge3',
    'swir16',
    'swir22',
    'ndvi',
]


In [None]:
# Parse the test 'time' strings into a new 'timestamp' column; the raw 'time' column is kept.
# NOTE(review): 'timestamp' is not referenced by any later cell visible in this notebook.
test_df["timestamp"] = pd.to_datetime(test_df["time"])

In [None]:
# Mean-pool every feature over all observations belonging to the same pixel.
# 'crop_type' is included in the group key only so that the label stays attached
# to each 'unique_id' row in the result.
group_keys = ['unique_id', 'crop_type']
pooled_features = train_df.groupby(group_keys)[features_to_aggregate].mean()
aggregated_df = pooled_features.reset_index()

print("\nAggregated DataFrame (Mean Pooled Features):")
print(aggregated_df.head())
print("\nAggregated DataFrame Info:")
aggregated_df.info()


Aggregated DataFrame (Mean Pooled Features):
    unique_id crop_type         blue        green          nir        nir08  \
0  PIXEL_0001     cocoa   826.529412  1067.176471  2260.470588  2546.529412   
1  PIXEL_0002     cocoa  1033.710526  1232.026316  2580.026316  2789.289474   
2  PIXEL_0003     cocoa   806.142857  1076.595238  2820.785714  3032.357143   
3  PIXEL_0004     cocoa  1116.600000  1272.133333  2511.466667  2814.066667   
4  PIXEL_0005     cocoa   677.333333   874.611111  2927.777778  3136.111111   

           red     rededge1     rededge2     rededge3       swir16  \
0  1164.823529  1484.235294  2028.764706  2278.705882  2888.176471   
1  1266.342105  1631.631579  2290.763158  2552.236842  2865.710526   
2   981.190476  1465.738095  2425.571429  2765.857143  2641.619048   
3  1314.933333  1704.333333  2252.533333  2513.333333  3138.266667   
4   762.722222  1198.444444  2382.166667  2858.277778  2181.611111   

        swir22      ndvi  
0  2168.235294  0.315859  
1  2

In [None]:
# Number of pooled pixels per crop label after aggregation.
pooled_crop_counts = aggregated_df['crop_type'].value_counts()
print("\nCrop Type Distribution (Aggregated Data):")
print(pooled_crop_counts)


Crop Type Distribution (Aggregated Data):
crop_type
cocoa     100
rubber    100
oil       100
Name: count, dtype: int64


In [None]:
# Target vector: the crop name of each pooled pixel.
# Feature matrix: every pooled column except the label and 'unique_id', which is
# only an identifier and carries no information for the model.
non_feature_columns = ['unique_id', 'crop_type']
y = aggregated_df['crop_type']
X = aggregated_df.drop(columns=non_feature_columns)

In [None]:
# Turn the crop-name strings into integer class ids for the classifiers.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

In [None]:
# Recover the mapping from integer label back to crop name: the name at position i
# of label_encoder.classes_ is the class encoded as i.
crop_classes = label_encoder.classes_
print(f"\nEncoded Crop Types: {list(enumerate(crop_classes))}")


Encoded Crop Types: [(0, 'cocoa'), (1, 'oil'), (2, 'rubber')]


In [None]:
from sklearn.preprocessing import  StandardScaler

In [None]:
    # Fit the standardizer on the full training feature matrix and keep the fitted
    # object: the same scaler is applied to the test features later on.
    scaler = StandardScaler()
    scaler.fit(X)
    X_train_scaled_full = scaler.transform(X)

    print("Training data preprocessing complete.")
    print(f"Shape of full training features: {X.shape}")


Training data preprocessing complete.
Shape of full training features: (300, 11)


In [None]:
# NDVI for the test observations: (NIR - RED) / (NIR + RED), with a 1e-6 guard in
# the denominator against division by zero.
test_nir = test_df['nir']
test_red = test_df['red']
test_df['ndvi'] = (test_nir - test_red) / (test_nir + test_red + 1e-6)

In [None]:
# Mean-pool each feature over all test observations that share a 'unique_id'.
test_pixel_groups = test_df.groupby('unique_id')
aggregated_df_test = test_pixel_groups[features_to_aggregate].mean().reset_index()

In [None]:
    # Build the model-input matrices for the test pixels.
    #
    # Unit fix: the pooled training bands are on an integer scale (hundreds to
    # thousands, e.g. blue ~ 800, nir ~ 2500) whereas the pooled test bands are
    # fractions (~0.5-0.7). Feeding the fractions directly into a scaler / forest
    # fitted on the integer scale places every test pixel at the same extreme point
    # of feature space, so all models return (near-)identical probabilities for
    # every row. The test bands are therefore brought onto the training scale first.
    # NOTE(review): 10000 is the usual Sentinel-2 reflectance quantification factor;
    # confirm that this is how the test file was produced. Pooled test means look
    # high even after rescaling, so cloud filtering or another normalisation may
    # also be needed - verify before trusting the predictions.
    TEST_BAND_SCALE = 10_000
    test_features_raw = aggregated_df_test.drop('unique_id', axis=1)
    # NDVI is a ratio of bands, so it is already comparable between both datasets.
    band_columns = [col for col in test_features_raw.columns if col != 'ndvi']
    # Rescale only when the bands really are fractional, so re-running this cell on
    # a test file that is already on the integer scale does not inflate it.
    if test_features_raw[band_columns].abs().max().max() < 10:
        X_test_for_prediction = test_features_raw.assign(
            **{col: test_features_raw[col] * TEST_BAND_SCALE for col in band_columns}
        )
    else:
        X_test_for_prediction = test_features_raw

    # Standardize with the scaler fitted on the training features (never refit on test data).
    X_test_scaled_for_prediction = scaler.transform(X_test_for_prediction)

    print(aggregated_df_test.head())
    print(f"\nShape of test features for prediction: {X_test_for_prediction.shape}")

   unique_id      blue     green       nir     nir08       red  rededge1  \
0  ID_002AIV  0.584356  0.575075  0.678630  0.700476  0.552039  0.609075   
1  ID_0042EI  0.536195  0.525035  0.636570  0.661719  0.504694  0.548404   
2  ID_008SD4  0.557038  0.544186  0.646852  0.667090  0.520859  0.573803   
3  ID_00AQE9  0.482738  0.479424  0.586478  0.612003  0.467276  0.516267   
4  ID_00F4A9  0.583594  0.564251  0.650874  0.685945  0.546873  0.591151   

   rededge2  rededge3    swir16    swir22      ndvi  
0  0.665725  0.689273  0.459119  0.369510  0.178821  
1  0.609446  0.645887  0.470162  0.381506  0.180023  
2  0.622829  0.651830  0.464463  0.382437  0.169235  
3  0.566692  0.591748  0.471859  0.376031  0.175648  
4  0.642767  0.675590  0.484421  0.390951  0.154630  

Shape of test features for prediction: (10523, 11)


**Training Models on the Full Training Data**

In [None]:
from sklearn.linear_model import LogisticRegression

***Model 1: Logistic Regression***

In [None]:
# Logistic regression fitted on the standardized training features.
lr_model = LogisticRegression(
    random_state=42,
    max_iter=1000,
)
lr_model.fit(X_train_scaled_full, y_encoded)

***Model 2: Random Forest***

In [None]:
# Random forest fitted on the unscaled training features (X, not X_train_scaled_full).
rf_model = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
    n_estimators=100,
)
rf_model.fit(X, y_encoded)

***Model 3: MLP Classifier***

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# Single-hidden-layer MLP (64 units) fitted on the standardized training features.
# With early_stopping=True, training halts once the internal validation score has
# not improved for n_iter_no_change consecutive epochs.
mlp_settings = {
    'hidden_layer_sizes': (64,),
    'activation': 'relu',
    'solver': 'adam',
    'alpha': 0.0001,
    'learning_rate_init': 0.001,
    'max_iter': 500,
    'early_stopping': True,
    'n_iter_no_change': 10,
    'random_state': 42,
}
mlp_model = MLPClassifier(**mlp_settings)
mlp_model.fit(X_train_scaled_full, y_encoded)

**Generating Predictions**

In [None]:
# Per-class probabilities from the logistic-regression model, one row per test pixel.
lr_prob_columns = [f"LR_{c}_prob" for c in crop_classes]
lr_pred_proba = lr_model.predict_proba(X_test_scaled_for_prediction)
lr_pred_df = pd.DataFrame(lr_pred_proba, columns=lr_prob_columns)
lr_pred_df['ID'] = aggregated_df_test['unique_id']
# Fix the output column order: ID first, then cocoa, oil, rubber.
lr_output_columns = ['ID'] + [f"LR_{c}_prob" for c in ['cocoa', 'oil', 'rubber']]
lr_pred_df = lr_pred_df[lr_output_columns]
print("\nLogistic Regression Predictions (first 5 rows):")
print(lr_pred_df.head())


Logistic Regression Predictions (first 5 rows):
          ID  LR_cocoa_prob  LR_oil_prob  LR_rubber_prob
0  ID_002AIV       0.823328     0.143194        0.033478
1  ID_0042EI       0.822529     0.144041        0.033430
2  ID_008SD4       0.829540     0.136602        0.033858
3  ID_00AQE9       0.825423     0.140973        0.033604
4  ID_00F4A9       0.838518     0.127059        0.034424


In [None]:
# Per-class probabilities from the random forest; it was trained on unscaled
# features, so it is given the unscaled test matrix as well.
rf_prob_columns = [f"RF_{c}_prob" for c in crop_classes]
rf_pred_proba = rf_model.predict_proba(X_test_for_prediction)
rf_pred_df = pd.DataFrame(rf_pred_proba, columns=rf_prob_columns)
rf_pred_df['ID'] = aggregated_df_test['unique_id']
# Fix the output column order: ID first, then cocoa, oil, rubber.
rf_output_columns = ['ID'] + [f"RF_{c}_prob" for c in ['cocoa', 'oil', 'rubber']]
rf_pred_df = rf_pred_df[rf_output_columns]
print("\nRandom Forest Predictions (first 5 rows):")
print(rf_pred_df.head())


Random Forest Predictions (first 5 rows):
          ID  RF_cocoa_prob  RF_oil_prob  RF_rubber_prob
0  ID_002AIV           0.28         0.34            0.38
1  ID_0042EI           0.28         0.34            0.38
2  ID_008SD4           0.28         0.34            0.38
3  ID_00AQE9           0.28         0.34            0.38
4  ID_00F4A9           0.28         0.34            0.38


In [None]:
# Per-class probabilities from the MLP, computed on the standardized test features.
mlp_prob_columns = [f"MLP_{c}_prob" for c in crop_classes]
mlp_pred_proba = mlp_model.predict_proba(X_test_scaled_for_prediction)
mlp_pred_df = pd.DataFrame(mlp_pred_proba, columns=mlp_prob_columns)
mlp_pred_df['ID'] = aggregated_df_test['unique_id']
# Fix the output column order: ID first, then cocoa, oil, rubber.
mlp_output_columns = ['ID'] + [f"MLP_{c}_prob" for c in ['cocoa', 'oil', 'rubber']]
mlp_pred_df = mlp_pred_df[mlp_output_columns]
print("\nMLP Classifier Predictions (first 5 rows):")
print(mlp_pred_df.head())


MLP Classifier Predictions (first 5 rows):
          ID  MLP_cocoa_prob  MLP_oil_prob  MLP_rubber_prob
0  ID_002AIV        0.991585      0.002537         0.005878
1  ID_0042EI        0.991557      0.002541         0.005902
2  ID_008SD4        0.991810      0.002509         0.005681
3  ID_00AQE9        0.991662      0.002528         0.005811
4  ID_00F4A9        0.992139      0.002466         0.005394


In [None]:
# Write the logistic-regression probabilities to a CSV in the current working
# directory; index=False keeps the row index out of the file.
lr_output_filename = 'logistic_regression_predictions.csv'
lr_pred_df.to_csv(lr_output_filename, index=False)
print(f"Logistic Regression predictions saved to '{lr_output_filename}'")

Logistic Regression predictions saved to 'logistic_regression_predictions.csv'


In [None]:
# Write the random-forest probabilities to a CSV in the current working
# directory; index=False keeps the row index out of the file.
rf_output_filename = 'random_forest_predictions.csv'
rf_pred_df.to_csv(rf_output_filename, index=False)
print(f"Random Forest predictions saved to '{rf_output_filename}'")

Random Forest predictions saved to 'random_forest_predictions.csv'


In [None]:
# Write the MLP probabilities to a CSV in the current working directory;
# index=False keeps the row index out of the file.
mlp_output_filename = 'mlp_predictions.csv'
mlp_pred_df.to_csv(mlp_output_filename, index=False)
print(f"MLP Classifier predictions saved to '{mlp_output_filename}'")

MLP Classifier predictions saved to 'mlp_predictions.csv'
