<a href="https://colab.research.google.com/github/liangchow/Zindi-Crop/blob/main/test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set Up Worksheet and Import Libraries

In [3]:
# Clone Gitub repository to Colab
from google.colab import drive
drive.mount('/content/drive')

!apt-get install git
!git clone https://github.com/liangchow/Zindi-Crop.git

# Install library
!pip install rasterio

Mounted at /content/drive
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git is already the newest version (1:2.34.1-1ubuntu1.12).
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.
Cloning into 'Zindi-Crop'...
remote: Enumerating objects: 43, done.[K
remote: Counting objects: 100% (43/43), done.[K
remote: Compressing objects: 100% (41/41), done.[K
remote: Total 43 (delta 20), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (43/43), 2.65 MiB | 5.57 MiB/s, done.
Resolving deltas: 100% (20/20), done.
Collecting rasterio
  Downloading rasterio-1.4.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.1 kB)
Collecting affine (from rasterio)
  Downloading affine-2.4.0-py3-none-any.whl.metadata (4.0 kB)
Collecting cligj>=0.5 (from rasterio)
  Downloading cligj-0.7.2-py3-none-any.whl.metadata (5.0 kB)
Collecting click-plugins (from rasterio)
  Downloading click_plugins-1.1.1-py2.py3-none

In [4]:
# Import libraries
import os
import rasterio
import numpy as np
import pandas as pd

from pathlib import Path
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from rasterio.transform import from_origin

# Show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

# File Preview

In [5]:
# Project folder on the mounted Google Drive; list the names of the entries it contains
DATA_DIR = Path('/content/drive/MyDrive/Zindi-Crop')
[entry.name for entry in DATA_DIR.iterdir()]

['SampleSubmission.csv',
 'Train.csv',
 'StarterNotebook.ipynb',
 'Test.csv',
 'Slides',
 'introduction_to_remote_sensing.ipynb']

In [6]:
# Read the three competition CSVs from DATA_DIR
train_df, test_df, sample_submission = (
    pd.read_csv(DATA_DIR / csv_name)
    for csv_name in ('Train.csv', 'Test.csv', 'SampleSubmission.csv')
)

# Display the (rows, columns) shape of each frame
tuple(frame.shape for frame in (train_df, test_df, sample_submission))

((7564059, 20), (4946879, 19), (16960, 2))

In [7]:
# Display the training rows ordered by ID (returns a sorted copy; train_df is not modified)
train_df.sort_values('ID')

Unnamed: 0,ID,time,Green,Blue,RED,NIR,SWIR1,SWIR2,Red_Edge,Aerosols,Red_Edge_2,Red_Edge_3,Red_Edge_4,Water_vapor,Cirrus,NDVI,NDMI,NDWI,CI,Target
0,1D_0000,2021-01-04,,,,,,,,,,,,,,,,,,0.0
177,1D_0000,2023-06-08,0.4178,0.4388,0.4105,0.4846,0.3239,0.2089,0.3798,0.4197,0.4449,0.5062,0.5364,0.0914,0.0084,0.082784,0.198763,-0.074025,-0.038846,0.0
178,1D_0000,2023-06-13,0.1229,0.1391,0.0972,0.3416,0.2005,0.1116,0.1368,0.1867,0.2771,0.3659,0.4141,0.0689,0.0007,0.556974,0.260284,-0.470829,0.169231,0.0
179,1D_0000,2023-06-18,0.2705,0.2995,0.2393,0.3743,0.2186,0.1229,0.2282,0.2668,0.2963,0.3484,0.3618,0.0539,0.0023,0.220013,0.262608,-0.160980,-0.023743,0.0
180,1D_0000,2023-06-23,0.7293,0.7446,0.7718,0.7480,0.5132,0.3589,0.7287,0.7343,0.7572,0.8280,0.8548,0.1384,0.0014,-0.015660,0.186172,-0.012658,-0.028724,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7564021,id_361599946,2021-05-16,,,,,,,,,,,,,,,,,,1.0
7564022,id_361599946,2021-05-21,,,,,,,,,,,,,,,,,,1.0
7564023,id_361599946,2021-05-26,0.0873,0.1072,0.0678,0.1659,0.1293,0.0691,0.0814,0.1422,0.1493,0.1810,0.2046,0.0261,0.0041,0.419769,0.123984,-0.310427,0.091153,1.0
7564008,id_361599946,2021-03-12,,,,,,,,,,,,,,,,,,1.0


In [8]:
# Display the testing rows ordered by ID (returns a sorted copy; test_df is not modified)
test_df.sort_values('ID')

Unnamed: 0,ID,time,Green,Blue,RED,NIR,SWIR1,SWIR2,Red_Edge,Aerosols,Red_Edge_2,Red_Edge_3,Red_Edge_4,Water_vapor,Cirrus,NDVI,NDMI,NDWI,CI
0,1D_0005,2021-01-04,0.1199,0.1391,0.0978,0.2329,0.1782,0.0989,0.1146,0.2127,0.2074,0.2675,0.3081,0.0503,0.0011,0.408527,0.133058,-0.320295,0.079096
177,1D_0005,2023-06-08,0.4092,0.4217,0.4175,0.4689,0.3262,0.2094,0.3934,0.4581,0.4582,0.5218,0.5503,0.0878,0.0089,0.057987,0.179474,-0.067988,-0.029720
178,1D_0005,2023-06-13,0.1580,0.1735,0.1384,0.3286,0.2159,0.1301,0.1596,0.1756,0.2680,0.3494,0.3785,0.0618,0.0009,0.407280,0.206979,-0.350596,0.071141
179,1D_0005,2023-06-18,0.2242,0.2279,0.2273,0.3142,0.2237,0.1375,0.2211,0.2637,0.3047,0.3664,0.3852,0.0501,0.0023,0.160480,0.168247,-0.167162,-0.013827
180,1D_0005,2023-06-23,0.6563,0.6897,0.6607,0.6873,0.3671,0.1976,0.6261,0.6293,0.6489,0.7125,0.7299,0.0999,0.0014,0.019733,0.303680,-0.023072,-0.026888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4946690,id_ffe082a9c,2022-03-30,0.4544,0.4499,0.4905,0.5786,0.5467,0.4126,0.4801,0.4312,0.5598,0.6304,0.6718,0.1032,0.0098,0.082406,0.028348,-0.120232,-0.010715
4946689,id_ffe082a9c,2022-03-25,0.1727,0.1904,0.1511,0.2856,0.1731,0.0962,0.1644,0.2244,0.2580,0.3037,0.3323,0.0633,0.0181,0.307992,0.245258,-0.246345,0.042155
4946688,id_ffe082a9c,2022-03-20,0.2833,0.3013,0.2677,0.3784,0.1883,0.1504,0.2823,0.3212,0.3512,0.3908,0.4105,0.2259,0.1645,0.171336,0.335451,-0.143721,0.026545
4946694,id_ffe082a9c,2022-04-19,0.0697,0.0863,0.0438,0.1364,0.0659,0.0289,0.0536,0.1189,0.1184,0.1546,0.1686,0.0177,0.0019,0.513873,0.348492,-0.323629,0.100616


In [9]:
# Display the first five rows of the sample submission
sample_submission.iloc[:5]

Unnamed: 0,ID,Target
0,1D_0005,0
1,1D_000A,0
2,1D_000D,0
3,1D_000E,0
4,1D_0015,0


In [10]:
# Each ID is one pixel observed at many dates, so collapse the training rows to a
# single row per ID holding that ID's first non-null Target value

train_targets_df = train_df.groupby('ID', as_index=False)['Target'].first()
train_targets_df

Unnamed: 0,ID,Target
0,1D_0000,0.0
1,1D_0001,0.0
2,1D_0002,0.0
3,1D_0003,0.0
4,1D_0004,0.0
...,...,...
26204,id_3611dde87,1.0
26205,id_3612484c9,1.0
26206,id_3612c368f,1.0
26207,id_3614f56e9,0.0


In [11]:
# Share of pixels in each Target class
# 1: cocoa; 2: palm; 3: forest
# NOTE(review): the labels present in the data are 0.0, 1.0 and 2.0, so the class key
# above may be off by one - confirm the label-to-class mapping
train_targets_df['Target'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
Target,Unnamed: 1_level_1
0.0,0.454653
2.0,0.349498
1.0,0.195849


In [12]:
# Columns to aggregate per pixel across its time series: everything except the
# first two columns (ID, time) and the last column (Target)
agg_cols = list(train_df.columns[2:-1])
agg_cols

['Green',
 'Blue',
 'RED',
 'NIR',
 'SWIR1',
 'SWIR2',
 'Red_Edge',
 'Aerosols',
 'Red_Edge_2',
 'Red_Edge_3',
 'Red_Edge_4',
 'Water_vapor',
 'Cirrus',
 'NDVI',
 'NDMI',
 'NDWI',
 'CI']

## Data Processing

In [15]:
# Order both frames chronologically, modifying them in place
train_df.sort_values('time', inplace=True)
test_df.sort_values('time', inplace=True)

In [20]:
# Training set: fill missing band/index values within each pixel's own time series.
# train_df is sorted by time only, so neighbouring rows belong to different IDs; a
# frame-wide bfill would copy values from an unrelated pixel and would also be applied
# to the Target column. Grouping by ID keeps every fill inside a single pixel.
feature_cols = [col for col in train_df.columns if col not in ('ID', 'time', 'Target')]

# Backward fill: a missing observation takes the same pixel's next valid (later) value.
# Rows keep their time order inside each group, so "next" means later in time.
filled_features = train_df.groupby('ID')[feature_cols].bfill()
# Forward fill what remains: gaps at the end of a pixel's series have no later value
# to borrow, so they take the pixel's last valid value instead.
filled_features = filled_features.groupby(train_df['ID']).ffill()
train_df[feature_cols] = filled_features

# Testing set: Drop NaN
test_df.dropna(axis=0, inplace=True)