In [1]:
import numpy as np
import pandas as pd

import cv2
from pathlib import Path

import matplotlib.pyplot as plt

import os, glob, random
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras

In [2]:
# Project root on the author's machine; every other location is derived from it.
# NOTE(review): absolute local path -- adjust before running on another machine.
project_path    = "/Users/k.choi/Documents/github/csiro_git"
data_path       = os.path.join(project_path, "data")
img_path        = os.path.join(data_path, "train")
# img_list        = glob.glob(os.path.join(img_path, "*.jpg"))
# complete_df = pd.read_csv( project_path + '/stats/complete_df.csv')

# Long-format table: each row carries a single `target_name` / `target` pair.
data_df = pd.read_csv(os.path.join(data_path, "train.csv"))
# image_id is the file name of image_path without directory or extension.
data_df['image_id'] = data_df['image_path'].apply(lambda x: Path(x).stem)

print('project_path : ' , project_path)
print('data_path : '    , data_path)
# print('img_path : '     , img_path)
# print('total number of images : ', len(img_list))

# DataFrame.info() writes its report to stdout and returns None, so passing it
# to print() only appended a stray "None"; print the label, then call it.
print('data_df : ')
data_df.info()

project_path :  /Users/k.choi/Documents/github/csiro_git
data_path :  /Users/k.choi/Documents/github/csiro_git/data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1785 entries, 0 to 1784
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   sample_id      1785 non-null   object 
 1   image_path     1785 non-null   object 
 2   Sampling_Date  1785 non-null   object 
 3   State          1785 non-null   object 
 4   Species        1785 non-null   object 
 5   Pre_GSHH_NDVI  1785 non-null   float64
 6   Height_Ave_cm  1785 non-null   float64
 7   target_name    1785 non-null   object 
 8   target         1785 non-null   float64
 9   image_id       1785 non-null   object 
dtypes: float64(3), object(7)
memory usage: 139.6+ KB
data_df :  None


In [3]:
## Preprocessing: replace the categorical columns with integer codes

species_encoder = LabelEncoder()
state_encoder   = LabelEncoder()

# (printed header, column name, encoder fitted on that column)
categorical_specs = (
    (' -- -- Species -- -- ', 'Species', species_encoder),
    (' -- -- State -- -- ',   'State',   state_encoder),
)

# Overwrite each column in place with the codes produced by its encoder.
for _, column, encoder in categorical_specs:
    data_df[column] = encoder.fit_transform(data_df[column])

# Show the code -> original label mapping so the integers stay interpretable.
for header, column, encoder in categorical_specs:
    print(header)
    for code in np.unique(data_df[column]):
        print(code, encoder.inverse_transform([code]))

 -- -- Species -- -- 
0 ['Clover']
1 ['Fescue']
2 ['Fescue_CrumbWeed']
3 ['Lucerne']
4 ['Mixed']
5 ['Phalaris']
6 ['Phalaris_BarleyGrass_SilverGrass_SpearGrass_Clover_Capeweed']
7 ['Phalaris_Clover']
8 ['Phalaris_Clover_Ryegrass_Barleygrass_Bromegrass']
9 ['Phalaris_Ryegrass_Clover']
10 ['Ryegrass']
11 ['Ryegrass_Clover']
12 ['SubcloverDalkeith']
13 ['SubcloverLosa']
14 ['WhiteClover']
 -- -- State -- -- 
0 ['NSW']
1 ['Tas']
2 ['Vic']
3 ['WA']


In [4]:
# Per-image metadata, taken from the first row of each image_id group.
# NOTE(review): assumes these columns are constant within an image -- confirm.
meta_cols = ['Sampling_Date', 'image_path', 'State', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm']
meta_df = data_df.groupby('image_id').first()[meta_cols].reset_index()

# Targets to pivot from long format (one row per target) to one column each.
target_names = ['Dry_Total_g', 'Dry_Green_g', 'Dry_Dead_g', 'Dry_Clover_g', 'GDM_g']

# For every target: keep (image_id, target), report the row count, and rename
# the generic 'target' column to the target's own name.
target_frames = {}
for target_name in target_names:
    target_rows = data_df.loc[data_df['target_name'] == target_name, ['image_id', 'target']].copy()
    print(len(target_rows))
    target_frames[target_name] = target_rows.rename(columns={'target': target_name})

# Keep the individual frames available under their original names.
dry_df      = target_frames['Dry_Total_g']
green_df    = target_frames['Dry_Green_g']
dead_df     = target_frames['Dry_Dead_g']
clover_df   = target_frames['Dry_Clover_g']
gdm_df      = target_frames['GDM_g']

# Join every target onto the metadata, one column per target.
# validate='one_to_one' makes pandas raise MergeError if an image_id appears
# more than once on either side, instead of silently multiplying rows.
complete_df = meta_df
for target_frame in target_frames.values():
    complete_df = pd.merge(complete_df, target_frame, on='image_id', validate='one_to_one')


357
357
357
357
357


In [None]:
# Fixed seed so the train/validation partition is reproducible.
random_state = 121345

# Regression targets, plus the extra columns that must not be fed as inputs.
target_cols   = ['Dry_Total_g', 'Dry_Green_g', 'Dry_Dead_g', 'Dry_Clover_g', 'GDM_g']
excludes_cols = target_cols + ['image_id', 'Sampling_Date']

# Hold out 20% of the rows of complete_df for validation.
train_df, valid_df = train_test_split(
    complete_df,
    test_size=0.2,
    random_state=random_state,
)

# Inputs are whatever remains after dropping excludes_cols; targets are target_cols.
train_inputs, train_targets = train_df.drop(columns=excludes_cols), train_df[target_cols]
valid_inputs, valid_targets = valid_df.drop(columns=excludes_cols), valid_df[target_cols]


In [6]:
# Display the column labels of train_inputs to check which columns remain as model inputs.
train_inputs.columns

Index(['image_path', 'State', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm'], dtype='object')

In [7]:
# Display the column labels of train_targets to check the set of regression targets.
train_targets.columns


Index(['Dry_Total_g', 'Dry_Green_g', 'Dry_Dead_g', 'Dry_Clover_g', 'GDM_g'], dtype='object')