In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pyeo import classification as cls

from tpot import TPOTClassifier

from sklearn.svm import SVC

from sklearn.model_selection import train_test_split

  from numpy.core.umath_tests import inner1d


In [4]:
band_labels = ["ndvi","ci","psri","gndvi","s2_rep","ireci","s2_b", "s2_g", "s2_r", "s2_nir", "hv", "vv", "segs"]

training_data = pd.read_csv("data/training_sigs_12_bands_and_segs.csv", names=
                           ["class"]+band_labels).astype(np.uint32)

In [5]:
training_data

Unnamed: 0,class,ndvi,ci,psri,gndvi,s2_rep,ireci,s2_b,s2_g,s2_r,s2_nir,hv,vv,segs
0,3,5506,8515,919,5570,715,2301,438,592,603,2081,6223,23284,871026
1,3,4915,7483,1328,5023,722,2266,535,757,779,2285,4225,20804,871026
2,3,3987,6554,2173,4750,726,1948,590,832,1005,2338,3983,14438,49552
3,3,6586,13830,460,5982,722,4148,461,684,560,2721,3785,28567,350575
4,3,6149,13607,690,5820,721,3783,463,677,611,2563,3162,25760,350575
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238030,1,6851,16527,320,6417,718,2759,258,354,303,1622,6451,32767,289291
238031,1,6987,15728,570,6512,725,3308,262,412,346,1951,4950,27877,289291
238032,1,6677,14893,376,6123,726,3387,353,496,411,2063,4279,22309,289291
238033,1,6304,13895,662,5972,726,3354,373,534,480,2118,5345,22062,289291


In [6]:
def sample_class(class_df):
    return class_df.sample(150)
    
sampled_data = training_data.groupby('class').apply(sample_class).reset_index(1, drop=True)

In [7]:
sampled_data

Unnamed: 0_level_0,class,ndvi,ci,psri,gndvi,s2_rep,ireci,s2_b,s2_g,s2_r,s2_nir,hv,vv,segs
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,1,6757,17324,665,6042,719,3232,247,450,353,1824,5377,15847,257028
1,1,6930,17797,26,6065,726,3606,361,493,365,2013,4410,20460,277644
1,1,7045,20167,74,6429,724,4400,369,479,382,2204,4387,32767,282614
1,1,6958,23011,156,6365,724,4473,355,473,382,2130,4540,30010,675113
1,1,7100,21290,54,6115,725,4905,393,573,403,2377,6967,18724,710412
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7,7,841,1311,5663,3968,684,433,676,1124,2199,2603,5870,32767,354784
7,7,1149,1910,5439,4248,710,484,533,844,1660,2091,7350,32767,354785
7,7,3733,5418,2432,4737,722,1816,620,878,1122,2459,6664,28942,41744
7,7,1294,2246,3982,2865,713,668,1009,1444,2007,2604,2309,8557,354730


In [8]:
#%matplotlib inline
#sampled_data.groupby(by="class").mean().transpose().plot()

In [9]:
#for i in range(0,12):
#    training_data.plot.scatter(x='class', y=band_labels[i])

In [10]:
labels = sampled_data['class']
features = sampled_data.loc[:,'ndvi':]

In [11]:
labels

class
1    1
1    1
1    1
1    1
1    1
    ..
7    7
7    7
7    7
7    7
7    7
Name: class, Length: 1050, dtype: uint32

In [12]:
f_train, f_test, l_train, l_test = train_test_split(features, labels)
model = SVC(kernel='rbf')
model.fit(f_train, l_train)
print(model.score(f_test, l_test))

0.11787072243346007


In [14]:
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
C_range = np.logspace(0, 1, 10, base=10)  # base = 2 for a fine tuning
gamma_range = np.logspace(0, 128, 10, base=10)
param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv, n_jobs=6)
grid.fit(f_train, l_train)
grid.score(f_test, l_test)

0.11787072243346007

In [15]:
#Stripping out k_neighbours; it tends to overfit.
from tpot.config.classifier import classifier_config_dict
classifier_config_dict.pop('sklearn.neighbors.KNeighborsClassifier')

searcher = TPOTClassifier(generations=40, population_size=40, cv=5,n_jobs = -1, verbosity=2, warm_start=True,
                          config_dict = classifier_config_dict,
                          periodic_checkpoint_folder = "models/tpot_interim")
searcher.fit(f_train, l_train)
searcher.score(f_test, l_test)



HBox(children=(IntProgress(value=0, description='Optimization Progress', max=1640, style=ProgressStyle(descrip…

Generation 1 - Current best internal CV score: 0.7039317296428594
Generation 2 - Current best internal CV score: 0.7242526353512739
Generation 3 - Current best internal CV score: 0.7242526353512739
Generation 4 - Current best internal CV score: 0.7329540493739204
Generation 5 - Current best internal CV score: 0.7329540493739204
Generation 6 - Current best internal CV score: 0.7329540493739204
Generation 7 - Current best internal CV score: 0.7329540493739204
Generation 8 - Current best internal CV score: 0.7329540493739204
Generation 9 - Current best internal CV score: 0.7444282251265174
Generation 10 - Current best internal CV score: 0.7444282251265174
Generation 11 - Current best internal CV score: 0.7444282251265174
Generation 12 - Current best internal CV score: 0.7444282251265174
Generation 13 - Current best internal CV score: 0.7444282251265174
Generation 14 - Current best internal CV score: 0.7444282251265174
Generation 15 - Current best internal CV score: 0.74448363882356
Genera

0.6806083650190115

In [16]:
searcher.export("gradient_booster_v2.py")

In [None]:
# Alright, let's try gridsearch instead

param_grid = { 
    'n_estimators': [200, 700],
    'max_features': ['auto', 'sqrt', 'log2']
}

CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)


In [82]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
searcher.export('tpot_v2_nomalised.pkl')
model = RandomForestClassifier(LogisticRegression(RobustScaler(ZeroCount(input_matrix)), C=15.0, dual=True, penalty=l2), bootstrap=True, criterion=gini, max_features=0.9500000000000001, min_samples_leaf=7, min_samples_split=3, n_estimators=100)
model.score(f_test, l_test)

0.6342857142857142

In [47]:
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier

# Average CV score on the training set was:0.8129780700079303
exported_pipeline = make_pipeline(
    make_union(
        StackingEstimator(estimator=GaussianNB()),
        FunctionTransformer(copy)
    ),
    ExtraTreesClassifier(bootstrap=False, criterion="gini", max_features=0.4, min_samples_leaf=3, min_samples_split=2, n_estimators=100)
)

training_data = training_data[training_data['class'] != 5]
labels = training_data['class']
features = training_data.loc[:,'band_1':].drop(["segment_id"], axis = 1)


#model = exported_pipeline.fit(features, labels)
#joblib.dump(model, "extra_trees_no_builtup.pkl")

Unnamed: 0,band_1,band_2,band_3,band_4,band_5,band_6,band_7,band_8,band_9,band_10,band_11,band_12
0,7573,24811,47.0,6821,724,6674,382,522,381,2767,8229,30929
1,7977,31234,48.0,7248,726,9051,374,516,363,3235,4049,23925
2,7172,22947,145.0,6467,723,5725,401,553,425,2595,7395,28018
3,7807,26235,107.0,6919,724,9087,428,627,423,3455,8403,23644
4,7722,26577,48.0,6921,724,8942,444,627,442,3447,9040,31507
...,...,...,...,...,...,...,...,...,...,...,...,...
14282,7301,25182,61.0,6574,723,5050,337,445,336,2160,8320,24300
14283,7301,25182,61.0,6574,723,5050,337,445,336,2160,8320,24300
14284,7330,23172,101.0,6497,724,5368,370,509,369,2398,5510,28402
14285,6551,16785,302.0,6059,723,3944,420,560,476,2284,6139,21726


In [70]:
import joblib
from pyeo.classification import classify_image
from pyeo.filesystem_utilities import init_log
init_log("model_exploration.log")
image_path = "data/s2_20180219_testsite_vegIndex_s1_clipped.tif"
output_path = "outputs/extra_trees_test.tif"
model_path = "last_tested_model.pkl"
joblib.dump(model, model_path)
classify_image(image_path, model_path, output_path, apply_mask=False)

2019-10-02 10:33:36,101: INFO: ****PROCESSING START****
2019-10-02 10:33:36,151: INFO: Classifying file: data/s2_20180219_testsite_vegIndex_s1_clipped.tif
2019-10-02 10:33:36,154: INFO: Saved model     : last_tested_model.pkl
2019-10-02 10:33:36,190: INFO: Created classification image file: outputs/extra_trees_test.tif
2019-10-02 10:33:36,194: INFO: Reshaping image from GDAL to Scikit-Learn dimensions
2019-10-02 10:33:36,194: INFO: Finding good pixels without missing values
2019-10-02 10:33:36,197: INFO: image_array.shape = (1329696, 12)
2019-10-02 10:33:36,652: INFO: No. good values: 1323130
2019-10-02 10:33:36,652: INFO: Not worth filtering nodata, skipping.
2019-10-02 10:33:36,653: INFO:    All  samples: 1329696
2019-10-02 10:33:36,663: INFO:    Good samples: 1329696
2019-10-02 10:33:36,669: INFO:    Number of chunks 10 Chunk size 132969 Chunk residual 6
2019-10-02 10:33:36,671: INFO:    Classifying chunk 0 of size 132969
2019-10-02 10:33:38,005: INFO:    Classifying chunk 1 of size

'outputs/extra_trees_test.tif'