@@ -816,7 +816,7 @@ def sliding_features(imagesdf,years=np.array([2004,2005,2006,2007,2008,2009,\
# 30%
p30 = np.nanpercentile(image_time_series,30,axis=0)
p30[np.isnan(p30)]=0
save_file(dataset, p90, rows, cols, path, base_date, varname, "perc90")
save_file(dataset, p90, rows, cols, path, base_date, varname, "perc30")

# 70%
p70 = np.nanpercentile(image_time_series,70,axis=0)
@@ -833,6 +833,286 @@ def sliding_features(imagesdf,years=np.array([2004,2005,2006,2007,2008,2009,\

return True

def sliding_features_months(imagesdf,
                            years=np.array([2004, 2005, 2006, 2007, 2008,
                                            2009, 2010, 2011, 2012, 2013]),
                            months=np.array([1, 2, 3, 4, 5, 6, 7, 8, 9,
                                             10, 11, 12]),
                            yeardate_variable="yeardate",
                            monthdate_variable="monthdate",
                            variable=5,
                            quality_variable=7,
                            badqualityvalues=[-1, 2, 3, 255],
                            window=1, fillvalue=0, path=None):
    """
    Compute sliding-window summary rasters for every (year, month) anchor.

    For each year ``y`` in ``years`` and month ``m`` in ``months`` the
    window spans month ``m`` of year ``y - window`` up to (but excluding)
    month ``m`` of year ``y + window + 1``.  For each window it writes
    mean/std/coefficient-of-variation/median rasters (overall and for the
    dry [Dec-Apr] and wet [May-Nov] seasons) plus 20/35/65/80/95th
    percentiles, all via ``save_file`` into ``path/<year>/<month>/``.

    Parameters
    ----------
    imagesdf : pandas.DataFrame
        One row per image; must contain the ``yeardate_variable`` and
        ``monthdate_variable`` columns plus file-path columns.
    years, months : np.ndarray
        Anchor years and months to iterate over.
    yeardate_variable, monthdate_variable : str
        Column names holding each image's year and month.
    variable : int
        Positional index of the column holding the raster path of interest
        (its column name is used as the output variable name).
    quality_variable : int or None
        Positional index of the QA raster path column; ``None`` disables
        quality masking.
    badqualityvalues : list
        QA values flagged as bad pixels (blanked to ``fillvalue``).
    window : int
        Half-width of the sliding window, in years.
    fillvalue : number
        Pixel value treated as no-data and masked out of all statistics.
    path : str
        Output root directory.

    Returns
    -------
    bool
        True on completion.
    """
    # column names of the image table
    colnames = list(imagesdf.columns.values)

    # variable (band) name used in output filenames
    varname = colnames[variable]
    print(varname)

    # positional indices of the date columns
    idx_year = colnames.index(yeardate_variable)
    idx_month = colnames.index(monthdate_variable)

    for i in xrange(np.shape(years)[0]):
        print(i)
        base_date = float(years[i])

        for jm in xrange(np.shape(months)[0]):
            # BUGFIX: the original incremented the loop index before
            # indexing (``months[j]`` after ``j = j + 1``), which skipped
            # the first month and raised IndexError on the last one; the
            # associated ``if j == 0`` branch was dead code.
            month_base_date = float(months[jm])
            print(month_base_date)

            # candidate years: the window itself plus the following year,
            # which contributes the months *before* the anchor month.
            subset = imagesdf.loc[
                (imagesdf.iloc[:, idx_year] == (base_date - window))
                | (imagesdf.iloc[:, idx_year] == base_date)
                | (imagesdf.iloc[:, idx_year] == (base_date + window))
                | (imagesdf.iloc[:, idx_year] == (base_date + window + 1))
            ]

            # trim the window edges: drop months before the anchor month
            # in the leading year, and months >= the anchor month in the
            # trailing year.  For the first month the trailing year is
            # dropped entirely, so no special case is needed.
            subset = subset.drop(
                subset[(subset.yeardate == (base_date - window))
                       & (subset.monthdate < month_base_date)].index)
            subset = subset.drop(
                subset[(subset.yeardate == (base_date + window + 1))
                       & (subset.monthdate >= month_base_date)].index)

            # NOTE(review): hard-coded debug dump, overwritten on every
            # iteration -- kept for behaviour parity, consider removing.
            subset.to_csv("D:/Julian/64_ie_maps/rasters/covariates_month/nreflectance1000/2004/test.csv", sep=',', encoding='utf-8')

            # grid size from the first image of the window (the original
            # hard-coded column 5 here; use ``variable`` for consistency)
            dataset, rows, cols, bands = readtif(subset.iloc[0, variable])

            # full-window masked stack (quality stack currently unused,
            # kept for the commented-out bad-pixel-count product)
            dataset, image_time_series, image_time_seriesq = \
                _stack_masked_images(subset, variable, quality_variable,
                                     badqualityvalues, fillvalue, rows, cols)

            print("first variables")
            _save_summary_stats(dataset, image_time_series, rows, cols, path,
                                base_date, month_base_date, varname, "")

            print("seasonal variables")

            # dry season: Dec-Apr.  The month mask is built on the full
            # table and aligned to ``subset`` by index, as in the original.
            subsubset = subset.loc[
                imagesdf.iloc[:, idx_month].isin([1, 2, 3, 4, 12])]
            sdataset, season_stack, _ = _stack_masked_images(
                subsubset, variable, quality_variable, badqualityvalues,
                fillvalue, rows, cols)
            if sdataset is not None:
                dataset = sdataset
            _save_summary_stats(dataset, season_stack, rows, cols, path,
                                base_date, month_base_date, varname, "dry")

            # wet season: May-Nov
            subsubset = subset.loc[
                imagesdf.iloc[:, idx_month].isin([5, 6, 7, 8, 9, 10, 11])]
            sdataset, season_stack, _ = _stack_masked_images(
                subsubset, variable, quality_variable, badqualityvalues,
                fillvalue, rows, cols)
            if sdataset is not None:
                dataset = sdataset
            _save_summary_stats(dataset, season_stack, rows, cols, path,
                                base_date, month_base_date, varname, "wet")

            print("percentiles")
            _save_percentiles(dataset, image_time_series, rows, cols, path,
                              base_date, month_base_date, varname)

            # release the GDAL handle
            dataset = None

    return True


def _stack_masked_images(subset, variable, quality_variable, badqualityvalues,
                         fillvalue, rows, cols):
    """Read every raster listed in ``subset`` into a masked 2-D stack.

    Returns ``(dataset, stack, qstack)`` where ``dataset`` is the last GDAL
    dataset read (``None`` when ``subset`` is empty), ``stack`` is an
    ``np.ma`` array of shape (n_images, rows*cols) with ``fillvalue``
    pixels masked, and ``qstack`` is a bool array of bad-quality flags
    (``None`` when ``quality_variable`` is None).
    """
    n = len(subset.index)
    stack = np.ma.zeros((n, cols * rows), dtype=np.float64)
    qstack = None
    if quality_variable is not None:
        qstack = np.zeros((n, cols * rows), dtype=bool)
    dataset = None
    for k in xrange(n):
        # read image k of the window
        dataset, rows, cols, bands = readtif(subset.iloc[k, variable])
        band = dataset.GetRasterBand(1)
        band = band.ReadAsArray(0, 0, cols, rows).astype(np.int16)
        band = np.ravel(band)

        if quality_variable is not None:
            # read the matching QA raster
            qdataset, qrows, qcols, qbands = readtif(
                subset.iloc[k, quality_variable])
            qband = qdataset.GetRasterBand(1)
            qband = qband.ReadAsArray(0, 0, cols, rows).astype(np.int16)
            qband = np.ravel(qband)

            # flag and blank out bad-quality pixels
            qualityaux = np.in1d(qband, badqualityvalues)
            qstack[k, :] = qualityaux
            band[qualityaux] = fillvalue

            # close QA dataset
            qdataset = None

        # fill-value pixels are masked out of all statistics
        stack[k, :] = np.ma.array(band, mask=(band == fillvalue))
    return dataset, stack, qstack


def _save_summary_stats(dataset, stack, rows, cols, path, base_date,
                        month_base_date, varname, prefix):
    """Save mean, std, coefficient of variation and median of ``stack``.

    NOTE: ``save_file`` mutates its ``data`` argument (masked pixels become
    NaN); the coefficient of variation is deliberately computed *after* the
    mean/std rasters are saved, matching the original computation order.
    """
    column_means = np.ma.mean(stack, axis=0, dtype=np.float64)
    save_file(dataset, column_means, rows, cols, path, base_date,
              month_base_date, varname, prefix + "mean")

    column_stds = np.ma.std(stack, axis=0, dtype=np.float64)
    save_file(dataset, column_stds, rows, cols, path, base_date,
              month_base_date, varname, prefix + "std")

    # cvar = std / mean (computed from the already-saved arrays)
    coefficients_of_variation = np.multiply(column_stds, 1 / column_means)
    save_file(dataset, coefficients_of_variation, rows, cols, path, base_date,
              month_base_date, varname, prefix + "cvar")

    column_medians = np.ma.median(stack, axis=0)
    save_file(dataset, column_medians, rows, cols, path, base_date,
              month_base_date, varname, prefix + "median")


def _save_percentiles(dataset, stack, rows, cols, path, base_date,
                      month_base_date, varname,
                      percentiles=(20, 35, 65, 80, 95)):
    """Save the requested percentiles of ``stack`` (NaN cells become 0)."""
    # unmask to NaN so nanpercentile ignores no-data pixels
    filled = np.ma.filled(stack, fill_value=np.nan)
    for q in percentiles:
        p = np.nanpercentile(filled, q, axis=0)
        p[np.isnan(p)] = 0
        save_file(dataset, p, rows, cols, path, base_date,
                  month_base_date, varname, "perc%d" % q)

def calculate_spectral_angle(x,y,order=1):
'''
This operation acts upon two dimensional numpy arrays
@@ -867,19 +1147,31 @@ def calculate_spectral_correlation(x,y,order=1):

return(spectral_correlation)

def save_file(dataset, data, rows, cols, path, base_date, month_base_date=None,
              varname="var", sufix=""):
    """
    Save ``data`` (a flattened raster) as a single-band GeoTIFF.

    The output lands in ``path/<year>/<month>/`` (or ``path/<year>/`` when
    ``month_base_date`` is None) and is named
    ``<year>[_<month>]_<varname>_<sufix>.tif``.

    Parameters
    ----------
    dataset : gdal.Dataset
        Reference image providing projection, geotransform and driver.
    data : np.ndarray
        Flattened pixel values.  MUTATED in place: masked pixels are set
        to NaN before writing.
    rows, cols : int
        Raster dimensions (overwritten below by the filter raster's size;
        assumes the filter grid matches the data -- TODO confirm).
    path : str
        Output root directory.
    base_date, month_base_date : number / None
        Anchor year and (optional) month used in the output path and name.
    varname, sufix : str
        Variable name and statistic suffix for the filename.
    """
    # BUGFIX(merge): the file carried two conflicting ``def`` lines for
    # this function (pre/post month support); consolidated into a single
    # signature with ``month_base_date`` optional for year-only callers.

    # filter raster: negative pixels lie outside the study area (cattle
    # density raster doubling as a Mexico land mask)
    datasetf, rows, cols, bands = readtif("D:/Julian/64_ie_maps/rasters/filter/bov_cbz_km2.tif")
    bandf = datasetf.GetRasterBand(1)
    bandf = bandf.ReadAsArray(0, 0, cols, rows).astype(np.float64)
    bandf = np.ravel(bandf)

    # bad pixels: outside the country mask, or exactly zero (no data)
    baddatamask = (bandf < 0) | (data == 0)

    # image metadata copied from the reference dataset
    projection = dataset.GetProjection()
    transform = dataset.GetGeoTransform()
    driver = dataset.GetDriver()

    if month_base_date is None:
        outpath = path + str(int(base_date)) + "/"
        # BUGFIX: the yearly variant omitted ``sufix`` from the filename,
        # so different statistics overwrote one another; include it here.
        name = outpath + str(int(base_date)) + "_" + varname + "_" + sufix + ".tif"
    else:
        outpath = (path + str(int(base_date)) + "/"
                   + str(int(month_base_date)) + "/")
        name = (outpath + str(int(base_date)) + "_"
                + str(int(month_base_date)) + "_" + varname + "_" + sufix
                + ".tif")
    if not os.path.exists(outpath):
        os.makedirs(outpath)

    outData = createtif(driver, rows, cols, 1, name)
    # blank out masked pixels (mutates the caller's array)
    data[baddatamask] = np.nan
    writetif(outData, data, projection, transform, order='r')

    # release GDAL handles so the output is flushed to disk (the original
    # left a dangling "close dataset properly" comment with no close)
    outData = None
    datasetf = None
Large diffs are not rendered by default.

@@ -0,0 +1,187 @@


import pandas as pd

import numpy as np

from geotiffio import readtif
from geotiffio import createtif
from geotiffio import writetif

import feature_extraction_tools as fe

import gc

# Build a pixels-by-images table of quality-filtered NDVI values for all
# valid (in-country) pixels, and dump it to CSV.

# read csv containing all image paths
all_images = pd.read_csv("D:/Julian/64_ie_maps/julian_tables_2/modis_vegindices_1000.csv", header = 0)

# number of images
nimages = len(all_images.index)

# load filter raster (negative pixels lie outside the study area)
datasetf,rows,cols,bands = readtif("D:/Julian/64_ie_maps/rasters/filter/bov_cbz_km2.tif")
bandf = datasetf.GetRasterBand(1)
bandf = bandf.ReadAsArray(0, 0, cols, rows).astype(np.int32)
bandf = np.ravel(bandf)

# mexico body mask
gooddatamask = bandf >= 0
ngooddata = np.sum(gooddatamask)
print(ngooddata)

# bad quality values (MODIS QC bits)
badqualityvalues = np.array([-1,2,3,255])

# initialize table testdata: one row per valid pixel, one column per image
tsdata = np.zeros(((ngooddata),nimages),dtype=np.float32)

for i in xrange(len(all_images.index)):

    # read images (variable of interest and associated quality product)
    # NOTE(review): columns 5 and 7 are assumed to hold the NDVI and QA
    # raster paths -- confirm against the csv layout
    dataset,rows,cols,bands = readtif(all_images.iloc[i,5])

    # make numpy array and flatten
    band = dataset.GetRasterBand(1)
    band = band.ReadAsArray(0, 0, cols, rows).astype(np.float32)
    band = np.ravel(band)

    qdataset,qrows,qcols,qbands = readtif(all_images.iloc[i,7])
    qband = qdataset.GetRasterBand(1)
    qband = qband.ReadAsArray(0, 0, cols, rows).astype(np.float32)
    qband = np.ravel(qband)

    # check which pixels have bad quality
    qualityaux = np.in1d(qband,badqualityvalues)

    # blank bad pixels, keep only in-country pixels, store as column i
    band[qualityaux] = np.nan
    band = band[gooddatamask]
    tsdata[:,i] = band

    # drop references so GDAL/numpy memory is reclaimed each iteration
    band = None
    qband = None
    qdataset = None
    dataset = None
    bandf = None
    datasetf = None

    gc.collect()

# write the pixel/time-series table
np.savetxt("ndvi_table.csv",tsdata, delimiter=",")
















# training data csv
#training = pd.read_csv("D:/Julian/64_ie_maps/julian_tables_2/training_julian_year.csv", header = 0)

# add yeardata
#all_images = fe.generateyeardate(all_images,format="AAAA*MM*DD")
#all_images.to_csv("D:/Julian/64_ie_maps/julian_tables_2/allimages_year.csv", sep=',', encoding='utf-8',index=False)
#training = fe.generateyeardate(training,format="DD*MM*AAAA")
#training.to_csv("D:/Julian/64_ie_maps/julian_tables_2/training_julian_year.csv", sep=',', encoding='utf-8',index=False)

# shape file
#shapefile = "D:/Julian/64_ie_maps/conglomerate/Cong22025_lamb.shp"

# searchdates
#search,lenz = searchdates(training,all_images,positions1=[0,1,2,3,4,5])
# searchdates
#search,lenz = fe.searchdates(training,all_images)

#valuez = extract(1114473.62456,2339981.7047,image="D:/Julian/64_ie_maps/2000_2013_mean_ndvi.tif",data_type=16)

#print(valuez)

#valuez = extract(1114473.62456,2339981.7047,image="D:/Julian/64_ie_maps/2000_2013_mean_ndvi.tif",data_type=32)

#print(valuez)


# query pixels
#tablez = fe.imagestacktable(shapefile=shapefile,data_frame1=training,data_frame2=all_images)

#print(tablez)

#np.savetxt("MIR2.csv", tablez, delimiter=",")




































# shapefile
#shp = readshape("D:/Julian/64_ie_maps/conglomerate/Cong22025_lamb.shp")
#shp = shapecoordinates("Cong22025_lamb.shp")

#merged = mergedataframes(shp,training)

#merged.to_csv("merge_test.csv", sep=',', encoding='utf-8',index=False)

# jd.gcal2jd
#print(jd.gcal2jd(2000, 1, 1))
#print(jd.gcal2jd(2000, 1, 2))

#print(search.at[0,'NDVI'])

# read first image to get metadata
#dataset,rows,cols,bands = readtif(search.at[0,'NDVI'])

# save characteristics of the images to be assiged to the output image
#projection = dataset.GetProjection()
#transform = dataset.GetGeoTransform()
#driver = dataset.GetDriver()





# # initialize huge vector
# image_time_series = np.zeros((len(all_images.index), cols * rows))

# for i in xrange(len(all_images.index)):
# #for i in xrange(10):

# # read envi image i

@@ -14,31 +14,35 @@

# import training data as data frame
#data = pd.read_csv("D:/Julian/64_ie_maps/julian_tables_2/training_final_good.csv")
data = pd.read_csv("D:/Julian/64_ie_maps/cleaning_training/train_ff3.csv")
data = pd.read_csv("D:/Julian/64_ie_maps/modelling_20150702/training_tables_finales/final_train_20150716.csv")

# replace -999 flags
data = data.replace("NA",np.nan)


print(type(data))

colnames = data.columns
year_variable=284
target_variable=270
print("check year variable")
print(colnames[year_variable])

target_variable=13

print("check target variable")
print(colnames[target_variable])

year_variable=1

data = data.loc\
[\
(data.iloc[:,year_variable] == 2004)\
(data.iloc[:,year_variable] == 0)\
| (data.iloc[:,year_variable] == 2004)\
| (data.iloc[:,year_variable] == 2005)\
| (data.iloc[:,year_variable] == 2006)\
| (data.iloc[:,year_variable] == 2007)\
| (data.iloc[:,year_variable] == 2009)\
| (data.iloc[:,year_variable] == 2010)\
| (data.iloc[:,year_variable] == 2011)\
| (data.iloc[:,year_variable] == 2012)\
| (data.iloc[:,year_variable] == 2013)\
| (data.iloc[:,year_variable] == 2013)
]

#print(np.shape(data))
@@ -47,12 +51,11 @@
#print(data.head())

# initialize model
rf = RandomForestRegressor(n_estimators=1000,n_jobs=4,max_features=85,min_samples_split=5,oob_score=True)
rf = RandomForestRegressor(n_estimators=1000,n_jobs=4,max_features=64,min_samples_split=5,oob_score=True)
#rf = ExtraTreesRegressor(n_estimators=1000,n_jobs=4,max_features=30,min_samples_split=5,bootstrap=True,oob_score=True)

# indices of variables of interest (target and covariates)
selection = np.append([target_variable],range(4,258))
selection = np.append(selection,[year_variable])
selection = np.append([target_variable],range(25,216))

# select data of interest
data_selection = data.iloc[:,selection].as_matrix()
@@ -64,28 +67,34 @@
data_selection = data_selection.astype(np.float32, copy=False)

# complete cases
data_selection = data_selection[~np.isnan(data_selection).any(axis=1)]
data_selection = data_selection[np.isfinite(data_selection).any(axis=1)]
goodidx = ~np.isnan(data_selection).any(axis=1)
data_selection = data_selection[goodidx]
#data_selection = data_selection[np.isfinite(data_selection).any(axis=1)]

#np.savetxt("foo.csv",data_selection, delimiter=",")

# target variable / covariates
y = data_selection[:,0]
ylog =np.log(y)

x = data_selection[:,1:]

# split test-train
#x_train, x_test, y_train, y_test = cross_validation.train_test_split(x,y, test_size=0.2, random_state=0)

# fit model
pmodel = rf.fit(x,y)
path="D:/Julian/64_ie_maps/models/average_tree_height/1000m/random_forest/ff3_rf_1000_85_5/"
path="D:/Julian/64_ie_maps/modelling_20150702/models/AlturaTotal_media_c1c2_clean/"

joblib.dump(pmodel,path+ 'ff3_rf_1000_85_5.pkl')
joblib.dump(pmodel,path+ 'treeheight20150716.pkl')

# validation measures
oobp = pmodel.oob_prediction_
oobplog = np.exp(oobp)
oobpout = np.zeros((np.shape(data)[0]),dtype=np.float64)
oobpout[oobpout==0]=-1
oobpout[goodidx]=oobp
np.savetxt(path+"0oobprediction20150716.csv", oobpout, delimiter=",")

#oobplog = np.exp(oobp)
corrins = np.corrcoef(oobp,y)
rmse = np.sqrt(np.mean((y - oobp)**2))
mae = np.mean(np.absolute((y - oobp)))
@@ -98,6 +107,9 @@
print(mae)
print(meanofresp)

statistics = np.array([corrins[0,1],rmse,mae,meanofresp])
np.savetxt(path+"0statistics.csv",statistics, delimiter=",")

# plot

# Print the feature ranking
@@ -115,7 +127,7 @@
varimpdf['variable']=names
varimpdf['importance']=imp
varimpdf = varimpdf.sort(columns="importance",ascending=False)
varimpdf.to_csv(path+"0varimp_ff3_rf_1000_85_5.csv", sep=',', encoding='utf-8',index=False)
varimpdf.to_csv(path+"0varimportance20150716.csv", sep=',', encoding='utf-8',index=False)

#print(type(imp))
#imp_selection = imp[0:21]
@@ -21,185 +21,185 @@

import feature_extraction_tools as fe

# # import training data as data frame
# #data = pd.read_csv("D:/Julian/64_ie_maps/julian_tables_2/training_final_good.csv")
# data = pd.read_csv("D:/Julian/64_ie_maps/cleaning_training/train_ff3.csv")
# import training data as data frame
#data = pd.read_csv("D:/Julian/64_ie_maps/julian_tables_2/training_final_good.csv")
data = pd.read_csv("D:/Julian/64_ie_maps/cleaning_training/train_ff3.csv")


# # replace -999 flags
# #data = data.replace("NA",np.nan)
# replace -999 flags
#data = data.replace("NA",np.nan)

# # 16 models must be made
# variables = [260,261,262,263,265,266,267,268,270,271,272,273,275,276,277,278]
# 16 models must be made
variables = [260,261,262,263,265,266,267,268,270,271,272,273,275,276,277,278]

# colnames = data.columns
# year_variable=284
# print("check year variable")
# print(colnames[year_variable])
colnames = data.columns
year_variable=284
print("check year variable")
print(colnames[year_variable])

# for i in range(len(variables)):
for i in range(len(variables)):

# target_variable=variables[i]
# print("check target variable")
# varname =colnames[target_variable]
# print(varname)
target_variable=variables[i]
print("check target variable")
varname =colnames[target_variable]
print(varname)

# # initialize model
# rf = RandomForestRegressor(n_estimators=1000,n_jobs=4,max_features=85,min_samples_split=5,oob_score=True)
# #rf = ExtraTreesRegressor(n_estimators=1000,n_jobs=4,max_features=30,min_samples_split=5,bootstrap=True,oob_score=True)
# initialize model
rf = RandomForestRegressor(n_estimators=1000,n_jobs=4,max_features=85,min_samples_split=5,oob_score=True)
#rf = ExtraTreesRegressor(n_estimators=1000,n_jobs=4,max_features=30,min_samples_split=5,bootstrap=True,oob_score=True)

# # indices of variables of interest (target and covariates)
# selection = np.append([target_variable],range(4,258))
# selection = np.append(selection,[year_variable])
# indices of variables of interest (target and covariates)
selection = np.append([target_variable],range(4,258))
selection = np.append(selection,[year_variable])

# # select data of interest
# data_selection = data.iloc[:,selection].as_matrix()
# select data of interest
data_selection = data.iloc[:,selection].as_matrix()

# # check type of array
# #print(np.dtype(data_selection))
# check type of array
#print(np.dtype(data_selection))

# # force dtype = float32
# #data_selection = data_selection.astype(np.float32, copy=False)
# force dtype = float32
#data_selection = data_selection.astype(np.float32, copy=False)

# # complete cases
# data_selection = data_selection[~np.isnan(data_selection).any(axis=1)]
# data_selection = data_selection[np.isfinite(data_selection).any(axis=1)]
# complete cases
data_selection = data_selection[~np.isnan(data_selection).any(axis=1)]
data_selection = data_selection[np.isfinite(data_selection).any(axis=1)]

# #np.savetxt("foo.csv",data_selection, delimiter=",")
#np.savetxt("foo.csv",data_selection, delimiter=",")

# # target variable / covariates
# y = data_selection[:,0].astype(np.float64)
# x = data_selection[:,1:]
# target variable / covariates
y = data_selection[:,0].astype(np.float64)
x = data_selection[:,1:]

# # split test-train
# #x_train, x_test, y_train, y_test = cross_validation.train_test_split(x,y, test_size=0.2, random_state=0)
# split test-train
#x_train, x_test, y_train, y_test = cross_validation.train_test_split(x,y, test_size=0.2, random_state=0)

# # fit model
# pmodel = rf.fit(x,y)
# path = "D:/Julian/64_ie_maps/models/"+varname+"/"
# if not os.path.exists(path):
# os.makedirs(path)
# #path="D:/Julian/64_ie_maps/models/average_tree_height/1000m/random_forest/ff3_rf_1000_85_5/"
# fit model
pmodel = rf.fit(x,y)
path = "D:/Julian/64_ie_maps/models/"+varname+"/"
if not os.path.exists(path):
os.makedirs(path)
#path="D:/Julian/64_ie_maps/models/average_tree_height/1000m/random_forest/ff3_rf_1000_85_5/"

# joblib.dump(pmodel,path+varname+'_ff3_rf_1000_85_5.pkl')
joblib.dump(pmodel,path+varname+'_ff3_rf_1000_85_5.pkl')

# # validation measures
# oobp = pmodel.oob_prediction_
# oobplog = np.exp(oobp)
# corrins = np.corrcoef(oobp,y)
# rmse = np.sqrt(np.mean((y - oobp)**2))
# mae = np.mean(np.absolute((y - oobp)))
# meanofresp = np.mean(y)
# validation measures
oobp = pmodel.oob_prediction_
oobplog = np.exp(oobp)
corrins = np.corrcoef(oobp,y)
rmse = np.sqrt(np.mean((y - oobp)**2))
mae = np.mean(np.absolute((y - oobp)))
meanofresp = np.mean(y)

# # save model
# save model

# print(corrins)
# print(rmse)
# print(mae)
# print(meanofresp)
print(corrins)
print(rmse)
print(mae)
print(meanofresp)

# statistics = np.array([corrins[0,1],rmse,mae,meanofresp])
# np.savetxt(path+"0"+varname+"statistics.csv",statistics, delimiter=",")
statistics = np.array([corrins[0,1],rmse,mae,meanofresp])
np.savetxt(path+"0"+varname+"statistics.csv",statistics, delimiter=",")

# # plot
# plot

# # Print the feature ranking
# Print the feature ranking

# # column names in searched_data_frame
# colnames = data.columns
# column names in searched_data_frame
colnames = data.columns


# imp = pmodel.feature_importances_
# names = colnames[selection[1:]]
# imp,names = zip(*sorted(zip(imp,names)))
# index = range(len(names))
# columns = ['variable','importance']
# varimpdf = pd.DataFrame(index=index, columns=columns)
# varimpdf['variable']=names
# varimpdf['importance']=imp
# varimpdf = varimpdf.sort(columns="importance",ascending=False)
# varimpdf.to_csv(path+"0varimp_ff3_rf_1000_85_5.csv", sep=',', encoding='utf-8',index=False)
imp = pmodel.feature_importances_
names = colnames[selection[1:]]
imp,names = zip(*sorted(zip(imp,names)))
index = range(len(names))
columns = ['variable','importance']
varimpdf = pd.DataFrame(index=index, columns=columns)
varimpdf['variable']=names
varimpdf['importance']=imp
varimpdf = varimpdf.sort(columns="importance",ascending=False)
varimpdf.to_csv(path+"0varimp_ff3_rf_1000_85_5.csv", sep=',', encoding='utf-8',index=False)

gc.collect()
# gc.collect()

# import training data as data frame
#data = pd.read_csv("D:/Julian/64_ie_maps/julian_tables_2/training_final_good.csv")
data = pd.read_csv("D:/Julian/64_ie_maps/cleaning_training/train_ff3.csv")
colnames = data.columns
# # import training data as data frame
# #data = pd.read_csv("D:/Julian/64_ie_maps/julian_tables_2/training_final_good.csv")
# data = pd.read_csv("D:/Julian/64_ie_maps/cleaning_training/train_ff3.csv")
# colnames = data.columns

# 16 maps must be made
variables = [260,261,262,263,265,266,267,268,270,271,272,273,275,276,277,278]
# # 16 maps must be made
# variables = [260,261,262,263,265,266,267,268,270,271,272,273,275,276,277,278]

for i in range(len(variables)):
# for i in range(len(variables)):

target_variable=variables[i]
print("check target variable")
varname =colnames[target_variable]
print(varname)
# target_variable=variables[i]
# print("check target variable")
# varname =colnames[target_variable]
# print(varname)

# load model
path = "D:/Julian/64_ie_maps/models/"+varname+"/"
pmodel = joblib.load(path+varname+'_ff3_rf_1000_85_5.pkl')
print(type(pmodel))
# # load model
# path = "D:/Julian/64_ie_maps/models/"+varname+"/"
# pmodel = joblib.load(path+varname+'_ff3_rf_1000_85_5.pkl')
# print(type(pmodel))

# load variable selection list:
imagesdf = pd.read_csv("D:/Julian/64_ie_maps/julian_tables_2/covariatesmodels_cropped_85.csv", header = 0)
# # load variable selection list:
# imagesdf = pd.read_csv("D:/Julian/64_ie_maps/julian_tables_2/covariatesmodels_cropped_85.csv", header = 0)


# # filter raster
datasetf,rows,cols,bands = readtif("D:/Julian/64_ie_maps/rasters/filter/bov_cbz_km2.tif")
bandf = datasetf.GetRasterBand(1)
bandf = bandf.ReadAsArray(0, 0, cols, rows).astype(np.float64)
bandf = np.ravel(bandf)
# # # filter raster
# datasetf,rows,cols,bands = readtif("D:/Julian/64_ie_maps/rasters/filter/bov_cbz_km2.tif")
# bandf = datasetf.GetRasterBand(1)
# bandf = bandf.ReadAsArray(0, 0, cols, rows).astype(np.float64)
# bandf = np.ravel(bandf)

# mexico body mask
baddatamask = bandf < 0
# # mexico body mask
# baddatamask = bandf < 0

# testdata
nvar = int(len(imagesdf.index))+1
testdata = np.zeros(((cols*rows),nvar),dtype=np.float64)
# # testdata
# nvar = int(len(imagesdf.index))+1
# testdata = np.zeros(((cols*rows),nvar),dtype=np.float64)

for y in xrange(10):
year=2004+y
print(year)
for i in xrange(len(imagesdf.index)):
# for y in xrange(10):
# year=2004+y
# print(year)
# for i in xrange(len(imagesdf.index)):

# read images (variable of interest and associated quality product)
imagesdf.columns[2+y]
dataset,rows,cols,bands = readtif(imagesdf.iloc[i,2+y])
# # read images (variable of interest and associated quality product)
# imagesdf.columns[2+y]
# dataset,rows,cols,bands = readtif(imagesdf.iloc[i,2+y])

# make numpy array and flatten
band = dataset.GetRasterBand(1)
band = band.ReadAsArray(0, 0, cols, rows).astype(np.float64)
band = np.ravel(band)
if (imagesdf.iloc[i,1]=="demmean") | (imagesdf.iloc[i,1]=="demsd"):
maskmissings = (band == -1.70000000e+308)
goodbandmean= np.mean(band[~maskmissings])
mask = maskmissings & (~baddatamask)
band[mask] = goodbandmean
band[baddatamask] = np.nan
testdata[:,i] = band

testdata[:,nvar-1]=year

# remove empty cells
goodidx = ~np.isnan(testdata[:,0])
data = testdata[goodidx,:]

# fill in 0's
#for j in xrange(np.shape(data)[1]):
# mzero = data[:,j]==0
# data[mzero,j]=np.mean(data[mzero,j])

# prediction
print("predicting at last")
prediction = pmodel.predict(data)

predictionout = np.zeros((cols * rows),dtype=np.float64)
predictionout[predictionout==0]=-999

predictionout[goodidx]=prediction
outpath = "D:/Julian/64_ie_maps/rasters/products/"+varname+"/"
if not os.path.exists(outpath):
os.makedirs(outpath)
fe.save_file(dataset,predictionout, rows, cols, path=outpath, base_date=year, varname=varname, sufix="_")
# # make numpy array and flatten
# band = dataset.GetRasterBand(1)
# band = band.ReadAsArray(0, 0, cols, rows).astype(np.float64)
# band = np.ravel(band)
# if (imagesdf.iloc[i,1]=="demmean") | (imagesdf.iloc[i,1]=="demsd"):
# maskmissings = (band == -1.70000000e+308)
# goodbandmean= np.mean(band[~maskmissings])
# mask = maskmissings & (~baddatamask)
# band[mask] = goodbandmean
# band[baddatamask] = np.nan
# testdata[:,i] = band

# testdata[:,nvar-1]=year

# # remove empty cells
# goodidx = ~np.isnan(testdata[:,0])
# data = testdata[goodidx,:]

# # fill in 0's
# #for j in xrange(np.shape(data)[1]):
# # mzero = data[:,j]==0
# # data[mzero,j]=np.mean(data[mzero,j])

# # prediction
# print("predicting at last")
# prediction = pmodel.predict(data)

# predictionout = np.zeros((cols * rows),dtype=np.float64)
# predictionout[predictionout==0]=-999

# predictionout[goodidx]=prediction
# outpath = "D:/Julian/64_ie_maps/rasters/products/"+varname+"/"
# if not os.path.exists(outpath):
# os.makedirs(outpath)
# fe.save_file(dataset,predictionout, rows, cols, path=outpath, base_date=year, varname=varname, sufix="_")
@@ -0,0 +1,203 @@

import pandas as pd
import pylab as pl
import matplotlib.pyplot as plt
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn import cross_validation
from sklearn import datasets
from sklearn.metrics import r2_score
from sklearn.externals import joblib

from geotiffio import readtif
from geotiffio import createtif
from geotiffio import writetif

import os

import gc

import feature_extraction_tools_v2 as fe

# # import training data as data frame
# #data = pd.read_csv("D:/Julian/64_ie_maps/julian_tables_2/training_final_good.csv")
# data = pd.read_csv("D:/Julian/64_ie_maps/cleaning_training/train_ff3.csv")


# # replace -999 flags
# #data = data.replace("NA",np.nan)

# # 16 models must be made
# variables = [260,261,262,263,265,266,267,268,270,271,272,273,275,276,277,278]

# colnames = data.columns
# year_variable=284
# print("check year variable")
# print(colnames[year_variable])

# for i in range(len(variables)):

# target_variable=variables[i]
# print("check target variable")
# varname =colnames[target_variable]
# print(varname)

# # initialize model
# rf = RandomForestRegressor(n_estimators=1000,n_jobs=4,max_features=85,min_samples_split=5,oob_score=True)
# #rf = ExtraTreesRegressor(n_estimators=1000,n_jobs=4,max_features=30,min_samples_split=5,bootstrap=True,oob_score=True)

# # indices of variables of interest (target and covariates)
# selection = np.append([target_variable],range(4,258))
# selection = np.append(selection,[year_variable])

# # select data of interest
# data_selection = data.iloc[:,selection].as_matrix()

# # check type of array
# #print(np.dtype(data_selection))

# # force dtype = float32
# #data_selection = data_selection.astype(np.float32, copy=False)

# # complete cases
# data_selection = data_selection[~np.isnan(data_selection).any(axis=1)]
# data_selection = data_selection[np.isfinite(data_selection).any(axis=1)]

# #np.savetxt("foo.csv",data_selection, delimiter=",")

# # target variable / covariates
# y = data_selection[:,0].astype(np.float64)
# x = data_selection[:,1:]

# # split test-train
# #x_train, x_test, y_train, y_test = cross_validation.train_test_split(x,y, test_size=0.2, random_state=0)

# # fit model
# pmodel = rf.fit(x,y)
# path = "D:/Julian/64_ie_maps/models/"+varname+"/"
# if not os.path.exists(path):
# os.makedirs(path)
# #path="D:/Julian/64_ie_maps/models/average_tree_height/1000m/random_forest/ff3_rf_1000_85_5/"

# joblib.dump(pmodel,path+varname+'_ff3_rf_1000_85_5.pkl')

# # validation measures
# oobp = pmodel.oob_prediction_
# oobplog = np.exp(oobp)
# corrins = np.corrcoef(oobp,y)
# rmse = np.sqrt(np.mean((y - oobp)**2))
# mae = np.mean(np.absolute((y - oobp)))
# meanofresp = np.mean(y)

# # save model

# print(corrins)
# print(rmse)
# print(mae)
# print(meanofresp)

# statistics = np.array([corrins[0,1],rmse,mae,meanofresp])
# np.savetxt(path+"0"+varname+"statistics.csv",statistics, delimiter=",")

# # plot

# # Print the feature ranking

# # column names in searched_data_frame
# colnames = data.columns


# imp = pmodel.feature_importances_
# names = colnames[selection[1:]]
# imp,names = zip(*sorted(zip(imp,names)))
# index = range(len(names))
# columns = ['variable','importance']
# varimpdf = pd.DataFrame(index=index, columns=columns)
# varimpdf['variable']=names
# varimpdf['importance']=imp
# varimpdf = varimpdf.sort(columns="importance",ascending=False)
# varimpdf.to_csv(path+"0varimp_ff3_rf_1000_85_5.csv", sep=',', encoding='utf-8',index=False)

gc.collect()

# Import training data as a data frame; only its column names are used here,
# to recover the name of the target variable being mapped.
#data = pd.read_csv("D:/Julian/64_ie_maps/julian_tables_2/training_final_good.csv")
data = pd.read_csv("D:/Julian/64_ie_maps/modelling_20150702/training_tables_finales/final_train_20150716.csv")
colnames = data.columns

# 16 maps must be made (currently only column 13 is processed)
variables = [13]

for i in range(len(variables)):

    target_variable = variables[i]
    print("check target variable")
    varname = colnames[target_variable]
    print(varname)

    # load the pre-trained model for this variable
    #path = "D:/Julian/64_ie_maps/models/"+varname+"/"
    path = "D:/Julian/64_ie_maps/modelling_20150702/models/"
    pmodel = joblib.load(path+varname+'_c1c2_clean/treeheight20150716.pkl')
    print(type(pmodel))

    # covariate table: one row per covariate raster, with image paths in
    # the columns starting at index 2
    imagesdf = pd.read_csv("D:/Julian/64_ie_maps/modelling_20150702/covariate_tables/covariates20150702.csv", header = 0)

    # filter raster used as a land mask (negative values = outside the
    # area of interest)
    datasetf, rows, cols, bands = readtif("D:/Julian/64_ie_maps/rasters/filter/bov_cbz_km2.tif")
    bandf = datasetf.GetRasterBand(1)
    bandf = bandf.ReadAsArray(0, 0, cols, rows).astype(np.float32)
    bandf = np.ravel(bandf)

    # mexico body mask
    baddatamask = bandf < 0

    # test data: one row per pixel, one column per covariate
    nvar = int(len(imagesdf.index))
    testdata = np.zeros(((cols * rows), nvar), dtype=np.float64)

    for y in xrange(1):
        # NOTE(review): hard-coded to a single period -- confirm this is
        # intentional before extending the y loop
        year = "2004_1"
        print(year)
        for k in xrange(len(imagesdf.index)):

            # read covariate image for this period (column 2+y holds the path)
            dataset, rows, cols, bands = readtif(imagesdf.iloc[k, 2 + y])

            # make numpy array and flatten
            band = dataset.GetRasterBand(1)
            band = band.ReadAsArray(0, 0, cols, rows).astype(np.float32)
            band = np.ravel(band)

            # replace missings inside the mask with the band mean, and mark
            # pixels outside the mask with the nodata flag -1
            maskmissings = (band <= -1.7e+308) | np.isnan(band) | np.isneginf(band) | np.isinf(band)
            goodbandmean = np.mean(band[~maskmissings])
            mask = maskmissings & (~baddatamask)
            band[mask] = goodbandmean
            band[baddatamask] = -1
            testdata[:, k] = band

        #testdata[:,nvar-1]=year

        # keep only pixels not flagged -1 in the first covariate
        # (this rebinds `data`, shadowing the training data frame above;
        # colnames was already captured, so that is safe)
        goodidx = testdata[:, 0] != -1
        data = testdata[goodidx, :]

        # prediction
        print("predicting at last")
        prediction = pmodel.predict(data)

        # initialise the output map to the nodata value -1 in one step
        # (equivalent to zeros followed by predictionout[predictionout==0]=-1)
        predictionout = np.full(cols * rows, -1, dtype=np.float64)

        # write predictions back into their pixel positions
        predictionout[goodidx] = prediction
        outpath = "D:/Julian/64_ie_maps/modelling_20150702/products/"+varname+"_c1c2_clean/"
        if not os.path.exists(outpath):
            os.makedirs(outpath)
        fe.save_file(dataset, predictionout, rows, cols, path=outpath, base_date=2004, month_base_date=1, varname=varname, sufix="please")
@@ -0,0 +1,13 @@
import pandas as pd

import numpy as np

from geotiffio import readtif
from geotiffio import createtif
from geotiffio import writetif

import feature_extraction_tools_v2 as fe

# read csv containing all image paths for nadir corrected reflectance products at 1000 m
all_images = pd.read_csv("D:/Julian/64_ie_maps/julian_tables_2/modis_gpp_1000.csv",\
header = 0)
@@ -0,0 +1,64 @@
import pandas as pd
import pylab as pl
import matplotlib.pyplot as plt
import numpy as np

from geotiffio import readtif
from geotiffio import createtif
from geotiffio import writetif

import os

from osgeo import gdal

import struct as st

# Sample every .tif raster in `rasterspath` at the training-point
# coordinates and append one column per raster to the training table.

# import training data as data frame (columns 1 and 2 hold x/y coordinates)
trainff3 = pd.read_csv("D:/Julian/64_ie_maps/cleaning_training/train_ff3_fixed_.csv")
rasterspath = "D:/Julian/64_ie_maps/rasters/climatic/bioclimasneotropicales3/"

# initialize: one row per training point, one column per raster layer.
# NOTE(review): assumes exactly 12*3 = 36 .tif files in the directory;
# pd.DataFrame below raises if the count differs -- confirm.
output = np.zeros((len(trainff3.index), 12 * 3), dtype=np.float64)
layers = []

# GDAL buffer type and the matching struct format string are loop
# invariants, so set them once (renamed from `format`, which shadowed
# the builtin; `file` below likewise renamed to `fname`)
data_type = gdal.GDT_Float32
fmt = "f"

nfile = 0
for fname in os.listdir(rasterspath):
    if fname.endswith(".tif"):

        layers.append(fname)
        print(layers)

        # read image
        dataset, rows, cols, bands = readtif(rasterspath + fname)

        # image metadata (geotransform maps world coords -> pixel indices)
        projection = dataset.GetProjection()
        geotransform = dataset.GetGeoTransform()
        driver = dataset.GetDriver()

        band = dataset.GetRasterBand(1)

        for i in xrange(len(trainff3.index)):
            # world coordinates of the training point
            xcoord = trainff3.iloc[i, 1]
            ycoord = trainff3.iloc[i, 2]
            # convert to integer pixel offsets within the raster
            xcoord = int((xcoord - geotransform[0]) / geotransform[1])
            ycoord = int((ycoord - geotransform[3]) / geotransform[5])

            # read a single pixel and unpack its 4-byte float value
            binaryvalue = band.ReadRaster(xcoord, ycoord, 1, 1, buf_type=data_type)
            value = st.unpack(fmt, binaryvalue)
            value = value[0]

            output[i, nfile] = value

        nfile = nfile + 1

# attach the sampled columns to the training table
output = pd.DataFrame(output, columns=layers, index=trainff3.index)
output = pd.concat([trainff3, output], axis=1)

# write to disk
output.to_csv("D:/Julian/64_ie_maps/cleaning_training/train_ff3_fixed_angela.csv", sep=',', encoding='utf-8',index=False)