# Generating Training Data

## Step 1. Check the smoke proportions

In [1]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the per-file proportion tables (columns `fnames` and `clouds`): one for
# the training files and one for the competition test files.
train_table_path = 'cloud amount by filenames.csv'
test_table_path = '~/Competition/Datasets/cleaned_test/cloud_amount_test.csv'

clouds = pd.read_csv(train_table_path)
test_clouds = pd.read_csv(test_table_path)

In [3]:
# Display the test-file table so the target proportions can be read off.
test_clouds

Unnamed: 0.1,Unnamed: 0,fnames,clouds
0,0,test_0.csv,8e-06
1,1,test_1.csv,0.000325
2,2,test_2.csv,2e-06
3,3,test_3.csv,0.148131
4,4,test_4.csv,0.103127


We are building models for test_3.csv and test_4.csv; therefore, we want to list candidate training sets that contain a similar amount of smoke.

In [4]:
# Preview the first five training files whose proportion exceeds 0.12.
above_threshold = clouds['clouds'].gt(0.12)
clouds.loc[above_threshold].head(5)

Unnamed: 0,fnames,clouds
29,train_97.csv,0.124863
30,train_131.csv,0.126667
31,train_92.csv,0.154715
32,train_72.csv,0.202768
33,train_121.csv,0.206575


First, we want to sample an equal number of cases from each of the candidates.

To do so, we first need to find the maximum feasible sample size for each training set.

In [5]:
# File names of the first five training files above the 0.12 threshold,
# re-indexed from 0.
candidate_rows = clouds.loc[clouds['clouds'] > 0.12].iloc[:5]
candidates = candidate_rows['fnames'].reset_index(drop=True)

In [6]:
# Accumulator for the per-file maximum sample sizes; the loop over
# `candidates` in the following cell appends one entry per file.
sample_size = []

In [7]:
def max_balanced_sample_size(fname):
    """Return the largest per-group sample size available in one training file.

    The file is read lazily with dask, pixels are flagged as `cloud` when their
    linescan value is 0, and rows are counted for every (tgt, cloud)
    combination. The smallest count is floor-divided by 1.5 (presumably a
    safety margin — confirm), so the result is a float.

    A combination that does not occur in the file is counted as 0 rather than
    being skipped as NaN, so a file that cannot supply a balanced sample
    reports 0.
    """
    frame = dd.read_csv(fname)
    frame['cloud'] = frame.linescan.eq(0)
    group_counts = (
        frame.groupby(['tgt', 'cloud']).x.count().compute().unstack(fill_value=0)
    )
    return group_counts.min().min() // 1.5


# Rebuild the list from scratch so that re-running this cell cannot append
# duplicate entries to a previously filled `sample_size`.
sample_size = [max_balanced_sample_size(f) for f in candidates]

In [8]:
# Display the per-file maximum sample sizes, in the same order as `candidates`.
sample_size

[128242.0, 30771.0, 12962.0, 7900.0, 164680.0]

The smallest of these limits is 7900, so sampling 7000 cases from each file is feasible.

## Step 2. Preprocessing

We split each dataset into a smoke region and a no-smoke region, and fit a different model to each to maximize efficiency.

Example: train_131.csv

In [9]:
# Open the example training file as a dask DataFrame; the data is only
# materialised by the later `.compute()` calls.
df = dd.read_csv('train_131.csv')

In [10]:
# Boolean flag: True for pixels whose linescan value is exactly 0.
df['cloud'] = df['linescan'] == 0

In [11]:
# Materialise the two partitions as pandas DataFrames: rows with the `cloud`
# flag set go to `smoke`, all remaining rows go to `nosmoke`.
is_cloud = df['cloud']
smoke = df[is_cloud].compute()
nosmoke = df[~is_cloud].compute()

For the smoke region, we would like to add two more variables:
> 1. **Distance to the closest pixel whose linescan value is greater than 200**   
> 2. **Distance to the closest pixel whose maskClose_1 value is 255**

1. **Distance to the closest pixel whose linescan value is greater than 200**

In [12]:
# (x, y) coordinates of the reference pixels used for the first distance feature.
# NOTE(review): `.ge(200)` selects linescan >= 200, whereas the variable name
# and the surrounding text say "greater than 200" — confirm which is intended.
linescan_gt200 = df.loc[df.linescan.ge(200), ['x','y']].compute()

In [13]:
# Replace the index with a fresh 0..n-1 range, discarding the old one.
linescan_gt200 = linescan_gt200.reset_index(drop=True)

In [14]:
from sklearn.neighbors import KNeighborsTransformer

# Single-nearest-neighbour index over the high-linescan reference pixels;
# `fit` returns the estimator itself, which is then displayed.
knt = KNeighborsTransformer(n_neighbors=1).fit(linescan_gt200)
knt

KNeighborsTransformer(n_neighbors=1)

In [15]:
# Distance from every smoke pixel to its nearest reference pixel, flattened to
# a 1-D array (kneighbors returns a (distances, indices) pair).
nearest_dist, _ = knt.kneighbors(smoke[['x', 'y']])
distance = nearest_dist.ravel()

In [16]:
# Attach the nearest-reference-pixel distances as a new feature column.
smoke['distance']  = distance

2. **Distance to the closest pixel whose maskClose_1 value is 255**

In [17]:
# (x, y) coordinates of every pixel whose maskClose_1 value equals 255.
mask_is_set = df['maskClose_1'] == 255
mask_positive = df.loc[mask_is_set, ['x', 'y']].compute()

In [18]:
# Replace the index with a fresh 0..n-1 range, discarding the old one.
mask_positive = mask_positive.reset_index(drop=True)

In [19]:
# Refit the same `knt` object on the mask-positive pixels. From here on `knt`
# answers queries against `mask_positive`, so the cells that use it must be run
# in order.
knt.fit(mask_positive)

KNeighborsTransformer(n_neighbors=1)

In [20]:
# Distance from every smoke pixel to its nearest mask-positive pixel,
# flattened to a 1-D array.
nearest_mask_dist, _ = knt.kneighbors(smoke[['x', 'y']])
distance_mask = nearest_mask_dist.ravel()

In [21]:
# Attach the nearest-mask-pixel distances as a new feature column.
smoke['distance_mask'] = distance_mask

## Step 3. Transformation and Standardization 

First, we will drop variables that we consider not relevant for the smoke region.

In [46]:
# Keep only the ten model features followed by the target; `tgt` must stay
# last because the X / y split below slices by position.
smoke_columns = [
    'nbart_blue', 'nbart_green', 'nbart_red',
    'nbart_nir', 'nbart_swir_2', 'nbart_swir_1',
    'NDVI', 'MNDWI',
    'distance', 'distance_mask',
    'tgt',
]
smoke = smoke.loc[:, smoke_columns]

In [47]:
# Replace the index with a fresh 0..n-1 range, discarding the old one.
smoke = smoke.reset_index(drop=True)

Split them into X and y

In [48]:
# Positional split: every column except the last becomes X, and the last
# column becomes y. The `-1:` slice keeps y as a single-column DataFrame rather
# than a Series.
X = smoke.iloc[:, :-1]
y = smoke.iloc[:, -1:]

In [49]:
# Fit a PowerTransformer (library defaults) on the smoke features and replace X
# with the transformed values. The fitted `transformer` stays available
# afterwards — presumably for reuse on the test data; confirm downstream.
from sklearn.preprocessing import PowerTransformer
transformer = PowerTransformer()
X = transformer.fit_transform(X)

In addition, we would like to add quadratic and interaction terms

In [50]:
from sklearn.preprocessing import PolynomialFeatures

# Names of the ten smoke-region input features, in the column order of X.
smoke_feature_names = ['nbart_blue', 'nbart_green', 'nbart_red',
                       'nbart_nir', 'nbart_swir_2', 'nbart_swir_1', 'NDVI', 'MNDWI',
                       'distance', 'distance_mask']

# Expand X with all degree-2 terms (squares and pairwise interactions).
polyfit = PolynomialFeatures(2)
X = polyfit.fit_transform(X)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out` and fall back only on older releases. The
# result is converted to a list so its type matches the old API.
if hasattr(polyfit, 'get_feature_names_out'):
    clear_polynames = list(polyfit.get_feature_names_out(smoke_feature_names))
else:
    clear_polynames = polyfit.get_feature_names(smoke_feature_names)

In [55]:
# Persist the expanded feature names. The DataFrame index is written too, so
# the file will contain an extra unnamed index column when read back.
pd.DataFrame({'smoke_polynames':clear_polynames}).to_csv('../test4/smoke_polyname.csv')

In [27]:
from sklearn.preprocessing import StandardScaler

# Standardise every expanded feature column; the fitted scaler is kept in
# `scaler`.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

In [28]:
# Overwrite column 0 with the constant 1. This column appears to be the bias
# term produced by PolynomialFeatures (TODO confirm); presumably it is restored
# here so it can act as an intercept after standardisation.
X[:,0] =1

In [29]:
# Sanity check: (rows, expanded feature columns).
X.shape

(327964, 66)

In [30]:
# Sanity check: y should have the same number of rows as X.
y.shape

(327964, 1)

In [31]:
# Wrap the feature matrix in a DataFrame; columns get default integer labels.
data = pd.DataFrame(X)

In [32]:
# Append the target column. The assignment aligns on the index, which matches
# because `smoke` was re-indexed from 0 before X and y were split off.
data['tgt']  = y

In [33]:
# Draw a class-balanced sample of 7000 rows per target value from the smoke
# region. The result is stored in its own variable because `data` is rebuilt
# for the no-smoke region later in the notebook, so an unassigned sample would
# be lost. A fixed `random_state` makes the draw reproducible.
smoke_data = data.groupby('tgt').sample(n=7000, random_state=0)
smoke_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,57,58,59,60,61,62,63,64,65,tgt
300473,1.0,1.498725,1.060523,1.210638,-1.320587,1.419432,1.302348,-1.727412,-0.428051,-0.694177,...,0.779800,1.396220,0.535356,-0.608651,0.182271,0.078762,-0.402216,-0.409158,-0.717015,0
120080,1.0,0.280003,0.005642,0.172949,0.008445,-0.289652,-0.492706,-0.161699,1.124559,-0.822060,...,-0.166588,0.272994,-0.070499,0.197203,-1.083689,-0.077792,-0.251690,-0.614804,-0.806362,0
1911,1.0,1.022923,0.994523,0.934733,-0.082879,0.948562,0.390440,-0.915468,1.199162,1.910836,...,-1.107526,-1.709909,-0.890445,0.326386,2.248950,0.951437,2.058201,0.878056,-0.229977,0
284932,1.0,-0.596011,-0.076721,-0.038143,0.322309,0.124787,0.199697,0.230124,-0.986424,0.996676,...,-0.212980,0.374584,0.168725,-0.020097,-1.144509,-1.044496,-0.005153,0.295536,-0.012228,0
237830,1.0,1.142874,0.890726,0.812600,-0.054654,1.043259,1.157279,-0.812394,-0.679845,-1.135491,...,0.587577,1.104760,0.726069,-0.400771,0.674328,0.569481,0.224615,0.358160,-0.108383,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176161,1.0,-1.301377,-1.265442,-1.477267,-0.350183,-0.502335,-0.501381,1.431169,-2.081047,-0.331706,...,-3.039352,-0.367159,-1.441470,2.482050,0.589701,1.822706,-0.690885,-0.344953,-0.145661,1
203080,1.0,-0.876029,-0.749522,-1.635149,1.731059,-0.907983,-0.656421,2.462578,-0.389750,-0.106512,...,-0.965756,-0.143363,-1.196585,-0.631992,-0.082642,0.104589,-0.767494,-0.581361,-0.656000,1
169734,1.0,-1.534228,-1.824756,-1.252747,-1.175787,-1.102309,-1.085951,0.520237,-1.245544,-0.847727,...,-0.645439,-0.331647,-0.068453,0.410882,0.968562,-0.073412,-0.218419,-0.630067,-0.806437,1
176167,1.0,-2.068570,-2.276553,-1.189835,-1.336825,-1.576225,-1.617192,0.550528,-1.144953,-0.559048,...,-0.627309,-0.191273,-0.037924,0.231694,0.537665,-0.132364,-0.533680,-0.655115,-0.803619,1


## Step 4. Create data for no smoke area

For the area where linescan values are greater than 0, we would like to keep the mask variables for now; they can be useful input variables for tree / forest methods.

However, we also want to create the two new distance variables for this region.

In [34]:
# Replace the index with a fresh 0..n-1 range, discarding the old one.
nosmoke = nosmoke.reset_index(drop=True)

In [35]:
# Coordinates of the no-smoke pixels, used as the query set for both features.
nosmoke_xy = nosmoke[['x', 'y']]

# Feature 1: distance to the nearest high-linescan reference pixel.
distance = knt.fit(linescan_gt200).kneighbors(nosmoke_xy)[0].ravel()
nosmoke['distance'] = distance

# Feature 2: distance to the nearest mask-positive pixel. `knt` is refitted, so
# it is left fitted on `mask_positive` when this cell finishes.
distance_mask = knt.fit(mask_positive).kneighbors(nosmoke_xy)[0].ravel()
nosmoke['distance_mask'] = distance_mask

For this dataset we keep the categorical (mask) variables, which should not be transformed or standardized, so we separate those columns from the quantitative ones.

In [36]:
# Quantitative inputs: these are transformed and standardised in the next cell.
quant_columns = [
    'linescan',
    'nbart_blue', 'nbart_green', 'nbart_red',
    'nbart_nir', 'nbart_swir_2', 'nbart_swir_1',
    'NDVI', 'MNDWI',
    'distance', 'distance_mask',
]

# Mask inputs, kept untransformed: maskOpen_0, maskClose_0, ..., maskOpen_24,
# maskClose_24 (50 columns, Open before Close for each index).
mask_columns = [
    f'mask{kind}_{idx}'
    for idx in range(25)
    for kind in ('Open', 'Close')
]

y = nosmoke['tgt']
X_quant = nosmoke.loc[:, quant_columns]
X_category = nosmoke.loc[:, mask_columns]

In [37]:
from sklearn.preprocessing import PolynomialFeatures, PowerTransformer, StandardScaler

# Names of the eleven quantitative no-smoke features, in the column order of
# X_quant.
quant_feature_names = ['linescan', 'nbart_blue', 'nbart_green', 'nbart_red',
                       'nbart_nir', 'nbart_swir_2', 'nbart_swir_1', 'NDVI', 'MNDWI',
                       'distance', 'distance_mask']

# 1) Power-transform the raw quantitative features.
transformer = PowerTransformer()
X_quant = transformer.fit_transform(X_quant)

# 2) Expand with all degree-2 terms (squares and pairwise interactions).
polyfit = PolynomialFeatures(2)
X_quant = polyfit.fit_transform(X_quant)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out` and fall back only on older releases. The
# result is converted to a list so its type matches the old API.
if hasattr(polyfit, 'get_feature_names_out'):
    polynames = list(polyfit.get_feature_names_out(quant_feature_names))
else:
    polynames = polyfit.get_feature_names(quant_feature_names)

# 3) Standardise every expanded column.
scaler = StandardScaler()
X_quant = scaler.fit_transform(X_quant)

In [58]:
# Persist the expanded feature names. The DataFrame index is written too, so
# the file will contain an extra unnamed index column when read back.
pd.DataFrame({'clean_polyname': polynames}).to_csv('../test4/clean_polyname.csv')

In [38]:
# Overwrite column 0 with the constant 1. This column appears to be the bias
# term produced by PolynomialFeatures (TODO confirm); presumably it is restored
# here so it can act as an intercept after standardisation.
X_quant[:,0] =1

In [39]:
# Sanity check: (rows, expanded quantitative feature columns).
X_quant.shape

(2261267, 78)

In [40]:
# Place the untransformed mask columns to the right of the standardised
# quantitative block and wrap the result in a DataFrame with default integer
# column labels.
combined_features = np.concatenate([X_quant, X_category.to_numpy()], axis=1)
data = pd.DataFrame(combined_features)

In [41]:
# Append the target column. The assignment aligns on the index, which matches
# because `nosmoke` was re-indexed from 0 before y was taken from it.
data['tgt'] = y

In [42]:
# Draw a class-balanced sample of 7000 rows per target value and renumber the
# rows from 0. A fixed `random_state` makes the draw reproducible, so
# re-running the notebook regenerates the same training set.
data = data.groupby('tgt').sample(n=7000, random_state=0).reset_index(drop=True)

In [43]:
# Display the sampled no-smoke training frame (feature columns followed by `tgt`).
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,119,120,121,122,123,124,125,126,127,tgt
0,1.0,0.129856,-0.317313,-0.278910,-0.700878,-0.017816,-0.532853,-0.172385,0.562777,-0.019345,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,1.0,-2.005344,-1.812397,-1.445355,-1.915251,0.031848,-1.520942,-1.381360,1.648887,0.225703,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,1.0,-0.279292,1.075119,0.811694,-0.192983,0.927496,-0.222191,-0.000644,0.585840,2.053889,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,1.0,0.605797,1.488465,1.880136,1.583652,0.079059,2.052542,2.317544,-1.489241,-1.818395,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,1.0,-2.176503,-0.317313,0.523349,1.004753,1.246966,0.680804,0.610495,-0.308682,0.108501,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13995,1.0,-0.734373,-0.857147,-0.670173,-0.407274,0.123858,-0.357531,-0.406345,0.406082,-0.433827,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
13996,1.0,1.334555,0.608882,-0.259632,0.010315,-1.285381,-0.092674,-0.251950,-0.744765,0.329706,...,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,1
13997,1.0,-1.308237,-2.096032,-2.159314,-2.787270,0.189522,-2.399308,-2.047414,2.183812,0.550782,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
13998,1.0,-0.279292,-2.515970,-2.519401,-2.247462,0.271353,-2.102002,-1.921585,2.027704,-0.919271,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
