In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pdal

# FEATURE ENGINEERING/JOINING
---------

This notebook brings in and merges the features generated by the PDAL pipeline (most of that work was done with .json pipeline files invoked on the command line using PDAL). It was made alongside the EDA notebook and the command-line .json shenanigans, as a repository for data being prepped for modelling. The process is outlined below.

1. Decimate (sampling based on KNN) to <5GB. <-- see decimator.json (this .json was used twice with a modified KNN value and input file, see step 4 below.)
  
2. Open in CAD environment (rhino6 3D): determine where global 0 is, choose a region of interest, determine rotation value to align object to X/Y axes, determine crop boundary.

3. Perform transformations+crops on the original (non-decimated) data set. <-- rotator.json, cropper.json

4. Perform less aggressive decimation to make life easier. <-- decimator.json (second time)
  
5. Open data set in a dataframe and plot an 'architectural plan' and section, find a 'fine tuned' crop boundary. <-- done in the early EDA notebook

6. Extract eigenvalue features, local outlier features, and target <-- classifier.json, classifier-eig.json, classifier-lof.json

7. Use the dimensioner function (seen below) to transform the 'linear', 'planar', 'scattering' values into binaries, which are then merged into a single multiclass target.

8. Take equal samples of the classes based on the class with the lowest quantity for model training and export.

*Somewhere along the way I cropped out the ground of the model using cropper-copy1.json. This was because, after running the voxelizer in the early EDA notebook, I noticed a strange density of points in the corners where the wall meets the ground throughout the 3D space, which I thought might become an issue down the line.*
  

In [1]:
# Load the classified point cloud into a NumPy structured array (arr2).
# NOTE(review): the reader type is "readers.bpf" although the file name ends
# in ".las" -- confirm the file really is BPF before changing either one.
data = "Armenia-cropped2-classified.las"

# Single-stage PDAL pipeline template; "%s" is replaced by the input filename.
json = """
    {
      "pipeline": [
        {
            "type": "readers.bpf",
            "filename": "%s"
        }
      ]
    }"""


pipeline = pdal.Pipeline(json % data)
# The return value of execute() is kept in `count` (presumably the number of
# points processed -- it is not used anywhere else in this notebook).
count = pipeline.execute()


# Keep the first array produced by the pipeline and report its length.
# Fix: the original printed len(arr), but `arr` is only created in a later
# cell; the array loaded here is `arr2`.
arr2 = pipeline.arrays[0]
print(len(arr2))




NameError: name 'pdal' is not defined

In [85]:
# Load the local-outlier-factor features from "Armenia-lof.las" into arr3.
data = "Armenia-lof.las"

# One-stage PDAL pipeline: a reader whose filename is substituted for "%s".
json = """
    {
      "pipeline": [
        {
            "type": "readers.bpf",
            "filename": "%s"
        }
      ]
    }"""

# Substitute the filename, build the pipeline and run it.
pipeline = pdal.Pipeline(json % (data,))
count = pipeline.execute()

# First (index 0) array exposed by the executed pipeline.
arr3 = pipeline.arrays[0]

In [5]:
# Load the eigenvalue features from "Armenia-eig.las" into arr.
data = "Armenia-eig.las"

# One-stage PDAL pipeline: a reader whose filename is substituted for "%s".
json = """
    {
      "pipeline": [
        {
            "type": "readers.bpf",
            "filename": "%s"
        }
      ]
    }"""

# Substitute the filename, build the pipeline and run it.
pipeline = pdal.Pipeline(json % (data,))
count = pipeline.execute()

# pipeline.arrays holds NumPy structured arrays; we take the first one.
# Illustrative example of such an array (the dimensions below are a generic
# sample and are not the ones produced for this file):
# [array([(637012.24, 849028.31, 431.66, 143, 1,
# 1, 1, 0, 1,  -9., 132, 7326, 245380.78254963,  68,  77,  88),
# dtype=[('X', '<f8'), ('Y', '<f8'), ('Z', '<f8'), ('Intensity', '<u2'),
# ('ReturnNumber', 'u1'), ('NumberOfReturns', 'u1'), ('ScanDirectionFlag', 'u1'),
# ('EdgeOfFlightLine', 'u1'), ('Classification', 'u1'), ('ScanAngleRank', '<f4'),
# ('UserData', 'u1'), ('PointSourceId', '<u2'),
# ('GpsTime', '<f8'), ('Red', '<u2'), ('Green', '<u2'), ('Blue', '<u2')])
arr = pipeline.arrays[0]

In [7]:
# Wrap the structured arrays in DataFrames: df <- arr (eigenvalue file),
# df2 <- arr2 (classified file).
df = pd.DataFrame(data=arr)
df2 = pd.DataFrame(data=arr2)

In [88]:
# Wrap the array loaded from the local-outlier-factor file in a DataFrame.
df3 = pd.DataFrame(data=arr3)

In [90]:
# Strip the coordinate and eigenvalue columns from df3 so that only the
# remaining (outlier) features are left for the later column-wise concat.
# Rebinding instead of inplace=True, plus errors='ignore', makes this cell
# safe to re-run: the original raised KeyError on a second execution because
# the columns were already gone.
df3 = df3.drop(
    columns=['Eigenvalue0', 'Eigenvalue1', 'Eigenvalue2', 'X', 'Y', 'Z'],
    errors='ignore',
)

In [91]:
# Preview the first 5435 rows of df3; the notebook truncates the rendered
# table to its head and tail.
# NOTE(review): 5435 looks arbitrary -- a plain .head() may be enough.
df3.head(5435)

Unnamed: 0,KDistance,LocalReachabilityDistance,LocalOutlierFactor
0,0.094340,10.589352,1.006060
1,0.096436,10.480013,0.999812
2,0.100499,10.008513,1.002799
3,0.103440,9.769090,1.001474
4,0.104881,9.575985,1.001738
...,...,...,...
5430,0.050990,19.885212,0.995348
5431,0.050990,19.903202,0.997133
5432,0.050990,19.925945,0.997298
5433,0.050990,19.938526,0.998311


In [10]:
# Preview the first 5435 rows of df2; the notebook truncates the rendered
# table to its head and tail.
# NOTE(review): 5435 looks arbitrary -- a plain .head() may be enough.
df2.head(5435)

Unnamed: 0,X,Y,Z,Linearity,Planarity,Scattering,Verticality
0,3.93,-6.60,6.10,0.038870,0.887232,0.073898,0.197579
1,3.83,-6.57,6.06,0.014761,0.908272,0.076967,0.315298
2,3.71,-6.53,5.98,0.026313,0.898759,0.074928,0.455025
3,3.64,-6.51,5.92,0.044468,0.881234,0.074299,0.491115
4,3.56,-6.48,5.84,0.017813,0.912783,0.069403,0.548436
...,...,...,...,...,...,...,...
5430,-7.04,-2.80,3.02,0.048415,0.784589,0.166996,0.737967
5431,-7.04,-2.80,2.99,0.046067,0.805779,0.148153,0.727064
5432,-7.04,-2.80,2.96,0.020804,0.844660,0.134537,0.700453
5433,-7.04,-2.80,2.93,0.022390,0.855015,0.122595,0.713039


## DIMENSIONER FUNCTION

See pipeline outline above

In [33]:
def dimensioner(row):
    """Label a point by its dominant dimensionality feature.

    Parameters
    ----------
    row : object with ``Linearity``, ``Planarity`` and ``Scattering``
        attributes (e.g. a DataFrame row passed by ``apply(axis=1)``).

    Returns
    -------
    int or None
        0 if Linearity is the largest of the three values, 1 if Planarity
        is, 2 if Scattering is. When several values tie for the maximum the
        lowest label among them is returned (Linearity before Planarity
        before Scattering); previously ties fell through every branch and
        silently yielded ``None``. ``None`` is still returned when any of
        the three values is NaN, because no ordering exists in that case.
    """
    linearity = row.Linearity
    planarity = row.Planarity
    scattering = row.Scattering

    # No meaningful winner if any feature is missing.
    if pd.isna(linearity) or pd.isna(planarity) or pd.isna(scattering):
        return None

    # Plain `and` (not bitwise `&`) because these are scalar comparisons.
    if linearity >= planarity and linearity >= scattering:
        return 0
    # Linearity is not the maximum here, so only the other two compete.
    if planarity >= scattering:
        return 1
    return 2

In [35]:
# Independent copy of the first 200 rows of df2, used to try out dimensioner.
mini_df = df2.iloc[:200].copy()

In [36]:
# Apply dimensioner to every row of the small sample and store the label.
mini_df['dimensionality'] = mini_df.apply(dimensioner, axis='columns')

In [32]:
# Count the rows where Scattering exceeds Linearity (True) versus not (False).
df2['Scattering'].gt(df2['Linearity']).value_counts()

True     54202503
False     5027775
dtype: int64

In [40]:
# Show the first few sample rows that were labelled 2.
mini_df.loc[mini_df['dimensionality'] == 2].head()

Unnamed: 0,X,Y,Z,Linearity,Planarity,Scattering,Verticality,dimensionality
18,3.18,-6.35,4.32,0.355959,0.278402,0.365639,0.649742,2
55,1.65,-5.84,6.69,0.360102,0.190086,0.449812,0.61973,2
56,1.6,-5.82,6.75,0.074554,0.38916,0.536286,0.45094,2
190,-1.86,-4.65,6.79,0.164934,0.402715,0.432351,0.302241,2
191,-1.87,-4.65,6.78,0.182455,0.336449,0.481096,0.392991,2


In [44]:
# Append the four dimensionality features of df2 to df, column-wise.
# The frames are aligned on their index, so this assumes df and df2 list
# the same points in the same order -- TODO confirm.
mdf = pd.concat(
    [df, df2[['Linearity', 'Planarity', 'Scattering', 'Verticality']]],
    axis='columns',
)

In [92]:
# Append the df3 columns to the merged frame, column-wise (aligned on index).
# NOTE(review): re-running this cell appends the df3 columns a second time.
mdf = pd.concat(objs=[mdf, df3], axis='columns')

In [93]:
# Summary statistics (count, mean, std, min, quartiles, max) for every
# numeric column of the merged frame.
mdf.describe()

Unnamed: 0,X,Y,Z,Eigenvalue0,Eigenvalue1,Eigenvalue2,Linearity,Planarity,Scattering,Verticality,dimensionality,KDistance,LocalReachabilityDistance,LocalOutlierFactor
count,59230280.0,59230280.0,59230280.0,59230280.0,59230280.0,59230280.0,59230280.0,59230280.0,59230280.0,59230280.0,59230280.0,59230280.0,59230280.0,59230280.0
mean,-0.3636104,0.6425691,2.924674,4.123769e-05,0.0007099464,0.0008860917,0.07621289,0.7373036,0.1864857,0.5494817,1.026142,0.05483446,19.36222,1.005634
std,4.689966,5.091767,2.90033,0.0002379863,0.0009485055,0.002511733,0.07156322,0.140642,0.09778208,0.2386656,0.2005919,0.02147392,3.759348,0.07447424
min,-7.5,-7.9,-1.5,0.0,6.911044e-05,0.0002178151,3.061334e-06,0.0002499594,0.0,0.0,0.0,0.03162252,0.4453758,0.9208325
25%,-4.4,-3.44,0.22,1.188882e-05,0.0004564159,0.0005322055,0.03378503,0.6971194,0.1307108,0.4272484,1.0,0.04472142,17.07607,0.996493
50%,-0.71,1.17,3.24,1.906704e-05,0.0005757517,0.0006744746,0.05663759,0.7717204,0.1680613,0.6714627,1.0,0.05099019,19.61972,1.00032
75%,3.57,5.2,5.49,3.101408e-05,0.0007622153,0.000883531,0.09161971,0.8221328,0.2190075,0.7116015,1.0,0.05916006,22.12029,1.005761
max,7.45,8.5,12.92,0.07859562,0.5565488,1.205022,0.9200419,0.999997,0.9765936,0.9992843,2.0,2.318297,31.15811,23.82281


In [50]:
# Original author's note: DO NOT RUN THIS AGAIN
# (presumably because the row-wise mdf.apply(dimensioner, axis=1) over the
# whole frame was extremely slow).
#
# Vectorised replacement: for each row take the position of the largest of
# (Linearity, Planarity, Scattering) -> 0, 1 or 2. For every row with a
# strict maximum this is exactly the label dimensioner returns. On ties
# np.argmax picks the first of the tied columns, whereas the row-wise
# version produced a missing label.
# NOTE(review): rows containing NaN in these columns are not treated
# specially here -- confirm none exist.
dimension_scores = mdf[['Linearity', 'Planarity', 'Scattering']].to_numpy()
mdf['dimensionality'] = np.argmax(dimension_scores, axis=1)

In [94]:
# Persist the full merged frame (index included) to CSV in the working directory.
mdf.to_csv(path_or_buf='thisisit.csv')

## SAMPLING FOR MODEL BUILDING

see pipeline outline above

In [95]:
# Number of rows per dimensionality label in the full frame.
mdf['dimensionality'].value_counts()

1    56806545
2     1986054
0      437679
Name: dimensionality, dtype: int64

In [97]:
# Independent copy of the rows labelled 0.
zer_df = mdf.loc[mdf['dimensionality'] == 0].copy()

In [98]:
# Independent copy of the rows labelled 1.
one_df = mdf.loc[mdf['dimensionality'] == 1].copy()

In [99]:
# Independent copy of the rows labelled 2.
two_df = mdf.loc[mdf['dimensionality'] == 2].copy()

In [100]:
# Downsample classes 1 and 2 to the size of class 0 so that all three
# classes contribute the same number of rows.
# The size is taken from zer_df instead of the hard-coded 437679 (the class-0
# count shown by value_counts), so it stays correct if the data changes.
# A fixed random_state makes the drawn sample reproducible across runs.
# NOTE(review): this assumes class 0 is the smallest class, as it is in the
# counts shown in this notebook.
n_per_class = len(zer_df)
one_df = one_df.sample(n=n_per_class, random_state=42)
two_df = two_df.sample(n=n_per_class, random_state=42)

In [101]:
# Stack the three per-class frames row-wise and renumber the index from 0.
# Note: this rebinds mini_df, which earlier held the first 200 rows of df2.
mini_df = pd.concat(objs=(zer_df, one_df, two_df), ignore_index=True)

In [102]:
# Check the number of rows per label in the resampled frame.
mini_df.dimensionality.value_counts()

2    437679
1    437679
0    437679
Name: dimensionality, dtype: int64

In [103]:
# Persist the resampled frame (index included) to CSV in the working directory.
mini_df.to_csv(path_or_buf='mini_df_resampled2.csv')