<a href="https://colab.research.google.com/github/laylam02/Wildfire-Group-Project/blob/main/Wildfire_Group_Project_Saved.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Obtaining the Data

In [None]:
!wget https://media.githubusercontent.com/media/ulissigroup/F22-06-325/main/f22-06-325/projects/wildfires/data/BlodgettCombinedBlobTable.csv

--2022-10-03 18:45:28--  https://media.githubusercontent.com/media/ulissigroup/F22-06-325/main/f22-06-325/projects/wildfires/data/BlodgettCombinedBlobTable.csv
Resolving media.githubusercontent.com (media.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...
Connecting to media.githubusercontent.com (media.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7679879 (7.3M) [text/plain]
Saving to: ‘BlodgettCombinedBlobTable.csv’


2022-10-03 18:45:29 (37.4 MB/s) - ‘BlodgettCombinedBlobTable.csv’ saved [7679879/7679879]



# Loading Data Set

In [None]:
import pandas as pd
import numpy as np

# Column names for the raw blob table; they are passed to read_csv via `names=`.
col_names = ["Unused tags 1", "BlobID1", "Unused tags 2",
            "1D Retention Time (min)", "2D Retention Time (sec)",
            "Peak Height", "Peak Volume", "Peak volume/nearest internal standard peak volume",
            "Calculated d-alkane retention index", "matched retention index",
            "Unused tags 3", "Unused tags 4", "Unused tags 5",
            "BlobID_2", "Filter number", "Unused tags 6",
            "Mass concentration of compound (ng/m3)"]

# Columns that are discarded before any modelling.
unusedtags = ["Unused tags 1", "Unused tags 2", "Unused tags 3",
                "Unused tags 4", "Unused tags 5", "Unused tags 6"]

# import csv file (kept untouched so the cleaning step below can be re-run safely)
df_blobtable_raw = pd.read_csv("BlodgettCombinedBlobTable.csv", names = col_names)

# Cleaning, written as one chain instead of in-place mutation:
#   1. drop the unused tag columns
#   2. turn BOTH +inf and -inf into NaN (the previous code only handled +inf)
#   3. drop every row that still has a missing value
#   4. renumber the rows 0..n-1 so that label-based lookups such as series[i]
#      agree with positional ones (dropna leaves gaps in the index otherwise)
df_blobtable = (
    df_blobtable_raw
    .drop(columns = unusedtags)
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
    .reset_index(drop = True)
)

# Creating a Binary Representation
This binary representation assigns 1 when the sample was collected above ground and 0 when it was collected on the ground (filter numbers of 200 or higher). We initially used it as a simple way to determine and verify whether the drones were on the ground.



In [None]:
# create a binary representation of if the drones are elevated or on the ground
# Filter numbers at or above this threshold are encoded as 0 (on the ground);
# everything below it is encoded as 1 (elevated).
GROUND_FILTER_THRESHOLD = 200

# convert the column to an array
filters = np.array(df_blobtable['Filter number'])

# Vectorized threshold test; this also avoids a loop variable named `filter`,
# which would shadow the Python builtin for the rest of the notebook.
# `binary` stays a plain list of ints because a later cell sums it.
binary = np.where(filters >= GROUND_FILTER_THRESHOLD, 0, 1).tolist()

# create a new column
df_blobtable['Binary Elevation'] = binary

# Naive Model


In [None]:
# naive model: always predict the single most common filter number
filter_numbers = df_blobtable['Filter number']
most_common_filter = filter_numbers.mode()[0]

# Count the rows the naive prediction gets right with a vectorized comparison.
# The previous loop used filter_numbers[i], which is a LABEL lookup: once dropna()
# has removed rows the index has gaps and that lookup raises KeyError.
naive = int((filter_numbers == most_common_filter).sum())

# calculating the accuracy: correct predictions divided by the total number of samples
# (the old denominator, sum(binary), was only the number of elevated samples)
n_accuracy = naive / len(filter_numbers)
print('The accuracy for the naive model is:', round(n_accuracy, 3))

KeyError: ignored

# Logistic Regression

In [None]:
# logistic model
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import plotly.graph_objects as go

# Features: three peak measurements. Target: the 0/1 elevation flag.
x = df_blobtable[["Peak Height", "Peak Volume", "Mass concentration of compound (ng/m3)"]]
y = df_blobtable["Binary Elevation"]

# performing a 80/10/10 split:
# first keep 80 % for training, then halve the remaining 20 % into validation and test
x_train, x_testval, y_train, y_testval = train_test_split(
    x, y, train_size = 8/10, random_state = 42, shuffle = True)

x_val, x_test, y_val, y_test = train_test_split(
    x_testval, y_testval, test_size = 5/10, random_state = 42, shuffle = True)

# simple (unregularized) logistic regression on standardized features.
# `penalty = None` is the supported way to switch regularization off; the string
# "none" was deprecated in scikit-learn 1.2 and is rejected by current releases.
logistic_regression = make_pipeline(StandardScaler(), LogisticRegression(penalty = None))
log_fit = logistic_regression.fit(x_train, y_train)
accuracy = log_fit.score(x_val, y_val)
print('The accuracy for the logistic regression is:', round(accuracy, 3))

# making the parity plot (actual vs. predicted class on the validation set)
y_val_pred = log_fit.predict(x_val)
fig = go.Figure(data = [go.Scatter(x = y_val, y = y_val_pred, mode = "markers")])

# diagonal reference line: points on it are predicted correctly
fig.add_shape(type = "line", x0 = y_val.min(), y0 = y_val.min(), x1 = y_val.max(), y1 = y_val.max())

fig.update_xaxes(title_text = "Actual Value")
fig.update_yaxes(title_text = "Predicted Value")

# set the plot size
fig.update_layout(title = 'Parity Plot for Logistic Regression', autosize = False, width = 400, height = 400)
fig.show()

The accuracy for the logistic regression is: 0.643


# Default Decision Tree Regressor

In [None]:
# default decision tree regressor
from sklearn.tree import DecisionTreeRegressor

# Seed the tree so that re-running the notebook reproduces the same fit
# (same seed as the train/validation/test splits).
dt_model = DecisionTreeRegressor(random_state = 42)

dt_model = dt_model.fit(x_train, y_train)

# For a regressor, .score() returns the coefficient of determination (R^2), not a
# classification accuracy. R^2 can be negative, so it is reported under its real name.
r2 = dt_model.score(x_val, y_val)
print('The R^2 score for the default decision tree regressor is:', round(r2, 3))

# making the parity plot (actual vs. predicted value on the validation set)
y_val_pred = dt_model.predict(x_val)
fig = go.Figure(data = [go.Scatter(x = y_val, y = y_val_pred, mode = "markers")])

# diagonal reference line: points on it are predicted exactly
fig.add_shape(type = "line", x0 = y_val.min(), y0 = y_val.min(), x1 = y_val.max(), y1 = y_val.max())

fig.update_xaxes(title_text = "Actual Value")
fig.update_yaxes(title_text = "Predicted Value")

# set the plot size
fig.update_layout(title = 'Parity Plot for Decision Tree Model', autosize = False, width = 400, height = 400)
fig.show()

The accuracy for the default decision tree regressor is: -0.839


# Creating an Altitude Representation
We are creating an extra column in the dataframe for altitude instead of a binary representation. The corresponding altitudes are taken from the file "Filters vs. forest plot number.xlsx".

In [None]:
# create a new altitude column for if the drones are elevated or on the ground

# Lookup table: filter number -> altitude. Any filter number that is not listed
# here is given altitude 0.
FILTER_ALTITUDE = {
    1: 50, 2: 53, 3: 41, 4: 14, 5: 50, 6: 40,
    7: 32, 8: 35, 9: 50, 10: 20, 11: 35, 12: 30,
    13: 20, 14: 70, 15: 95, 16: 100, 17: 100, 18: 72,
    19: 70, 20: 60, 21: 60, 22: 60,
}
DEFAULT_ALTITUDE = 0

# convert the column to an array
filters = np.array(df_blobtable['Filter number'])

# One dictionary lookup per row replaces the 22-branch if/elif chain. The loop
# variable is scoped to the comprehension, so nothing named `filter` leaks out
# and shadows the Python builtin.
alt = [FILTER_ALTITUDE.get(filter_number, DEFAULT_ALTITUDE) for filter_number in filters]

# create a new column
df_blobtable['Altitude'] = alt

df_blobtable

Unnamed: 0,BlobID1,1D Retention Time (min),2D Retention Time (sec),Peak Height,Peak Volume,Peak volume/nearest internal standard peak volume,Calculated d-alkane retention index,matched retention index,BlobID_2,Filter number,Mass concentration of compound (ng/m3),Binary Elevation,Altitude
0,181,40.608204,1.026330,105.022469,1263.347317,1.000000,1652.439024,1653.0,0,201,0.000000,0,0
1,1553,40.037744,1.414941,31.052483,479.394947,1.000000,1634.146341,1633.0,0,201,0.000000,0,0
2,62,63.502673,1.135938,8.124082,135.140907,1.000000,2524.561404,2520.0,0,201,0.000000,0,0
3,776,27.259436,1.355154,281.546460,3376.619472,1.000000,1263.387978,1277.0,0,201,0.000000,0,0
4,61,32.013271,0.328824,118.679093,946.665804,1.000000,1400.000000,1800.0,0,201,0.000000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
44635,2811,40.312724,0.981316,4.096229,57.013969,0.019236,1644.171779,1646.0,176,9,0.783983,1,50
44636,2812,40.274506,1.131518,155.203347,1839.028248,0.620461,1642.944785,1642.0,176,9,160.803335,1,50
44637,2813,40.045199,0.891195,201.872901,2958.995344,0.998322,1635.582822,1637.0,176,9,24.473803,1,50
44638,2818,45.892543,0.931249,6.134070,124.792845,0.043840,1825.675676,1824.0,176,9,1.330786,1,50


In [None]:
# Features: the three peak measurements. Target: the altitude column.
feature_columns = ["Peak Height", "Peak Volume", "Mass concentration of compound (ng/m3)"]
x = df_blobtable.loc[:, feature_columns]
y = df_blobtable.loc[:, "Altitude"]

# 80/10/10 split, step 1: 80 % of the rows go to training, 20 % are held out
x_train, x_testval, y_train, y_testval = train_test_split(
    x, y, train_size = 0.8, shuffle = True, random_state = 42)

# step 2: the held-out rows are divided evenly into validation and test sets
x_val, x_test, y_val, y_test = train_test_split(
    x_testval, y_testval, test_size = 0.5, shuffle = True, random_state = 42)

# Loading filter height data

In [None]:
# default decision tree regressor
# Seeded so that re-running the notebook reproduces the same fit
# (same seed as the train/validation/test splits).
dt_model = DecisionTreeRegressor(random_state = 42)

dt_model = dt_model.fit(x_train, y_train)

# For a regressor, .score() returns the coefficient of determination (R^2), not an
# accuracy. R^2 can be negative, so it is reported under its real name.
r2 = dt_model.score(x_val, y_val)
print('The R^2 score for the default decision tree regressor is:', round(r2, 3))

# making the parity plot (actual vs. predicted value on the validation set)
y_val_pred = dt_model.predict(x_val)
fig = go.Figure(data = [go.Scatter(x = y_val, y = y_val_pred, mode = "markers")])

# diagonal reference line: points on it are predicted exactly
fig.add_shape(type = "line", x0 = y_val.min(), y0 = y_val.min(), x1 = y_val.max(), y1 = y_val.max())

fig.update_xaxes(title_text = "Actual Value")
fig.update_yaxes(title_text = "Predicted Value")

# set the plot size
fig.update_layout(title = 'Parity Plot for Decision Tree Model', autosize = False, width = 400, height = 400)
fig.show()

The accuracy for the defaul decision tree regressor is: -0.884


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

# Degree-4 polynomial regression.
# - The earlier bare `LinearRegression()` assignment was dead code (it was overwritten
#   on the next line) and has been removed.
# - PolynomialFeatures already emits the constant (bias) column, so LinearRegression
#   is fitted without its own intercept.
# - The inputs are standardized before they are raised to the 4th power. The
#   polynomial space is the same, but the design matrix is far better conditioned
#   than one built from raw peak heights/volumes. This mirrors the logistic
#   regression pipeline, which also scales its inputs.
linear_reg_model = make_pipeline(
    StandardScaler(),
    PolynomialFeatures(4),
    LinearRegression(fit_intercept = False))
linear_reg_model.fit(x_train, y_train)

y_val_pred = linear_reg_model.predict(x_val)

# For a regressor, .score() returns the coefficient of determination (R^2), not an
# accuracy. R^2 can be negative, so it is reported under its real name.
r2 = linear_reg_model.score(x_val, y_val)

mae = mean_absolute_error(y_val, y_val_pred)

print('The R^2 score for the linear regression model is', round(r2, 3))
print('The MAE for the linear regression model is', round(mae, 3))

# making the parity plot (actual vs. predicted value on the validation set)
fig = go.Figure(data = [go.Scatter(x = y_val, y = y_val_pred, mode = "markers")])

# diagonal reference line: points on it are predicted exactly
fig.add_shape(type = "line", x0 = y_val.min(), y0 = y_val.min(), x1 = y_val.max(), y1 = y_val.max())

fig.update_xaxes(title_text = "Actual Value")
fig.update_yaxes(title_text = "Predicted Value")

# set the plot size
fig.update_layout(title = 'Parity Plot for Linear Regression', autosize = False, width = 400, height = 400)
fig.show()

The accuracy for the linear regression model is -220.737
The MAE for the linear regression model is 43.567


In [None]:
# Read the "Flight Log" sheet of the run-log workbook into a dataframe.
# NOTE(review): no visible cell downloads Run_Log.xlsx - confirm it is uploaded to
# the working directory before this cell runs.
flight_log_workbook = "Run_Log.xlsx"
df_filter_flight = pd.read_excel(flight_log_workbook, sheet_name="Flight Log")

# show the loaded sheet
df_filter_flight