# accessing data

In [None]:
#!ls
#!pip install pandas
#!pip install scikit-learn

[30m[47m1_data_process.ipynb[m[m   [30m[47m3_neural_network.ipynb[m[m
[30m[47m2_data_explore.ipynb[m[m   [30m[47mREADME.md[m[m


In [3]:
import pandas as pd

df = pd.read_csv("/Users/alexanderfeil/Desktop/studies/2 Master/2 Fall Term/machine learning/project/FPA-FOD_reduced.csv", low_memory = False)
df = df.drop('Unnamed: 0', axis = 'columns') #drops the items where columns are unnamed


# Processing data

In [4]:
unknown_mask = df['NWCG_GENERAL_CAUSE'] == "Missing data/not specified/undetermined"

df_known = df[~unknown_mask].copy()
df_unknown = df[unknown_mask].copy()

In [5]:
X = df_known.drop(columns=['NWCG_GENERAL_CAUSE'])
y = df_known['NWCG_GENERAL_CAUSE']

In [6]:
X = X.fillna(X.median(numeric_only=True)) #replace NANs

for col in X.columns: #convert numeric values to floats, if possible
    if X[col].dtype == 'object':
        # try converting to float — if it works, keep it numeric
        try:
            X[col] = X[col].astype(float)
        except ValueError:
            pass


In [7]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_encoded = le.fit_transform(y) #encode the textual output

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

numeric_features = ['pr', 'tmmn', 'tmmx', 'rmin', 'rmax', 'sph', 'srad', 'etr', 'vpd',
                    'bi', 'erc', 'fm100', 'fm1000', 'EVC', 'EVT', 'EVH',
                    'Elevation', 'Slope', 'Aspect', 'TRI', 'TPI', 'Aridity_index',
                    'Population', 'GDP', 'LATITUDE', 'LONGITUDE', 'FIRE_YEAR','DISCOVERY_DOY', 'DISCOVERY_TIME']

categorical_features = ['STATE', 'COUNTY']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='median'), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Split train test

In [9]:
from sklearn.model_selection import train_test_split

# Combine X and y into one DataFrame temporarily
df2 = X.copy()
df2['cause'] = y_encoded

df_balanced = (
    df2.groupby('cause', group_keys=False)
      .apply(lambda x: x.sample(n=min(len(x), 10000), random_state=42))
      .reset_index(drop=True)
)

# Split back into X and y
X_balanced = df_balanced.drop(columns=['cause'])
y_balanced = df_balanced['cause']

# Now do your train-test split as usual
X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(
    X_balanced, y_balanced, test_size=0.2, random_state=42, stratify=y_balanced
)

print(X_train_df.shape, X_test_df.shape)
print(y_train_df.value_counts())

  df2.groupby('cause', group_keys=False)


(90186, 32) (22547, 32)
cause
11    8000
6     8000
10    8000
8     8000
0     8000
7     8000
2     8000
4     8000
1     8000
5     8000
9     8000
3     2186
Name: count, dtype: int64


In [10]:
cause_names = [
    "Debris and open burning",
    "Misuse of fire by a minor",
    "Equipment and vehicle use",
    "Fireworks",
    "Arson/incendiarism",
    "Power generation/transmission/distribution",
    "Railroad operations and maintenance",
    "Recreation and ceremony",
    "Natural",
    "Smoking",
    "Other causes",
    "Firearms and explosives use"
]

# Convert y_train value counts into a DataFrame for easier mapping
counts = y_train_df.value_counts().reset_index()
counts.columns = ['cause_id', 'count']

# Map encoded ID to name
counts['cause_name'] = counts['cause_id'].map(lambda i: cause_names[i])

print(counts[['cause_id', 'cause_name', 'count']].to_string(index=False))

 cause_id                                 cause_name  count
       11                Firearms and explosives use   8000
        6        Railroad operations and maintenance   8000
       10                               Other causes   8000
        8                                    Natural   8000
        0                    Debris and open burning   8000
        7                    Recreation and ceremony   8000
        2                  Equipment and vehicle use   8000
        4                         Arson/incendiarism   8000
        1                  Misuse of fire by a minor   8000
        5 Power generation/transmission/distribution   8000
        9                                    Smoking   8000
        3                                  Fireworks   2186


In [12]:
X_train_df.head()

Unnamed: 0,pr,tmmn,tmmx,rmin,rmax,sph,srad,etr,vpd,bi,...,Aridity_index,Population,GDP,LATITUDE,LONGITUDE,STATE,COUNTY,FIRE_YEAR,DISCOVERY_DOY,DISCOVERY_TIME
112173,0.0,289.299988,305.600006,34.5,78.099998,0.01047,279.0,5.9,1.68,22.0,...,0.75,0.038,29129.756,33.172944,-89.923202,MS,Holmes,1995,250,1400.0
104451,0.0,287.600006,303.399994,32.100002,95.700005,0.0101,256.300018,6.7,1.35,32.0,...,0.73,0.0,67082.42,39.77,-74.679,NJ,Burlington,2015,249,2300.0
57302,3.6,295.299988,308.600006,38.400002,100.0,0.01639,357.899994,8.2,1.56,0.0,...,0.42,0.0,50895.01,29.82139,-81.52,FL,St. Johns,2006,217,1600.0
100210,0.0,273.200012,286.600006,24.1,63.200001,0.00276,216.400009,3.8,0.67,33.0,...,0.26,0.0042,45981.348,47.6478,-114.3456,MT,,2020,82,2200.0
74912,0.0,274.600006,283.399994,36.400002,66.200005,0.0031,258.899994,5.3,0.49,59.0,...,0.78,0.015,42784.63,46.26215,-89.78343,WI,Vilas,2010,106,1342.0


In [13]:
y_train_df.head()

112173    11
104451    11
57302      6
100210    10
74912      8
Name: cause, dtype: int64

In [19]:
# Turn tables into CSV files

# X_train_df.to_csv("X_train_df.csv", index=False)
# y_train_df.to_csv("y_train_df.csv", index=False)
# X_test_df.to_csv("X_test_df.csv", index=False)
# y_test_df.to_csv("y_test_df.csv", index=False)

# load CSV files
X_train_df = pd.read_csv("X_train_df.csv")
y_train_df = pd.read_csv("y_train_df.csv")
X_test_df = pd.read_csv("X_test_df.csv")
y_test_df = pd.read_csv("y_test_df.csv")

### Standardizazion

In [23]:
# Standardization
X_train_df = X_train_df.select_dtypes(include=['number'])
y_train_df = y_train_df.select_dtypes(include=['number'])
X_test_df = X_test_df.select_dtypes(include=['number'])
X_train_df = X_train_df.select_dtypes(include=['number'])

mean, std = X_train_df.mean(), X_train_df.std()

X_train_df   = (X_train_df - mean)/std
X_test_df    = (X_test_df - mean)/std

X_train = X_train_df.to_numpy()
y_train = y_train_df.to_numpy()

X_test = X_test_df.to_numpy()


In [None]:
# set hyperparameters
n_neuron       = 64
activation     = 'Tanh' #'ReLU' #'Linear', 'Tanh'
num_epochs     = 50g
learning_rate  = 0.001
minibatch_size = 64
N_layers       = 2 # number of hidden layers