In [9]:
import numpy as np
import pandas as pd

# Load the dataset. low_memory=False makes pandas parse the file in one pass
# instead of in chunks, which avoids mixed-dtype inference within a column.
df = pd.read_csv("FPA-FOD_reduced.csv", low_memory = False)

In [3]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0.1,Unnamed: 0,pr,tmmn,tmmx,rmin,rmax,sph,srad,etr,vpd,...,Population,GDP,LATITUDE,LONGITUDE,STATE,COUNTY,FIRE_YEAR,DISCOVERY_DOY,DISCOVERY_TIME,NWCG_GENERAL_CAUSE
0,0,,,,,,,,,,...,0.0,51481.17,21.574287,-158.10704,HI,Oahu,2007,1,,Missing data/not specified/undetermined
1,1,0.0,269.299988,281.799988,36.700001,92.099998,0.00282,129.100006,2.2,0.35,...,0.0165,47810.75,32.36083,-98.08985,TX,Erath,2007,1,,Debris and open burning
2,2,0.0,276.5,287.299988,41.799999,90.0,0.00448,132.900009,3.6,0.47,...,2.6456,55688.996,36.487061,-121.934647,CA,Monterey,2007,1,1537.0,Misuse of fire by a minor
3,3,0.0,275.100006,286.700012,23.9,75.200005,0.00323,144.300003,3.1,0.61,...,1.1297,47810.75,28.89387,-98.53885,TX,Atascosa,2007,1,,Equipment and vehicle use
4,4,0.9,288.299988,300.299988,49.0,98.200005,0.01116,109.5,4.5,0.85,...,0.1981,50591.914,27.50861,-80.75861,FL,Okeechobee,2007,1,1022.0,Fireworks


In [10]:
# Remove the 'Unnamed: 0' column (presumably a row index that was saved into
# the CSV -- confirm against how the file was produced).
# errors='ignore' keeps this cell safe to re-run: on a second execution the
# column is already gone and a plain drop would raise KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [11]:
# List the column labels currently present in df.
df.columns

Index(['pr', 'tmmn', 'tmmx', 'rmin', 'rmax', 'sph', 'srad', 'etr', 'vpd', 'bi',
       'erc', 'fm100', 'fm1000', 'NDVI_mean', 'EVC', 'EVT', 'EVH', 'Elevation',
       'Slope', 'Aspect', 'TRI', 'TPI', 'Aridity_index', 'Population', 'GDP',
       'LATITUDE', 'LONGITUDE', 'STATE', 'COUNTY', 'FIRE_YEAR',
       'DISCOVERY_DOY', 'DISCOVERY_TIME', 'NWCG_GENERAL_CAUSE'],
      dtype='object')

In [13]:
# Show every distinct label in the target column, one per indented line,
# in order of first appearance.
print("Causes:")

for cause in df["NWCG_GENERAL_CAUSE"].unique():
    print(f"\t {cause}")

Causes:
	 Missing data/not specified/undetermined
	 Debris and open burning
	 Misuse of fire by a minor
	 Equipment and vehicle use
	 Fireworks
	 Arson/incendiarism
	 Power generation/transmission/distribution
	 Railroad operations and maintenance
	 Recreation and ceremony
	 Natural
	 Smoking
	 Other causes
	 Firearms and explosives use


# Processing data

In [12]:
# Boolean mask that is True for rows whose cause is the "unknown" placeholder.
unknown_mask = df['NWCG_GENERAL_CAUSE'].eq("Missing data/not specified/undetermined")

# Independent copies so later edits to either subset never touch df.
df_unknown = df.loc[unknown_mask].copy()
df_known = df.loc[~unknown_mask].copy()

In [13]:
# Target: the cause label of each row with a known cause.
y = df_known['NWCG_GENERAL_CAUSE']
# Features: every remaining column.
X = df_known.drop('NWCG_GENERAL_CAUSE', axis=1)

In [20]:
# Replace NaNs in numeric columns with that column's median; numeric_only=True
# means non-numeric columns contribute no fill value and are left as they are.
# NOTE(review): the medians are computed on the full feature frame, i.e. before
# any train/test split -- confirm this is intended.
X = X.fillna(X.median(numeric_only=True))

# Best-effort cast: an object column whose values all parse as numbers becomes
# float; a column that fails to parse keeps its object dtype.
for col in X.columns:
    if X[col].dtype != 'object':
        continue
    try:
        X[col] = X[col].astype(float)
    except ValueError:
        continue


In [15]:
from sklearn.preprocessing import LabelEncoder
# Learn the mapping from cause label to integer code. The fitted encoder stays
# available as `le` so codes can be translated back to label text later.
le = LabelEncoder().fit(y)
# Integer-encoded version of the textual target.
y_encoded = le.transform(y)

In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Columns that go through median imputation. The order here is the order in
# which they are handed to the 'num' transformer.
numeric_features = [
    'pr', 'tmmn', 'tmmx', 'rmin', 'rmax',
    'sph', 'srad', 'etr', 'vpd', 'bi',
    'erc', 'fm100', 'fm1000', 'NDVI_mean', 'EVC',
    'EVT', 'EVH', 'Elevation', 'Slope', 'Aspect',
    'TRI', 'TPI', 'Aridity_index', 'Population', 'GDP',
    'LATITUDE', 'LONGITUDE', 'FIRE_YEAR', 'DISCOVERY_DOY', 'DISCOVERY_TIME',
]

# Columns that are one-hot encoded.
categorical_features = ['STATE', 'COUNTY']

# handle_unknown='ignore' lets the encoder accept categories at transform time
# that were not present when it was fitted.
preprocessor = ColumnTransformer([
    ('num', SimpleImputer(strategy='median'), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

In [28]:
import ast

def fix_ndvi(cell):
    """Collapse one raw NDVI_mean cell into a single number.

    The column holds text such as "'0.25' '0.32' '0.35'": individually quoted
    numbers separated by whitespace. Parsing that text with a literal
    evaluator yields a tuple of *strings*, and averaging strings fails, so
    every such row used to end up as NaN. Here the text is tokenized and each
    token is converted to float explicitly before averaging.

    Parameters
    ----------
    cell : str, list, numpy.ndarray, or any scalar
        Raw cell value. Strings may use quotes, brackets, parentheses, commas
        and/or whitespace around the numbers.

    Returns
    -------
    float or the input itself
        * str input: mean of the parsed numbers, or NaN if the string is
          empty or contains a token that is not a number.
        * list / ndarray input: ``np.mean`` of the values.
        * anything else (e.g. an existing float or NaN): returned unchanged.
    """
    if isinstance(cell, str):
        # Turn quoting and delimiter characters into whitespace so that only
        # the number tokens are left after split().
        cleaned = cell
        for delimiter in "'\"[](),":
            cleaned = cleaned.replace(delimiter, ' ')
        tokens = cleaned.split()
        if not tokens:
            # Nothing to average; avoid np.mean's empty-slice warning.
            return np.nan
        try:
            values = [float(token) for token in tokens]
        except (ValueError, TypeError):
            # Unparseable content stays best-effort: report it as missing.
            return np.nan
        return np.mean(values)
    elif isinstance(cell, (list, np.ndarray)):
        return np.mean(cell)
    else:
        return cell

# Reduce each NDVI_mean cell (a string holding several numbers) to one value
# by running fix_ndvi over the column element-wise.
X['NDVI_mean'] = X['NDVI_mean'].map(fix_ndvi)

# Model

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Random forest with 200 trees, no depth cap, a fixed seed for repeatable
# results, and n_jobs=-1 to train on all available cores.
rf = RandomForestClassifier(n_estimators=200, max_depth=None, random_state=42, n_jobs=-1)

# Single estimator that applies the column preprocessing and then the forest.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', rf),
])

# Train/test split and model fitting

In [22]:
# Preview the feature frame before it is split and fed to the model.
X.head()

Unnamed: 0,pr,tmmn,tmmx,rmin,rmax,sph,srad,etr,vpd,bi,...,Aridity_index,Population,GDP,LATITUDE,LONGITUDE,STATE,COUNTY,FIRE_YEAR,DISCOVERY_DOY,DISCOVERY_TIME
1,0.0,269.299988,281.799988,36.700001,92.099998,0.00282,129.100006,2.2,0.35,28.0,...,0.58,0.0165,47810.75,32.36083,-98.08985,TX,Erath,2007,1,1455.0
2,0.0,276.5,287.299988,41.799999,90.0,0.00448,132.900009,3.6,0.47,38.0,...,0.16,2.6456,55688.996,36.487061,-121.934647,CA,Monterey,2007,1,1537.0
3,0.0,275.100006,286.700012,23.9,75.200005,0.00323,144.300003,3.1,0.61,29.0,...,0.58,1.1297,47810.75,28.89387,-98.53885,TX,Atascosa,2007,1,1455.0
4,0.9,288.299988,300.299988,49.0,98.200005,0.01116,109.5,4.5,0.85,0.0,...,0.48,0.1981,50591.914,27.50861,-80.75861,FL,Okeechobee,2007,1,1022.0
6,0.0,269.600006,282.399994,29.1,79.700005,0.0025,136.900009,2.7,0.44,34.0,...,0.36,1.6947,47810.75,31.42414,-100.2925,TX,Tom Green,2007,1,1455.0


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the known-cause rows for evaluation, with a fixed seed.
# stratify=y_encoded keeps each cause's share the same in both partitions, so
# rare causes are not over- or under-represented in the test set by chance.
# (Stratifying requires at least two rows per class.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Fit preprocessing and classifier together, on the training partition only.
model.fit(X_train, y_train)



In [25]:
# For every column still stored as object dtype, print its name and up to ten
# of its distinct values to see what is blocking numeric conversion.
for col in X.select_dtypes(include='object'):
    print(col, X[col].unique()[:10])


NDVI_mean ["'0.25' '0.32' '0.35' '0.31' '0.26' '0.28' '0.36' '0.4' '0.31' '0.19' '0.16' '0.19'"
 "'0.28' '0.25' '0.39' '0.47' '0.53' '0.53' '0.49' '0.52' '0.32' '0.2' '0.32' '0.38'"
 "'0.13' '0.28' '0.26' '0.23' '0.23' '0.26' '0.24' '0.2' '0.21' '0.16' '0.14' '0.22'"
 "'0.18' '0.28' '0.31' '0.28' '0.22' '0.21' '0.25' '0.36' '0.29' '0.33' '0.31' '0.31'"
 "'0.18' '0.2' '0.23' '0.25' '0.19' '0.2' '0.2' '0.2' '0.2' '0.15' '0.16' '0.17'"
 "'0.14' '0.19' '0.2' '0.31' '0.45' '0.53' '0.4' '0.25' '0.2' '0.1' '0.17' '0.14'"
 "'0.25' '0.28' '0.37' '0.49' '0.55' '0.5' '0.49' '0.35' '0.18' '0.12' '0.3' '0.22'"
 "'0.31' '0.36' '0.38' '0.34' '0.41' '0.29' '0.3' '0.4' '0.38' '0.34' '0.26' '0.29'"
 "'0.23' '0.21' '0.39' '0.45' '0.48' '0.52' '0.46' '0.43' '0.24' '0.13' '0.33' '0.1'"
 "'0.11' '0.13' '0.15' '0.16' '0.19' '0.18' '0.19' '0.29' '0.32' '0.22' '0.28' '0.17'"]
STATE ['TX' 'CA' 'FL' 'NE' 'AZ' 'MN' 'UT' 'GA' 'AL' 'OK']
COUNTY ['Erath' 'Monterey' 'Atascosa' 'Okeechobee' 'Tom Green' nan 'Fresno'
 '

In [27]:
# Look at a single raw NDVI_mean cell to see the exact text format.
X['NDVI_mean'].iloc[0]

"'0.25' '0.32' '0.35' '0.31' '0.26' '0.28' '0.36' '0.4' '0.31' '0.19' '0.16' '0.19'"