# ML - Fire and Weather Data 2008 - 2020

# Import and Cleaning Data

In [1]:
# Initial Import of Dependencies
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
import psycopg2 as pg
from collections import Counter

In [2]:
# External Database Connection
engine = pg.connect("dbname='d3r8dfuncb78iv' user='jrufhfiejfajri' host='ec2-52-200-155-213.compute-1.amazonaws.com' port='5432' password='9a7254d2151b5e3c280fe275dbba039acdc9190fbc167f64c564c449ca77af88'")
fire_df = pd.read_sql('select * from fw_combined_avgs', con=engine)

In [3]:
# Check Data Types
fire_df.dtypes

fire_year          int64
district          object
unit              object
fire_number       object
fire_name         object
legal             object
latitude          object
longitude         object
fuel_model        object
county            object
report_date       object
general_cause     object
odf_acres        float64
total_acres      float64
prcp_avg         float64
snow_avg         float64
snwd_avg         float64
tmax_avg         float64
tmin_avg         float64
dtype: object

In [4]:
# Filter fire_year to match weather data 2008 - 2020
fire_df = fire_df[(fire_df['fire_year'] >= 2008) & (fire_df['fire_year'] <= 2020)]


In [5]:
# Confirm Count Matches Fire_weather dataset 
fire_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12577 entries, 0 to 12576
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   fire_year      12577 non-null  int64  
 1   district       12577 non-null  object 
 2   unit           12577 non-null  object 
 3   fire_number    12577 non-null  object 
 4   fire_name      12577 non-null  object 
 5   legal          12573 non-null  object 
 6   latitude       12569 non-null  object 
 7   longitude      12569 non-null  object 
 8   fuel_model     12511 non-null  object 
 9   county         12574 non-null  object 
 10  report_date    12577 non-null  object 
 11  general_cause  12577 non-null  object 
 12  odf_acres      12577 non-null  float64
 13  total_acres    12515 non-null  float64
 14  prcp_avg       10829 non-null  float64
 15  snow_avg       10582 non-null  float64
 16  snwd_avg       10598 non-null  float64
 17  tmax_avg       10540 non-null  float64
 18  tmin_a

In [6]:
#. Check for null values
fire_df.isna().sum()

fire_year           0
district            0
unit                0
fire_number         0
fire_name           0
legal               4
latitude            8
longitude           8
fuel_model         66
county              3
report_date         0
general_cause       0
odf_acres           0
total_acres        62
prcp_avg         1748
snow_avg         1995
snwd_avg         1979
tmax_avg         2037
tmin_avg         2037
dtype: int64

In [7]:
# Converting Lat/Long DMS to Decimal
pattern = r'(?P<d>[\d\.]+).*?(?P<m>[\d\.]+).*?(?P<s>[\d\.]+)'

#Lat
dms = fire_df['latitude'].str.extract(pattern).astype(float)
fire_df['Latitude'] = dms['d'] + dms['m'].div(60) + dms['s'].div(3600)

#Long  
dms = fire_df['longitude'].str.extract(pattern).astype(float)
fire_df['Longitude'] = dms['d'] + dms['m'].div(60) + dms['s'].div(3600)

# Convert all Longtiude to Negative
fire_df['Longitude'] *= -1

fire_df.head(5)

Unnamed: 0,fire_year,district,unit,fire_number,fire_name,legal,latitude,longitude,fuel_model,county,...,general_cause,odf_acres,total_acres,prcp_avg,snow_avg,snwd_avg,tmax_avg,tmin_avg,Latitude,Longitude
0,2008,77 - South Cascade,Sweet Home,123,Bellinger Road,T12S R01W S14 SWNE,44° 31' 40.01,-122° 46' 12,L,LINN,...,Debris Burning,4.0,4.0,,,,,,44.527781,-122.77
1,2008,55 - West Oregon,Dallas,35,SHADY LANE NORTH,T09S R06W S01 NWSW,44° 48' 52.99,-123° 22' 35,L,POLK,...,Debris Burning,3.0,3.0,0.003077,0.0,20.0,56.75,38.0,44.814719,-123.376389
2,2008,72 - Coos,Bridge,41,Hwy 242 MP 17.2,T31S R12W S11 SENE,42° 53' 49.99,-124° 4' 59.02,F,COOS,...,Recreationist,0.0,0.01,0.001538,0.0,0.0,62.833333,40.5,42.897219,-124.083061
3,2008,95 - Central Oregon,Sisters,62,HWY 126,T15S R10E S10 SWNW,44° 17' 12.01,-121° 31' 45.98,L,DESCHUTES,...,Debris Burning,1.0,1.0,,,,,,44.286669,-121.529439
4,2008,73 - Douglas,South,208,Dads Union,T31S R08W S02 SWNE,42° 54' 23,-123° 36' 46.01,I,DOUGLAS,...,Recreationist,0.01,0.01,,,,,,42.906389,-123.612781


In [8]:
# Drop DMS Lat/Long/odf_acres
fire_df = fire_df.drop(['latitude', 'longitude', "odf_acres"], axis=1)
# Confirm Dropped Columns
fire_df.head(5)

Unnamed: 0,fire_year,district,unit,fire_number,fire_name,legal,fuel_model,county,report_date,general_cause,total_acres,prcp_avg,snow_avg,snwd_avg,tmax_avg,tmin_avg,Latitude,Longitude
0,2008,77 - South Cascade,Sweet Home,123,Bellinger Road,T12S R01W S14 SWNE,L,LINN,2008-02-15,Debris Burning,4.0,,,,,,44.527781,-122.77
1,2008,55 - West Oregon,Dallas,35,SHADY LANE NORTH,T09S R06W S01 NWSW,L,POLK,2008-02-27,Debris Burning,3.0,0.003077,0.0,20.0,56.75,38.0,44.814719,-123.376389
2,2008,72 - Coos,Bridge,41,Hwy 242 MP 17.2,T31S R12W S11 SENE,F,COOS,2008-03-09,Recreationist,0.01,0.001538,0.0,0.0,62.833333,40.5,42.897219,-124.083061
3,2008,95 - Central Oregon,Sisters,62,HWY 126,T15S R10E S10 SWNW,L,DESCHUTES,2008-03-10,Debris Burning,1.0,,,,,,44.286669,-121.529439
4,2008,73 - Douglas,South,208,Dads Union,T31S R08W S02 SWNE,I,DOUGLAS,2008-04-05,Recreationist,0.01,,,,,,42.906389,-123.612781


In [9]:
# Drop the null columns where all values are null
fire_df = fire_df.dropna(axis='columns', how='all')

# Drop the null rows
fire_df = fire_df.dropna()
fire_df.head(5)

Unnamed: 0,fire_year,district,unit,fire_number,fire_name,legal,fuel_model,county,report_date,general_cause,total_acres,prcp_avg,snow_avg,snwd_avg,tmax_avg,tmin_avg,Latitude,Longitude
1,2008,55 - West Oregon,Dallas,35,SHADY LANE NORTH,T09S R06W S01 NWSW,L,POLK,2008-02-27,Debris Burning,3.0,0.003077,0.0,20.0,56.75,38.0,44.814719,-123.376389
2,2008,72 - Coos,Bridge,41,Hwy 242 MP 17.2,T31S R12W S11 SENE,F,COOS,2008-03-09,Recreationist,0.01,0.001538,0.0,0.0,62.833333,40.5,42.897219,-124.083061
5,2008,51 - Tillamook,Tillamook,33,LAKE LYTLE,T02N R10W S29 NWSE,H,TILLAMOOK,2008-04-05,Juveniles,0.01,0.514,0.0,0.0,45.5,36.833333,45.628061,-123.933611
8,2008,71 - Southwest,Grants Pass,319,Westside Rd. 6880,T40S R08W S06 NWSE,H,JOSEPHINE,2008-04-12,Debris Burning,1.3,0.0,0.0,12.5,79.375,39.75,42.118061,-123.687781
10,2008,97 - Northeast Oregon,La Grande,49,Dingell Ford,T02N R41E S31 SESE,C,UNION,2008-04-12,Recreationist,0.01,0.0,0.0,35.714286,64.0,33.0,45.603331,-117.7275


In [10]:
fire_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10048 entries, 1 to 12576
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   fire_year      10048 non-null  int64  
 1   district       10048 non-null  object 
 2   unit           10048 non-null  object 
 3   fire_number    10048 non-null  object 
 4   fire_name      10048 non-null  object 
 5   legal          10048 non-null  object 
 6   fuel_model     10048 non-null  object 
 7   county         10048 non-null  object 
 8   report_date    10048 non-null  object 
 9   general_cause  10048 non-null  object 
 10  total_acres    10048 non-null  float64
 11  prcp_avg       10048 non-null  float64
 12  snow_avg       10048 non-null  float64
 13  snwd_avg       10048 non-null  float64
 14  tmax_avg       10048 non-null  float64
 15  tmin_avg       10048 non-null  float64
 16  Latitude       10048 non-null  float64
 17  Longitude      10048 non-null  float64
dtypes: flo

In [11]:
# Import LE to convert Fuel Model and General Cause to Numeric Values
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Converting Fuel Model to Numeric
label_encoder = LabelEncoder()
fire_df["fueltype_num"] = label_encoder.fit_transform(fire_df["fuel_model"])

In [12]:
# Alternative Model with 7 CLasses - Set and Classify Fire Sizes - https://www.nwcg.gov/term/glossary/size-class-of-fire
#fire_df.loc[fire_df['total_acres'] <= .25, 'fire_severity'] = 1
#fire_df.loc[(fire_df['total_acres'] > .25) & (fire_df['total_acres'] <= 10), 'fire_severity'] = 2
#fire_df.loc[(fire_df['total_acres'] > 10) & (fire_df['total_acres'] <= 100), 'fire_severity'] = 3
#fire_df.loc[(fire_df['total_acres'] > 100) & (fire_df['total_acres'] <= 300), 'fire_severity'] = 4
#fire_df.loc[(fire_df['total_acres'] > 300) & (fire_df['total_acres'] <= 1000), 'fire_severity'] = 5
#fire_df.loc[(fire_df['total_acres'] > 1000) & (fire_df['total_acres'] <= 5000), 'fire_severity'] = 6
#fire_df.loc[fire_df['total_acres'] > 5000, 'fire_severity'] = 7                                                   

In [13]:
# Set and Classify Fire Sizes - https://www.nwcg.gov/term/glossary/size-class-of-fire
fire_df.loc[fire_df['total_acres'] <= .25, 'fire_severity'] = 1
fire_df.loc[(fire_df['total_acres'] > .25) & (fire_df['total_acres'] <= 300), 'fire_severity'] = 2
fire_df.loc[fire_df['total_acres'] > 300, 'fire_severity'] = 3          

In [14]:
# Loss in data from additional null fields in weather data
print(fire_df['fire_severity'].value_counts())

1.0    7191
2.0    2715
3.0     142
Name: fire_severity, dtype: int64


In [15]:
fire_df.general_cause.unique()

array(['Debris Burning', 'Recreationist', 'Juveniles', 'Equipment Use',
       'Miscellaneous', 'Smoking', 'Arson', 'Lightning', 'Railroad',
       'Under Invest'], dtype=object)

In [16]:
# Change General Cause Human or Nature

fire_df['general_cause'] = fire_df['general_cause'].replace(['Recreationist','Equipment Use','Debris Burning', 'Smoking', 'Arson', 'Railroad', 'Juveniles'],'1')
fire_df['general_cause'] = fire_df['general_cause'].replace(['Lightning'],'2')
fire_df['general_cause'] = fire_df['general_cause'].replace(['Under Invest', 'Miscellaneous'],'3')


#  Random Forest Classifier

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [18]:
fire_binary_encoded = pd.get_dummies(fire_df, columns=["fuel_model", "general_cause"])
fire_binary_encoded.head()

Unnamed: 0,fire_year,district,unit,fire_number,fire_name,legal,county,report_date,total_acres,prcp_avg,...,fuel_model_J,fuel_model_K,fuel_model_L,fuel_model_R,fuel_model_T,fuel_model_U,fuel_model_X,general_cause_1,general_cause_2,general_cause_3
1,2008,55 - West Oregon,Dallas,35,SHADY LANE NORTH,T09S R06W S01 NWSW,POLK,2008-02-27,3.0,0.003077,...,0,0,1,0,0,0,0,1,0,0
2,2008,72 - Coos,Bridge,41,Hwy 242 MP 17.2,T31S R12W S11 SENE,COOS,2008-03-09,0.01,0.001538,...,0,0,0,0,0,0,0,1,0,0
5,2008,51 - Tillamook,Tillamook,33,LAKE LYTLE,T02N R10W S29 NWSE,TILLAMOOK,2008-04-05,0.01,0.514,...,0,0,0,0,0,0,0,1,0,0
8,2008,71 - Southwest,Grants Pass,319,Westside Rd. 6880,T40S R08W S06 NWSE,JOSEPHINE,2008-04-12,1.3,0.0,...,0,0,0,0,0,0,0,1,0,0
10,2008,97 - Northeast Oregon,La Grande,49,Dingell Ford,T02N R41E S31 SESE,UNION,2008-04-12,0.01,0.0,...,0,0,0,0,0,0,0,1,0,0


In [19]:
fire_binary_encoded = fire_binary_encoded.drop(['fueltype_num','district', 'unit', "fire_name", "legal", "report_date","fire_number", "county", "total_acres" ], axis=1)

In [20]:
fire_binary_encoded.head()

Unnamed: 0,fire_year,prcp_avg,snow_avg,snwd_avg,tmax_avg,tmin_avg,Latitude,Longitude,fire_severity,fuel_model_A,...,fuel_model_J,fuel_model_K,fuel_model_L,fuel_model_R,fuel_model_T,fuel_model_U,fuel_model_X,general_cause_1,general_cause_2,general_cause_3
1,2008,0.003077,0.0,20.0,56.75,38.0,44.814719,-123.376389,2.0,0,...,0,0,1,0,0,0,0,1,0,0
2,2008,0.001538,0.0,0.0,62.833333,40.5,42.897219,-124.083061,1.0,0,...,0,0,0,0,0,0,0,1,0,0
5,2008,0.514,0.0,0.0,45.5,36.833333,45.628061,-123.933611,1.0,0,...,0,0,0,0,0,0,0,1,0,0
8,2008,0.0,0.0,12.5,79.375,39.75,42.118061,-123.687781,2.0,0,...,0,0,0,0,0,0,0,1,0,0
10,2008,0.0,0.0,35.714286,64.0,33.0,45.603331,-117.7275,1.0,0,...,0,0,0,0,0,0,0,1,0,0


In [21]:
# Define the features set.
X = fire_binary_encoded
X = X.drop("fire_severity", axis=1)
X.head()

Unnamed: 0,fire_year,prcp_avg,snow_avg,snwd_avg,tmax_avg,tmin_avg,Latitude,Longitude,fuel_model_A,fuel_model_B,...,fuel_model_J,fuel_model_K,fuel_model_L,fuel_model_R,fuel_model_T,fuel_model_U,fuel_model_X,general_cause_1,general_cause_2,general_cause_3
1,2008,0.003077,0.0,20.0,56.75,38.0,44.814719,-123.376389,0,0,...,0,0,1,0,0,0,0,1,0,0
2,2008,0.001538,0.0,0.0,62.833333,40.5,42.897219,-124.083061,0,0,...,0,0,0,0,0,0,0,1,0,0
5,2008,0.514,0.0,0.0,45.5,36.833333,45.628061,-123.933611,0,0,...,0,0,0,0,0,0,0,1,0,0
8,2008,0.0,0.0,12.5,79.375,39.75,42.118061,-123.687781,0,0,...,0,0,0,0,0,0,0,1,0,0
10,2008,0.0,0.0,35.714286,64.0,33.0,45.603331,-117.7275,0,0,...,0,0,0,0,0,0,0,1,0,0


In [22]:
# Define the target set.
y = fire_binary_encoded["fire_severity"].ravel()
y[:5]

array([2., 1., 1., 2., 1.])

In [23]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [24]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [25]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=100, random_state=78) 

In [26]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [27]:
predictions = rf_model.predict(X_test_scaled)

In [28]:
predictions

array([1., 1., 1., ..., 1., 1., 1.])

In [29]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.08641834, 0.08124274, 0.00089707, 0.02264177, 0.14568435,
       0.15169648, 0.18032435, 0.18722791, 0.01431259, 0.00182649,
       0.01306641, 0.00886352, 0.00636343, 0.0129842 , 0.00521384,
       0.00781324, 0.00464381, 0.0124165 , 0.00522522, 0.00429784,
       0.00371953, 0.01518203, 0.01151776, 0.00823943, 0.00818112])

In [30]:
# Features Sorted by Importance
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.18722790883915472, 'Longitude'),
 (0.18032434955584034, 'Latitude'),
 (0.15169647793004584, 'tmin_avg'),
 (0.14568435451144446, 'tmax_avg'),
 (0.08641833871247334, 'fire_year'),
 (0.08124274338540229, 'prcp_avg'),
 (0.022641774502105385, 'snwd_avg'),
 (0.015182030174228138, 'fuel_model_X'),
 (0.014312592912996797, 'fuel_model_A'),
 (0.013066409863103727, 'fuel_model_C'),
 (0.012984201310892651, 'fuel_model_H'),
 (0.012416502907622843, 'fuel_model_L'),
 (0.011517762255311342, 'general_cause_1'),
 (0.00886351595059461, 'fuel_model_F'),
 (0.00823943377009912, 'general_cause_2'),
 (0.008181123291961856, 'general_cause_3'),
 (0.007813240062556065, 'fuel_model_J'),
 (0.006363429251607861, 'fuel_model_G'),
 (0.0052252204459964985, 'fuel_model_R'),
 (0.005213837721848564, 'fuel_model_I'),
 (0.004643814449125826, 'fuel_model_K'),
 (0.004297844554580528, 'fuel_model_T'),
 (0.0037195331400061934, 'fuel_model_U'),
 (0.0018264898158366534, 'fuel_model_B'),
 (0.0008970706851644453, 'snow_avg')]

In [31]:
# Legend For Fuel Models
#A	Annual grasses (cheat)
#B	Dense Chaparral
#C	Open pine, grass under
#F	Dense Brush (lighter than B)
#G	Conifer, Old growth
#H	Conifer, Second growth
#I	Slash, heavy
#J	Slash, medium
#K	Slash, thinning, P.C., Scattrd
#L	Grass Perennial
#R	Hardwood, summer
#T	Sagebrush, medium dense
#U	Closed canopy pine
#X	Non wildland fuel

In [32]:
# Legend For General Cause
# 1 = Human
# 2 = Nature
# 3 = Uncategorized

In [33]:
# Conusion Matrix to predict fire severity 
cm = confusion_matrix(y_test, predictions)
# DataFrame from the confusion matrix.
cm_df = pd.DataFrame(cm, index=["Class 1", "Class 2","Class 3"], columns=["Predicted 1", "Predicted 2","Predicted 3"])
cm_df

Unnamed: 0,Predicted 1,Predicted 2,Predicted 3
Class 1,1685,129,2
Class 2,511,153,0
Class 3,22,10,0


In [34]:
#Class 1 - one-quarter acre or less;
#Class 2 - more than one-quarter acre, but less than 300 acres;
#Class 3 - 300 acres or more.

In [35]:
#  # Alternative Model with 7 CLasses Conusion Matrix to predict fire severity 
#cm = confusion_matrix(y_test, predictions)
# DataFrame from the confusion matrix.
#cm_df = pd.DataFrame(cm, index=["Class 1", "Class 2","Class 3","Class 4","Class 5", "Class 6", "Class 7"], columns=["Predicted 1", "Predicted 2","Predicted 3","Predicted 4","Predicted 5", "Predicted 6","Predicted 7"])
#cm_df

In [36]:
#Class 1 - one-fourth acre or less;
#Class 2 - more than one-fourth acre, but less than 10 acres;
#Class 3 - 10 acres or more, but less than 100 acres;
#Class 4 - 100 acres or more, but less than 300 acres;
#Class 5 - 300 acres or more, but less than 1,000 acres;
#Class 6 - 1,000 acres or more, but less than 5,000 acres;
#Class 7 - 5,000 acres or more.

In [37]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [38]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 1,Predicted 2,Predicted 3
Class 1,1685,129,2
Class 2,511,153,0
Class 3,22,10,0


Accuracy Score : 0.731687898089172
Classification Report
              precision    recall  f1-score   support

         1.0       0.76      0.93      0.84      1816
         2.0       0.52      0.23      0.32       664
         3.0       0.00      0.00      0.00        32

    accuracy                           0.73      2512
   macro avg       0.43      0.39      0.39      2512
weighted avg       0.69      0.73      0.69      2512



# Combination Sampling With SMOTEENN

In [39]:
fire_smoteen = fire_df.drop(['district','fuel_model', 'unit', "fire_name", "legal", "report_date","fire_number", "county", "total_acres" ], axis=1)
fire_smoteen.head(5)

Unnamed: 0,fire_year,general_cause,prcp_avg,snow_avg,snwd_avg,tmax_avg,tmin_avg,Latitude,Longitude,fueltype_num,fire_severity
1,2008,1,0.003077,0.0,20.0,56.75,38.0,44.814719,-123.376389,9,2.0
2,2008,1,0.001538,0.0,0.0,62.833333,40.5,42.897219,-124.083061,3,1.0
5,2008,1,0.514,0.0,0.0,45.5,36.833333,45.628061,-123.933611,5,1.0
8,2008,1,0.0,0.0,12.5,79.375,39.75,42.118061,-123.687781,5,2.0
10,2008,1,0.0,0.0,35.714286,64.0,33.0,45.603331,-117.7275,2,1.0


In [40]:
x_cols = [i for i in fire_smoteen.columns if i not in ('fire_severity')]
X = fire_smoteen[x_cols]
y = fire_smoteen['fire_severity']

In [41]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [42]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [43]:
from imblearn.combine import SMOTEENN
smoteenn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smoteenn.fit_resample(X_train_scaled, y_train)
Counter(y_resampled)

Counter({1.0: 2206, 2.0: 3397, 3.0: 5304})

In [44]:
from sklearn.linear_model import LogisticRegression
smoteen_model = LogisticRegression(solver='lbfgs',max_iter=100)
smoteen_model.fit(X_resampled, y_resampled)

LogisticRegression()

In [50]:
from sklearn.metrics import confusion_matrix
y_pred = smoteen_model.predict(X_test_scaled)
confusion_matrix(y_test, y_pred)

array([[415, 712, 671],
       [ 88, 308, 283],
       [  1,   6,  28]])

In [51]:
#from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import balanced_accuracy_score
y_pred = smoteen_model.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred)

0.49480675359028164

In [52]:
# Display the confusion matrix
smoteen_cm = confusion_matrix(y_test, y_pred)
# DataFrame from the confusion matrix.
smoteen_cm_df = pd.DataFrame(cm, index=["Class 1", "Class 2","Class 3"], columns=["Predicted 1", "Predicted 2","Predicted 3"])
smoteen_cm_df

Unnamed: 0,Predicted 1,Predicted 2,Predicted 3
Class 1,1685,129,2
Class 2,511,153,0
Class 3,22,10,0


In [53]:
# # Alternative Model with 7 CLasses - Display the confusion matrix
#smoteen_cm = confusion_matrix(y_test, y_pred)
# DataFrame from the confusion matrix.
#smoteen_cm_df = pd.DataFrame(cm, index=["Class 1", "Class 2","Class 3","Class 4","Class 5", "Class 6", "Class 7"], columns=["Predicted 1", "Predicted 2","Predicted 3","Predicted 4","Predicted 5", "Predicted 6","Predicted 7"])
#smoteen_cm_df

In [54]:
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        1.0       0.82      0.23      0.88      0.36      0.45      0.19      1798
        2.0       0.30      0.45      0.61      0.36      0.53      0.27       679
        3.0       0.03      0.80      0.61      0.06      0.70      0.50        35

avg / total       0.67      0.30      0.80      0.36      0.47      0.22      2512

