In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so that files stored in the Drive
# (the dataset CSV read in the next cell) are reachable through a normal path.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Location of the tornado-path CSV inside the mounted Google Drive
file_path = '/content/drive/My Drive/Conference paper/dataset/tornado_path.csv'

# Load the CSV into `df`; the preprocessing cell further down works on this frame
df = pd.read_csv(file_path)

# Last expression of the cell: renders the first five rows as a quick sanity check
df.head()


Unnamed: 0,storm_date,storm_time,time_zone_offset,state_abbreviation,state_name,state_fips_code,magnitude,injured_count,fatality_count,property_loss,crop_loss,yearly_tornado_count,start_lon,start_lat,end_long,end_lat,length,width,tornado_path_geom
0,2015-06-07,00:15:00,-06:00,IA,Iowa,19,0,0,0,0.015,0.004,575683,-94.0213,42.0995,-93.9673,42.1081,2.83,120.0,"LINESTRING(-94.0213 42.0995, -93.9673 42.1081)"
1,2015-11-11,14:00:00,-06:00,IA,Iowa,19,1,0,0,0.405,0.0,602617,-94.5585,40.7157,-94.3545,40.9904,21.8,1350.0,"LINESTRING(-94.5585 40.7157, -94.4565 40.85305..."
2,2016-09-21,17:32:00,-06:00,IA,Iowa,19,0,0,0,3000.0,3000.0,614379,-92.7308,42.9155,-92.7105,42.9341,1.65,150.0,"LINESTRING(-92.7308 42.9155, -92.7105 42.9341)"
3,2008-05-01,17:59:00,-06:00,IA,Iowa,19,2,0,0,0.51,0.0,553,-96.3,43.14,-96.42,43.3,12.95,1200.0,"LINESTRING(-96.3 43.14, -96.42 43.3)"
4,2017-06-28,16:05:00,-06:00,IA,Iowa,19,2,0,0,75000.0,10000.0,615497,-94.7771,40.5851,-94.6462,40.6043,6.99,3000.0,"LINESTRING(-94.7771 40.5851, -94.6462 40.6043)"


In [None]:
# 📌 Install necessary libraries if missing
!pip install tensorflow xgboost pandas scikit-learn --quiet

# 📌 Import libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import set_random_seed
from xgboost import XGBRegressor





# 📌 Display basic dataset information
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
print("Dataset Overview:")
df.info()

# NOTE: from here on this cell overwrites `df` (columns are converted and
# dropped), so it cannot be re-run on its own: a second run fails on the missing
# "storm_date" column. Re-run the data-loading cell first.

# 📌 Convert date column to numeric (days since earliest tornado)
df["storm_date"] = pd.to_datetime(df["storm_date"])
df["days_since_first_tornado"] = (df["storm_date"] - df["storm_date"].min()).dt.days

# 📌 Convert storm_time ("HH:MM:SS") to the hour of day
df["storm_time"] = pd.to_datetime(df["storm_time"], format='%H:%M:%S').dt.hour

# 📌 Drop columns that are not used as features
# ("storm_date" is dropped because it is replaced by days_since_first_tornado).
# Rebinding `df` instead of inplace=True keeps the step chain-friendly.
df = df.drop(columns=["state_name", "tornado_path_geom", "time_zone_offset", "storm_date"])

# 📌 Encode categorical variables
# Every remaining object-dtype column is replaced by integer codes; the fitted
# encoders are kept per column so codes can be mapped back to labels later.
# NOTE(review): the info() report lists `magnitude` (the target below) as object
# dtype, so it is label-encoded here as well. LabelEncoder numbers labels in
# sorted order, which is not guaranteed to follow tornado intensity -- confirm
# the raw magnitude values before interpreting the regression metrics.
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# 📌 Handle missing values (fill only numeric columns)
# Medians are computed over the full dataset, i.e. before the train/test split.
numeric_cols = df.select_dtypes(include=['number']).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# 📌 Define features (X) and target (y)
target_col = "magnitude"  # Change this to the actual target column for prediction
X = df.drop(columns=[target_col])
y = df[target_col]

# 📌 Hold out 20% of the rows for evaluation (fixed seed keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 📌 Standardize features
# The scaler learns its statistics from the training rows only and the same
# fitted scaler is then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 📌 Random Forest: fit on the scaled training rows, predict the held-out rows
rf_regressor = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train_scaled, y_train)
rf_preds = rf_regressor.predict(X_test_scaled)

# 📌 XGBoost: same train/predict flow as the Random Forest above
xgb_regressor = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
).fit(X_train_scaled, y_train)
xgb_preds = xgb_regressor.predict(X_test_scaled)

# 📌 Train & Evaluate LSTM Model
# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation, dropout masks and batch shuffling are repeatable between runs
# (the tree models above already pin random_state=42).
set_random_seed(42)

# Add a trailing axis of size 1: each sample becomes a sequence with one step per
# scaled feature and a single value per step, the 3-D layout the LSTM consumes.
X_train_lstm = X_train_scaled.reshape((X_train_scaled.shape[0], X_train_scaled.shape[1], 1))
X_test_lstm = X_test_scaled.reshape((X_test_scaled.shape[0], X_test_scaled.shape[1], 1))

# The input shape is declared with an explicit Input layer; passing input_shape
# to the first LSTM layer makes Keras emit a UserWarning.
lstm_model = Sequential([
    Input(shape=(X_train_lstm.shape[1], 1)),
    LSTM(50, activation='relu', return_sequences=True),  # passes the full sequence on to the next LSTM
    Dropout(0.2),
    LSTM(25, activation='relu'),
    Dense(1)  # single continuous output (regression)
])

lstm_model.compile(optimizer='adam', loss='mse', metrics=['mae'])
lstm_model.fit(X_train_lstm, y_train, epochs=20, batch_size=32, verbose=0)

# predict() returns one column per output unit; flatten to a 1-D vector so it
# lines up with y_test in the metric functions.
lstm_preds = lstm_model.predict(X_test_lstm).flatten()

# 📌 Compute Regression Metrics
# Map each model's display name to its held-out predictions so every metric is
# computed the same way for all of them (dict order fixes the row order).
model_predictions = {
    "Random Forest": rf_preds,
    "XGBoost": xgb_preds,
    "LSTM": lstm_preds,
}

metrics = {
    "Model": list(model_predictions),
    "MAE": [mean_absolute_error(y_test, p) for p in model_predictions.values()],
    # RMSE is the square root of the mean squared error
    "RMSE": [mean_squared_error(y_test, p) ** 0.5 for p in model_predictions.values()],
    "R2 Score": [r2_score(y_test, p) for p in model_predictions.values()],
}

# 📌 One row per model, one column per metric
metrics_df = pd.DataFrame(metrics)
print("\n📊 Model Performance Comparison:\n")
print(metrics_df)


Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16000 entries, 0 to 15999
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   storm_date            16000 non-null  object 
 1   storm_time            16000 non-null  object 
 2   time_zone_offset      16000 non-null  object 
 3   state_abbreviation    16000 non-null  object 
 4   state_name            16000 non-null  object 
 5   state_fips_code       16000 non-null  int64  
 6   magnitude             16000 non-null  object 
 7   injured_count         16000 non-null  int64  
 8   fatality_count        16000 non-null  int64  
 9   property_loss         16000 non-null  float64
 10  crop_loss             16000 non-null  float64
 11  yearly_tornado_count  16000 non-null  int64  
 12  start_lon             16000 non-null  float64
 13  start_lat             16000 non-null  float64
 14  end_long              16000 non-null  float64
 15  e

  super().__init__(**kwargs)


[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step

📊 Model Performance Comparison:

           Model       MAE      RMSE  R2 Score
0  Random Forest  0.332522  0.548497  0.650016
1        XGBoost  0.352326  0.537962  0.663331
2           LSTM  0.418998  0.652476  0.504746


In [None]:
import numpy as np
import pandas as pd

# MAE of each model, read from the metrics table built by the evaluation cell
# instead of being copied by hand, so the numbers always match the models that
# were actually trained in this session.
mae_values = metrics_df["MAE"].to_numpy(dtype=float)

# Normalise by the actual mean of the held-out target values.
y_test_mean = float(y_test.mean())

# MAE-based "accuracy": 1 - MAE / mean(y_test). This is a normalised-error score
# for a regression model, not a classification accuracy; it can be negative when
# the MAE exceeds the mean target value.
accuracy_values = 1 - (mae_values / (y_test_mean + 1e-8))  # Avoid division by zero

# Convert to DataFrame for better visualization (rows follow metrics_df order)
accuracy_results = {
    "Model": metrics_df["Model"].tolist(),
    "Accuracy": accuracy_values
}

accuracy_df = pd.DataFrame(accuracy_results)
print(accuracy_df)


           Model  Accuracy
0  Random Forest  0.667478
1        XGBoost  0.647674
2           LSTM  0.581002
