# Earthquake Dataset Analysis (Fixed Version)
This notebook performs Exploratory Data Analysis (EDA), Data Transformation with datetime features, and Feature Selection.

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer


In [None]:

# Read the earthquake CSV into a DataFrame.
# The path is relative, so the file must sit next to the notebook
# (or in the current working directory).
file_path = "earthquake-dataset.csv"
df = pd.read_csv(file_path)

# Show the first five rows as a quick sanity check of the parsed columns.
df.head(n=5)


In [None]:

# Column names, dtypes, non-null counts and memory usage (printed to stdout).
df.info()

# Descriptive statistics for every column (numeric and non-numeric alike),
# transposed so that each original column becomes one row of the summary.
df.describe(include="all").transpose()


In [None]:

# Visualise where values are missing: one heatmap cell per (row, column),
# coloured by whether the entry is null.
missing_mask = df.isnull()
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(missing_mask, cbar=False, cmap="viridis", ax=ax)
ax.set_title("Missing Values Heatmap")
plt.show()

# Share of missing entries per column, expressed as a percentage.
missing_mask.mean() * 100


In [None]:

# --- Datetime Feature Engineering ---
# Combine the Date and Time strings into a single timestamp.
# errors="coerce" turns unparseable rows into NaT instead of raising, so the
# derived Year/Month/Day/Hour values are NaN for those rows and get filled by
# the median imputation further down.
df["Datetime"] = pd.to_datetime(df["Date"] + " " + df["Time"], errors="coerce")

# Extract useful time features
df["Year"] = df["Datetime"].dt.year
df["Month"] = df["Datetime"].dt.month
df["Day"] = df["Datetime"].dt.day
df["Hour"] = df["Datetime"].dt.hour

# Drop original Date, Time, and Datetime columns + ID-like columns
df = df.drop(columns=["Date", "Time", "Datetime", "ID", "Source", "Location Source", "Magnitude Source"])

# Encode categorical variables as integer codes.
# astype(str) converts missing entries to the literal string "nan", so missing
# categories receive their own code rather than being imputed.
for col in df.select_dtypes(include="object").columns:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

numeric_cols = df.select_dtypes(include=np.number).columns

# The target must stay in its original units: the feature-selection step bins
# "Magnitude" with raw-magnitude edges (4.0 and 6.0). Standardising it to
# zero mean / unit variance would make those edges meaningless, so it is
# excluded from scaling. errors="ignore" keeps this cell usable on a frame
# that has no "Magnitude" column.
feature_cols = numeric_cols.drop("Magnitude", errors="ignore")

# Impute missing numeric values with median (all numeric columns, target included)
imputer = SimpleImputer(strategy="median")
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

# Scale the feature columns only; the target is left untouched (see above).
scaler = StandardScaler()
df[feature_cols] = scaler.fit_transform(df[feature_cols])

df.head()


In [None]:

# Pairwise correlation between all columns of the processed frame,
# drawn as an un-annotated heatmap on a diverging colour map.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, cmap="coolwarm", annot=False, ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


In [None]:

# --- Feature Selection ---
# Split the frame into the target column (Magnitude) and everything else.
y = df["Magnitude"]
X = df.drop(columns=["Magnitude"])

# Turn the continuous target into three ordered classes.
# Intervals are right-closed: (-inf, 4.0] -> Low, (4.0, 6.0] -> Medium,
# (6.0, inf) -> High.
# NOTE(review): the 4.0 / 6.0 edges are applied to df["Magnitude"] exactly as
# it is stored at this point; they are only meaningful if that column is
# still in raw magnitude units - confirm against the preprocessing step.
magnitude_bins = [-np.inf, 4.0, 6.0, np.inf]
magnitude_labels = ["Low", "Medium", "High"]
y_class = pd.cut(y, bins=magnitude_bins, labels=magnitude_labels)

# Fit a random forest purely to obtain impurity-based feature importances;
# random_state is fixed so the ranking is reproducible.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y_class)

# Rank the features from most to least important and plot them as
# horizontal bars (importance on x, feature name on y).
importances = pd.Series(rf.feature_importances_, index=X.columns)
ranked_importances = importances.sort_values(ascending=False)
plt.figure(figsize=(12, 6))
sns.barplot(x=ranked_importances, y=ranked_importances.index)
plt.title("Feature Importance from Random Forest")
plt.show()
