In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib

# Load the dataset
file_path = '/content/synthetic_disaster_data.xlsx'
data = pd.read_excel(file_path)

# Display the first few rows of the dataset
print(data.head())

# Data preprocessing
# Identify numerical and categorical features
numerical_features = ['Latitude', 'Longitude', 'Elevation', 'Temperature', 'Humidity', 'Wind Speed', 'Precipitation', 'Seismic Activity', 'Proximity to Fault Line', 'Population Density', 'Building Density']
categorical_features = ['City', 'Region', 'Soil Composition', 'Proximity to Water Body', 'Land Use', 'Vegetation Cover', 'Infrastructure Condition', 'Industrial Activity']

# Define target variables
target_occurrence = 'Disaster Occurrence'
target_type = 'Disaster Type'

# Split the data
X = data[numerical_features + categorical_features]
y_occurrence = data[target_occurrence]
y_type = data[target_type]

# Preprocessing pipelines for numerical and categorical data
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the model
model_occurrence = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

model_type = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

# Split the data into training and testing sets
X_train, X_test, y_train_occurrence, y_test_occurrence = train_test_split(X, y_occurrence, test_size=0.2, random_state=42)
X_train, X_test, y_train_type, y_test_type = train_test_split(X, y_type, test_size=0.2, random_state=42)

# Train the models
model_occurrence.fit(X_train, y_train_occurrence)
model_type.fit(X_train, y_train_type)

# Save the models
joblib.dump(model_occurrence, 'model_occurrence.pkl')
joblib.dump(model_type, 'model_type.pkl')


        Date   Time  Latitude  Longitude         City           Region  \
0 2024-01-18  00:01   50.8755    24.1498       Sydney  New South Wales   
1 2024-03-01  15:14    7.2497   -22.4738       London          England   
2 2024-10-12  15:46   66.8118   174.7609  Los Angeles       California   
3 2024-07-10  15:55  -36.0468  -149.0051       London          England   
4 2024-11-11  13:32  -89.4221  -164.5041     Mysuru         Mysuru   

   Elevation  Temperature  Humidity  Wind Speed  ...  Proximity to Water Body  \
0       98.3         30.4        16          14  ...                       94   
1       21.9         16.0        56          12  ...                       79   
2       82.4         39.7        35           0  ...                       78   
3       89.3          9.7        58          10  ...                        0   
4       11.9          6.7        81           8  ...                       66   

       Land Use  Vegetation Cover Population Density  Building Density  

ValueError: could not convert string to float: 'High'

# **Sample Data** **Generation**

In [6]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Function to generate random date and time
def random_date(start, end):
    return start + timedelta(days=random.randint(0, int((end - start).days)))

# Generating synthetic data
num_samples = 50
start_date = datetime.strptime('2024-01-01', '%Y-%m-%d')
end_date = datetime.strptime('2024-12-31', '%Y-%m-%d')

data = []
cities = ['Los Angeles', 'Mysuru', 'Tokyo', 'London', 'Sydney']
regions = ['California', 'Mysuru', 'Kanto', 'England', 'New South Wales']
soil_compositions = ['Clay', 'Sandy', 'Rocky', 'Silt']
land_uses = ['Urban', 'Rural', 'Forest', 'Agricultural']
vegetation_covers = ['Low', 'Medium', 'High']
building_densities = ['Low', 'Medium', 'High', 'Very High']
infrastructure_conditions = ['Poor', 'Fair', 'Good', 'Excellent']
industrial_activities = ['Low', 'Medium', 'High', 'Very High']
disaster_types = ['None', 'Flood', 'Earthquake', 'Industrial']

for _ in range(num_samples):
    date = random_date(start_date, end_date)
    time = f"{random.randint(0, 23):02d}:{random.randint(0, 59):02d}"
    latitude = round(np.random.uniform(-90, 90), 4)
    longitude = round(np.random.uniform(-180, 180), 4)
    city = random.choice(cities)
    region = regions[cities.index(city)]
    elevation = round(np.random.uniform(0, 100), 1)
    temperature = f"{round(np.random.uniform(-10, 40), 1)}"#°C
    humidity = f"{random.randint(10, 100)}"#%"
    wind_speed = f"{random.randint(0, 20)}"# km/h"
    precipitation = f"{random.randint(0, 100)}"# mm"
    seismic_activity = round(np.random.uniform(0, 10), 1)
    proximity_to_fault_line = f"{random.randint(0, 500)}"# km"
    soil_composition = random.choice(soil_compositions)
    proximity_to_water_body = f"{random.randint(0, 100)} "#km"
    land_use = random.choice(land_uses)
    vegetation_cover = random.choice(vegetation_covers)
    population_density = random.randint(0, 20000)
    building_density = random.choice(building_densities)
    infrastructure_condition = random.choice(infrastructure_conditions)
    industrial_activity = random.choice(industrial_activities)
    historical_incidents = random.randint(0, 10)
    disaster_occurrence = random.randint(0, 1)
    disaster_type = 'None' if disaster_occurrence == 0 else random.choice(disaster_types[1:])

    data.append([date, time, latitude, longitude, city, region, elevation, temperature, humidity, wind_speed, precipitation,
                 seismic_activity, proximity_to_fault_line, soil_composition, proximity_to_water_body, land_use,
                 vegetation_cover, population_density, building_density, infrastructure_condition, industrial_activity,
                 historical_incidents, disaster_occurrence, disaster_type])

# Create a DataFrame
columns = ['Date', 'Time', 'Latitude', 'Longitude', 'City', 'Region', 'Elevation', 'Temperature', 'Humidity', 'Wind Speed',
           'Precipitation', 'Seismic Activity', 'Proximity to Fault Line', 'Soil Composition', 'Proximity to Water Body',
           'Land Use', 'Vegetation Cover', 'Population Density', 'Building Density', 'Infrastructure Condition',
           'Industrial Activity', 'Historical Incidents', 'Disaster Occurrence', 'Disaster Type']

df = pd.DataFrame(data, columns=columns)

# Save to Excel
file_path = '/content/synthetic_disaster_data.xlsx'
df.to_excel(file_path, index=False)

print(f"Synthetic data saved to {file_path}")


Synthetic data saved to /content/synthetic_disaster_data.xlsx
