<a href="https://colab.research.google.com/github/hayleypc/HawaiiClimate/blob/main/SOC_gan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
from tensorflow.keras import layers, models


In [9]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
import geopandas as gpd

In [2]:

# Define the generator model
def build_generator(latent_dim, sequence_length):
    model = models.Sequential([
        layers.Dense(128, activation='relu', input_dim=latent_dim),
        layers.BatchNormalization(),
        layers.Dense(256, activation='relu'),
        layers.BatchNormalization(),
        layers.Dense(512, activation='relu'),
        layers.BatchNormalization(),
        layers.Dense(sequence_length, activation='tanh')
    ])
    return model


In [3]:

# Define the discriminator model
def build_discriminator(sequence_length):
    model = models.Sequential([
        layers.Dense(512, activation='relu', input_dim=sequence_length),
        layers.Dense(256, activation='relu'),
        layers.Dense(128, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])
    return model


In [4]:

# Define the GAN model
def build_gan(generator, discriminator):
    discriminator.trainable = False
    model = models.Sequential([
        generator,
        discriminator
    ])
    return model


In [66]:
# Update the training function to include validation
def train_gan(generator, discriminator, gan, epochs, batch_size, latent_dim, val_gen):
    for epoch in range(epochs):
        # Train on batches from the training generator
        for _ in range(len(train_sequences) // batch_size):
            # Generate fake sequences
            noise = tf.random.normal(shape=(batch_size, latent_dim))
            fake_sequences = generator.predict(noise)

            # Get a batch of real sequences from the training generator
            real_sequences = next(train_gen)

            # Labels for real and fake data
            real_labels = tf.ones((batch_size, 1))
            fake_labels = tf.zeros((batch_size, 1))

            # Train the discriminator
            discriminator.trainable = True
            d_loss_real = discriminator.train_on_batch(real_sequences, real_labels)
            d_loss_fake = discriminator.train_on_batch(fake_sequences, fake_labels)
            d_loss = 0.5 * tf.add(d_loss_real, d_loss_fake)

            # Train the generator
            discriminator.trainable = False
            noise = tf.random.normal(shape=(batch_size, latent_dim))
            g_loss = gan.train_on_batch(noise, real_labels)

        # Validation
        val_real_sequences = next(val_gen)
        val_fake_sequences = generator.predict(tf.random.normal(shape=(batch_size, latent_dim)))
        val_d_loss_real = discriminator.evaluate(val_real_sequences, tf.ones((batch_size, 1)), verbose=0)
        val_d_loss_fake = discriminator.evaluate(val_fake_sequences, tf.zeros((batch_size, 1)), verbose=0)
        val_d_loss = 0.5 * np.add(val_d_loss_real, val_d_loss_fake)

        # Print the progress
        print(f"Epoch: {epoch+1}/{epochs}, D Loss: {d_loss}, G Loss: {g_loss}, Val D Loss: {val_d_loss}")


In [None]:
file_path = '/content/drive/My Drive/sequence_data.csv'

In [11]:
drivers_gpd = gpd.read_file('/content/drive/MyDrive/hawaii_soils/Analysis Data/250_summary_grid_dt.gpkg')

In [12]:
soils_csv = gpd.read_file('/content/drive/MyDrive/hawaii_soils/HI soils data/combined_soc_2024_04_05.csv')

In [25]:
from shapely.geometry import Point

soils_csv = soils_csv[(soils_csv['latitude'] != '') & (soils_csv['longitude'] != '')]
soils_csv['geometry'] = soils_csv.apply(lambda row: Point(float(row['longitude']), float(row['latitude'] )), axis=1)
soils_gpd = gpd.GeoDataFrame(soils_csv, geometry='geometry', crs="EPSG:4326")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [27]:
# Ensure both GeoDataFrames have the same CRS
soils_gpd = soils_gpd.to_crs(drivers_gpd.crs)

# Perform spatial join
matched_data = gpd.sjoin_nearest(soils_gpd, drivers_gpd, how='left', distance_col='distance')

In [40]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

id_fields = matched_data[['source_dataset', 'island', 'soil_column_id', 'unique_id', 'depth_top', 'depth_bottom', 'depth_adj_bottom', 'latitude', 'longitude']]

# Select only numeric columns
numeric_cols = matched_data.select_dtypes(include=[np.number])

# Initialize the scaler
scaler = MinMaxScaler ()

# Fit the scaler on the numeric columns
scaler.fit(numeric_cols)

# Transform the numeric columns
scaled_numeric_cols = scaler.transform(numeric_cols)

# Convert the scaled numeric columns back to a DataFrame
scaled_numeric_df = pd.DataFrame(scaled_numeric_cols, columns=numeric_cols.columns, index=numeric_cols.index)

# Concatenate the ID fields back with the numeric columns
numeric_df = pd.concat([id_fields, scaled_numeric_df], axis=1)

In [42]:
# numeric_df

In [56]:

# Set the dimensions and compile the models
latent_dim = 100
sequence_length = 38  # Adjust based on your sequence length

generator = build_generator(latent_dim, sequence_length)
discriminator = build_discriminator(sequence_length)
gan = build_gan(generator, discriminator)

# Compile the discriminator
discriminator.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Compile the GAN
gan.compile(optimizer='adam', loss='binary_crossentropy')

# # Assuming 'final_df' is your scaled dataset with numeric columns and ID fields
# # Extract only the numeric columns for the GAN
# numeric_columns = [col for col in scaled_numeric_cols.columns if scaled_numeric_cols[col].dtype in [np.float32, np.float64]]
# real_sequences_df = scaled_numeric_cols[numeric_columns]

# Convert the DataFrame to a NumPy array
real_sequences_array = scaled_numeric_cols

def real_sequence_generator(data, batch_size):
    while True:
        # Shuffle the data at the beginning of each epoch
        np.random.shuffle(data)
        for i in range(0, len(data), batch_size):
            batch = data[i:i + batch_size]
            # If the batch is smaller than the batch size, pad it with samples from the beginning
            if len(batch) < batch_size:
                padding = data[:(batch_size - len(batch))]
                batch = np.concatenate([batch, padding], axis=0)
            yield batch


# Create an instance of the generator
# real_sequence_gen = real_sequence_generator(real_sequences_array, batch_size)


In [68]:
# Split the data into training and validation sets
from sklearn.model_selection import train_test_split

train_sequences, val_sequences = train_test_split(real_sequences_array, test_size=0.2, random_state=42)

test_sequences, val_sequences = train_test_split(val_sequences, test_size=0.5, random_state=42)

batch_size = 128  # Set the batch size


# Define the training and validation generators
train_gen = real_sequence_generator(train_sequences, batch_size)
test_gen = real_sequence_generator(test_sequences, batch_size)
val_gen = real_sequence_generator(val_sequences, batch_size)


In [92]:
next(val_gen).shape

(128, 38)

In [98]:
noise = tf.random.normal(shape=(batch_size, latent_dim))
synthetic_samples = generator.predict(noise)




In [96]:

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

def validate_gan(generator, val_gen, key_columns, non_key_columns, latent_dim, num_samples=1000, val_steps=10):
    # Map column names to indices
    col_indices = {col: i for i, col in enumerate(key_columns + non_key_columns)}

    rmse_values = []
    for _ in range(val_steps):
        real_samples = next(val_gen)
        for real_sample in real_samples:
            # Generate synthetic samples
            noise = tf.random.normal(shape=(batch_size, latent_dim))
            synthetic_samples = generator.predict(noise)


            # Calculate similarities based on key columns and select the top 1%
            similarities = []
            for synthetic_sample in synthetic_samples:
                similarity = sum([abs(synthetic_sample[col_indices[key_col]] - real_sample[col_indices[key_col]]) / real_sample[col_indices[key_col]] for key_col in key_columns])
                similarities.append(similarity)
            top_1_percent_indices = np.argsort(similarities)[:num_samples // 100]
            filtered_samples = synthetic_samples[top_1_percent_indices]

            # Calculate RMSE for non-key columns
            if len(filtered_samples) > 0:
                filtered_samples_df = pd.DataFrame(filtered_samples, columns=non_key_columns)
                avg_filtered_values = filtered_samples_df.mean()
                real_values = [real_sample[col_indices[col]] for col in non_key_columns]

                rmse = np.sqrt(mean_squared_error(real_values, avg_filtered_values))
                rmse_values.append(rmse)

    # Print the average RMSE
    if rmse_values:
        print("Average RMSE:", np.mean(rmse_values))
    else:
        print("No filtered samples found for RMSE calculation.")

# Example usage
# validate_gan(generator, val_gen, key_columns=['elevation','landform','SRTM_mTPI'], non_key_columns=['agbd_m'], latent_dim=latent_dim, num_samples=1000, val_steps=10)





In [136]:
noise = tf.random.normal(shape=(32, latent_dim))
synthetic_samples = generator.predict(noise)
print(synthetic_samples[0])
print(synthetic_samples[1])

[-0.9999994   0.99999946  0.5901518   0.9247776   0.01472434 -0.65572166
  0.532575   -0.99999696  0.6609164   0.99999803  0.9999981   0.7058845
  0.99999964  0.84487134  0.99999946  0.99999964  0.99999964 -0.9999998
  0.9999995   0.99999964  0.46403897 -0.8549045  -0.9999995  -0.9999997
  0.9999963   0.9999989  -0.9999994  -0.8813151  -0.9999997  -0.9999997
  0.99999964  0.9994447  -0.9999993   0.708036   -0.99999964  0.99999875
 -0.99999946 -0.9999998 ]
[-0.99999964  0.9999997   0.5895628   0.9254385   0.0135832  -0.6536944
  0.53159255 -0.9999973   0.6586854   0.99999785  0.99999833  0.70467937
  0.9999995   0.84414834  0.9999996   0.9999998   0.9999995  -0.99999964
  0.9999995   0.9999996   0.46180367 -0.85383993 -0.9999995  -0.99999964
  0.999996    0.9999988  -0.9999993  -0.88128304 -0.9999997  -0.99999964
  0.99999946  0.9993853  -0.99999917  0.70958424 -0.9999996   0.9999987
 -0.9999995  -0.9999997 ]


In [128]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

def validate_gan(generator, val_gen, key_columns, non_key_columns, latent_dim, num_samples=1024, val_steps=10):
    # Map column names to indices
    col_indices = {col: i for i, col in enumerate(key_columns + non_key_columns)}
    print(col_indices)
    rmse_values = []
    for _ in range(val_steps):
        real_samples = next(val_gen)
        for real_sample in real_samples:
            # Generate synthetic samples
            noise = tf.random.normal(shape=(batch_size, latent_dim))
            synthetic_samples = generator.predict(noise)

            # Calculate similarities based on key columns and select the top 1%
            similarities = []
            for synthetic_sample in synthetic_samples:
                similarity = sum([abs(synthetic_sample[col_indices[key_col]] - real_sample[col_indices[key_col]]) / real_sample[col_indices[key_col]] for key_col in key_columns])
                similarities.append(similarity)
            top_1_percent_indices = np.argsort(similarities)[:num_samples // 100]
            filtered_samples = synthetic_samples[top_1_percent_indices]
            filtered_top_1_percent_indices = [similarities[index_id] for index_id in top_1_percent_indices]

            print(filtered_top_1_percent_indices)
            # Calculate RMSE for non-key columns
            if len(filtered_samples) > 0:
                filtered_samples_df = pd.DataFrame(filtered_samples, columns=non_key_columns)
                avg_filtered_values = filtered_samples_df.mean()
                real_values = [real_sample[col_indices[col]] for col in non_key_columns]

                rmse = np.sqrt(mean_squared_error(real_values, avg_filtered_values))
                rmse_values.append(rmse)

    # Print the average RMSE
    if rmse_values:
        print("Average RMSE:", np.mean(rmse_values))
    else:
        print("No filtered samples found for RMSE calculation.")

# Example usage
# validate_gan(generator, val_gen, key_columns=['elevation', 'landform', 'SRTM_mTPI'], non_key_columns=['agbd_m'], latent_dim=latent_dim, num_samples=1000, val_steps=10)


In [129]:
validate_gan(generator, val_gen, ['elevation','landform','SRTM_mTPI'], ['agbd_m'], latent_dim, num_samples=1024, val_steps=10)


{'elevation': 0, 'landform': 1, 'SRTM_mTPI': 2, 'agbd_m': 3}
[4.270690700242639, 4.271294311095827, 4.275749682025146, 4.275782262608924, 4.277142713923719, 4.278046058478687, 4.2804237905496265, 4.281409692646718, 4.282127291437285, 4.282394084982934]


ValueError: Shape of passed values is (10, 38), indices imply (10, 1)

In [None]:
# Train the GAN with validation
train_gan(generator, discriminator, gan, epochs=10000, batch_size=128, latent_dim=latent_dim, val_gen=val_gen)

Epoch: 1/10000, D Loss: [nan  0.], G Loss: 0.31083250045776367, Val D Loss: [nan  0.]
Epoch: 2/10000, D Loss: [nan  0.], G Loss: 0.31078723073005676, Val D Loss: [       nan 0.00390625]
Epoch: 3/10000, D Loss: [nan  0.], G Loss: 0.3108453154563904, Val D Loss: [       nan 0.00390625]
Epoch: 4/10000, D Loss: [nan  0.], G Loss: 0.3108657896518707, Val D Loss: [nan  0.]
Epoch: 5/10000, D Loss: [nan  0.], G Loss: 0.3108525276184082, Val D Loss: [nan  0.]
Epoch: 6/10000, D Loss: [nan  0.], G Loss: 0.310832679271698, Val D Loss: [       nan 0.00390625]
Epoch: 7/10000, D Loss: [nan  0.], G Loss: 0.3107938766479492, Val D Loss: [nan  0.]
Epoch: 8/10000, D Loss: [       nan 0.00390625], G Loss: 0.3107747435569763, Val D Loss: [       nan 0.00390625]
Epoch: 9/10000, D Loss: [nan  0.], G Loss: 0.31078988313674927, Val D Loss: [nan  0.]
Epoch: 10/10000, D Loss: [nan  0.], G Loss: 0.31084176898002625, Val D Loss: [       nan 0.00390625]


In [75]:
column_names = scaled_numeric_df.columns.tolist()
column_names

['index_right',
 'id',
 'left',
 'top',
 'right',
 'bottom',
 'water',
 'trees',
 'grass',
 'flooded_vegetation',
 'crops',
 'shrub_and_scrub',
 'built',
 'bare',
 'snow_and_ice',
 'max',
 'elevation',
 'landform',
 'SRTM_mTPI',
 'aet',
 'def',
 'pdsi',
 'pet',
 'pr',
 'ro',
 'soil',
 'srad',
 'swe',
 'tmmn',
 'tmmx',
 'vap',
 'vpd',
 'vs',
 'agbd_m',
 'agbd_sd',
 'agbd_n',
 'age_years',
 'distance']

In [73]:
val_gen

<generator object real_sequence_generator at 0x791c4b5a7c30>

In [82]:
tf.keras.backend.clear_session()


In [112]:
# Example usage
validate_gan(generator, val_gen, ['elevation','landform','SRTM_mTPI'], ['agbd_m'], latent_dim, num_samples=1024, val_steps=10)
# validate_gan(generator, scaled_numeric_df, key_columns=['elevation','landform','SRTM_mTPI'], non_key_columns=['agbd_m'], latent_dim=latent_dim)




ValueError: Shape of passed values is (10, 38), indices imply (10, 1)