<a href="https://colab.research.google.com/github/hayleypc/HawaiiClimate/blob/main/soil_gan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
from tensorflow.keras import layers, models

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import geopandas as gpd

In [4]:
# Define the generator model
def build_generator(latent_dim, sequence_length):
    model = models.Sequential([
        layers.Dense(256, activation='relu', input_dim=latent_dim),
        layers.BatchNormalization(),
        layers.LeakyReLU(alpha=0.2),
        layers.Dense(512, activation='relu'),
        layers.BatchNormalization(),
        layers.LeakyReLU(alpha=0.2),
        layers.Dense(1024, activation='relu'),
        layers.BatchNormalization(),
        layers.LeakyReLU(alpha=0.2),
        layers.Dense(sequence_length, activation='tanh')
    ])
    return model

In [5]:
# Define the discriminator model
def build_discriminator(sequence_length):
    model = models.Sequential([
        layers.Dense(1024, activation='relu', input_dim=sequence_length),
        layers.Dropout(0.3),
        layers.Dense(512, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(256, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(128, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])
    return model

In [6]:

# Define the GAN model
def build_gan(generator, discriminator):
    discriminator.trainable = False
    model = models.Sequential([
        generator,
        discriminator
    ])
    return model


In [37]:
# Update the training function to include validation
def train_gan(generator, discriminator, gan, epochs, batch_size, latent_dim, val_gen,keep_cols = [0,1,2,3]):
    for epoch in range(epochs):
        # Train on batches from the training generator
        for _ in range(len(train_sequences) // batch_size):
            # Generate fake sequences
            noise = tf.random.normal(shape=(batch_size, latent_dim))
            fake_sequences = generator.predict(noise)

            # Get a batch of real sequences from the training generator
            real_sequences = next(train_gen)

            # Labels for real and fake data
            real_labels = tf.ones((batch_size, 1))
            fake_labels = tf.zeros((batch_size, 1))

            # Train the discriminator
            discriminator.trainable = True
            d_loss_real = discriminator.train_on_batch(real_sequences, real_labels)
            d_loss_fake = discriminator.train_on_batch(fake_sequences, fake_labels)
            d_loss = 0.5 * tf.add(d_loss_real, d_loss_fake)

            # Train the generator
            discriminator.trainable = False
            noise = tf.random.normal(shape=(batch_size, latent_dim))
            g_loss = gan.train_on_batch(noise, real_labels)

        # Validation
        val_real_sequences = next(val_gen)
        val_fake_sequences = generator.predict(tf.random.normal(shape=(batch_size, latent_dim)))
        val_d_loss_real = discriminator.evaluate(val_real_sequences, tf.ones((batch_size, 1)), verbose=0)
        val_d_loss_fake = discriminator.evaluate(val_fake_sequences, tf.zeros((batch_size, 1)), verbose=0)
        val_d_loss = 0.5 * np.add(val_d_loss_real, val_d_loss_fake)

        # Print the progress
        print(f"Epoch: {epoch+1}/{epochs}, D Loss: {d_loss}, G Loss: {g_loss}, Val D Loss: {val_d_loss}")


        if epoch % 100 == 0:
          # data = validate_generator()
          data = validate_generator(keep_cols = keep_cols)
          x_values = [ datum[0] for datum in data]
          y_values = [ datum[1] for datum in data]

          # Create a scatter plot
          plt.scatter(x_values, y_values, color='blue')

          # Add labels and title
          plt.xlabel('Sample Index')
          plt.ylabel('Difference between Real and Generated')
          plt.title('Scatter Plot of Validation Differences')

          # Show the plot
          plt.show()


In [46]:
def validate_generator(n_lowest = 10, n_samp = 256,keep_cols = [0,1,2,3]):
  # n_lowest = 10
  result_vec = []

  batch_val = next(val_gen)
  for batch_ in batch_val:
    noise = tf.random.normal(shape=(n_samp, latent_dim))
    synthetic_samples = generator.predict(noise)
    # batch_ = batch_val[1]
    match_indices = [i for i in range(len(keep_cols)-1)]
    # keep_cols[:(len(keep_cols)-1)]

    data = [np.mean(np.sqrt((batch_[match_indices] - samp[match_indices])**2)) for samp in synthetic_samples]

    indices = sorted(range(len(data)), key=lambda i: data[i])[:n_lowest]

    lowest_data = [data[i] for i in indices]

    most_similar = [synthetic_samples[i] for i in indices]

    most_similar_scaled_df = pd.DataFrame(scaler.inverse_transform(pd.DataFrame(most_similar, columns=numeric_cols.columns)), columns=numeric_cols.columns)

    original_scaled_df = pd.DataFrame(scaler.inverse_transform(pd.DataFrame(batch_.reshape(1,-1), columns=numeric_cols.columns)), columns=numeric_cols.columns)

    error_vec = [np.mean(original_scaled_df["imp_c_float"]) ,np.mean(most_similar_scaled_df["imp_c_float"]) ]

    # # most_similar_scaled_df = scaler.inverse_transform( most_similar_df)

    # original_scaled = [scaler.inverse_transform(sim.reshape(1,-1)) for sim in [batch_]]

    # most_similar_scaled_2d = [np.squeeze(sim) for sim in most_similar_scaled]

    # original_scaled_2d = [np.squeeze(sim) for sim in original_scaled]

    # most_similar_scaled_df = pd.DataFrame(most_similar_scaled_2d, columns=numeric_cols.columns)

    # original_scaled_df = pd.DataFrame(original_scaled_2d, columns=numeric_cols.columns)

    # result = [np.mean(most_similar_scaled_df['imp_c_float']), np.mean(original_scaled_df['imp_c_float'])]
    # print(np.mean(original_scaled_df['agbd_m']))
    result_vec.append(error_vec)
  return(result_vec)

In [9]:
file_path = '/content/drive/My Drive/sequence_data.csv'

In [10]:
drivers_gpd = gpd.read_file('/content/drive/MyDrive/hawaii_soils/Analysis Data/250_summary_grid_dt.gpkg')

In [11]:
soils_csv = gpd.read_file('/content/drive/MyDrive/hawaii_soils/HI soils data/combined_soc_2024_04_05.csv')

In [12]:
from shapely.geometry import Point

soils_csv = soils_csv[(soils_csv['latitude'] != '') & (soils_csv['longitude'] != '')]
soils_csv['geometry'] = soils_csv.apply(lambda row: Point(float(row['longitude']), float(row['latitude'] )), axis=1)
soils_gpd = gpd.GeoDataFrame(soils_csv, geometry='geometry', crs="EPSG:4326")

In [13]:
# Ensure both GeoDataFrames have the same CRS
soils_gpd = soils_gpd.to_crs(drivers_gpd.crs)

# Perform spatial join
matched_data = gpd.sjoin_nearest(soils_gpd, drivers_gpd, how='left', distance_col='distance')

In [14]:
unique_rows = matched_data[matched_data["depth_adj_bottom"] == '20']
unique_rows = unique_rows.drop_duplicates(subset=['latitude', 'longitude'])
unique_rows = unique_rows[unique_rows['distance'] < 251]
matched_data = unique_rows

In [15]:
matched_data['imp_c_float'] = [float(datum) for datum in matched_data['imp_c']]

In [16]:
import numpy as np
len(matched_data.select_dtypes(include=[np.number]).columns)

39

In [17]:
matched_data.columns

Index(['source_dataset', 'island', 'soil_column_id', 'unique_id', 'depth_top',
       'depth_bottom', 'depth_adj_bottom', 'latitude', 'longitude', 'c_perc',
       'imp_c', 'bulk_density', 'imp_bd', 'hole_id', 'geometry', 'index_right',
       'id', 'left', 'top', 'right', 'bottom', 'water', 'trees', 'grass',
       'flooded_vegetation', 'crops', 'shrub_and_scrub', 'built', 'bare',
       'snow_and_ice', 'max', 'elevation', 'landform', 'SRTM_mTPI', 'aet',
       'def', 'pdsi', 'pet', 'pr', 'ro', 'soil', 'srad', 'swe', 'tmmn', 'tmmx',
       'vap', 'vpd', 'vs', 'agbd_m', 'agbd_sd', 'agbd_n', 'majorcomposition',
       'age_years', 'age_class', 'distance', 'imp_c_float'],
      dtype='object')

In [27]:
matched_data.columns[31]

'elevation'

In [18]:
matched_data.columns[48]

'agbd_m'

In [28]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
id_fields = matched_data[['source_dataset', 'island', 'soil_column_id', 'unique_id', 'depth_top', 'depth_bottom', 'depth_adj_bottom', 'latitude', 'longitude']]

keep_cols = [31,32, 33, 55]
# Select only numeric columns
numeric_cols = matched_data.iloc[:,keep_cols]
numeric_cols.replace('', np.nan, inplace=True)
numeric_cols = numeric_cols.astype(float)

# Initialize the scaler
scaler = MinMaxScaler ()

# Fit the scaler on the numeric columns
scaler.fit(numeric_cols)

# Transform the numeric columns
scaled_numeric_cols = scaler.transform(numeric_cols)

# Convert the scaled numeric columns back to a DataFrame
scaled_numeric_df = pd.DataFrame(scaled_numeric_cols, columns=numeric_cols.columns, index=numeric_cols.index)

# scaled_numeric_df = scaled_numeric_df
# Concatenate the ID fields back with the numeric columns
numeric_df = pd.concat([id_fields, scaled_numeric_df], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numeric_cols.replace('', np.nan, inplace=True)


In [29]:
len(scaled_numeric_df.columns)

4

In [53]:
from tensorflow.keras.optimizers import Adam

# Set the dimensions and compile the models
latent_dim = 100
sequence_length = len(keep_cols)  # Adjust based on your sequence length

generator = build_generator(latent_dim, sequence_length)
discriminator = build_discriminator(sequence_length)
gan = build_gan(generator, discriminator)

# Set the initial learning rate
initial_learning_rate = 0.000001

# Create optimizers for the generator and discriminator
generator_optimizer = Adam(learning_rate=initial_learning_rate, beta_1=0.5)
discriminator_optimizer = Adam(learning_rate=initial_learning_rate, beta_1=0.5)
gan_optimizer = Adam(learning_rate=initial_learning_rate, beta_1=0.5)

# Compile the discriminator
generator.compile(optimizer=generator_optimizer, loss='mse', metrics=['accuracy'])

# Compile the discriminator
discriminator.compile(optimizer=discriminator_optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Compile the GAN
gan.compile(optimizer=gan_optimizer, loss='mse')

# # Assuming 'final_df' is your scaled dataset with numeric columns and ID fields
# # Extract only the numeric columns for the GAN
# numeric_columns = [col for col in scaled_numeric_cols.columns if scaled_numeric_cols[col].dtype in [np.float32, np.float64]]
# real_sequences_df = scaled_numeric_cols[numeric_columns]

# Convert the DataFrame to a NumPy array
real_sequences_array = scaled_numeric_cols

def real_sequence_generator(data, batch_size):
    while True:
        # Shuffle the data at the beginning of each epoch
        np.random.shuffle(data)
        for i in range(0, len(data), batch_size):
            batch = data[i:i + batch_size]
            # If the batch is smaller than the batch size, pad it with samples from the beginning
            if len(batch) < batch_size:
                padding = data[:(batch_size - len(batch))]
                batch = np.concatenate([batch, padding], axis=0)
            yield batch


# Create an instance of the generator
# real_sequence_gen = real_sequence_generator(real_sequences_array, batch_size)


In [54]:
# Split the data into training and validation sets
from sklearn.model_selection import train_test_split

train_sequences, val_sequences = train_test_split(real_sequences_array, test_size=0.2, random_state=42)

test_sequences, val_sequences = train_test_split(val_sequences, test_size=0.5, random_state=42)

batch_size = 128  # Set the batch size


# Define the training and validation generators
train_gen = real_sequence_generator(train_sequences, batch_size)
test_gen = real_sequence_generator(test_sequences, batch_size)
val_gen = real_sequence_generator(val_sequences, batch_size)


In [55]:
match_indices = keep_cols[:(len(keep_cols)-1)]
match_indices

[31, 32, 33]

In [None]:
# Train the GAN with validation
import matplotlib.pyplot as plt

train_gan(generator, discriminator, gan, epochs=800, batch_size=128, latent_dim=latent_dim, val_gen=val_gen,keep_cols=keep_cols)

Epoch: 1/800, D Loss: [0.6880215  0.53515625], G Loss: 0.23880115151405334, Val D Loss: [0.68720967 0.5546875 ]
1/8 [==>...........................] - ETA: 0s

In [None]:
# data = validate_generator()
data = validate_generator(n_samp=1024*32)
x_values = [ datum[0] for datum in data]
y_values = [ datum[1] for datum in data]

# Create a scatter plot
plt.scatter(x_values, y_values, color='blue')

# Add labels and title
plt.xlabel('Sample Index')
plt.ylabel('Difference between Real and Generated')
plt.title('Scatter Plot of Validation Differences')

# Show the plot
plt.show()

In [None]:
most_similar