In [5]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.decomposition import PCA
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.ensemble import VotingClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Turn one raw text record into a flat list of floats
def parse_line(line):
    """Parse a ``label;concentration idx:value idx:value ...`` record.

    The first whitespace-separated token is split on ``;`` into the label and
    the concentration. Every remaining token is split on ``:`` and only the
    part after the first colon is kept.

    Returns:
        A list ``[label, concentration, value_1, value_2, ...]`` of floats.
    """
    tokens = line.strip().split()
    label, concentration = (float(field) for field in tokens[0].split(';'))
    row = [label, concentration]
    for token in tokens[1:]:
        # The index before the colon is discarded; order in the line is kept
        row.append(float(token.split(':')[1]))
    return row

# Data loading and preprocessing
data_directory = '/kaggle/input/gas-sensor-array-drift-dataset'
# os.listdir returns names in arbitrary order; sorting keeps the row order, and
# therefore the seeded train/test split below, identical from run to run.
data_files = sorted(f for f in os.listdir(data_directory) if f.endswith('.dat'))
data = []

for file_name in data_files:
    file_path = os.path.join(data_directory, file_name)
    with open(file_path, 'r') as file:
        for line in file:
            # Blank lines (e.g. a trailing empty line) have no label token and
            # would make parse_line fail, so they are skipped.
            if line.strip():
                data.append(parse_line(line))

# Convert data to DataFrame: label, concentration, then 128 feature columns
column_names = ['label', 'concentration'] + [f'feature_{i}' for i in range(1, 129)]
data_frame = pd.DataFrame(data, columns=column_names)

# Splitting the dataset into training and testing sets
X = data_frame.iloc[:, 2:]  # Features (everything after label and concentration)
# Labels are parsed as floats; cast once so every downstream consumer
# (classifier, reports, confusion matrix) sees integer class labels.
y = data_frame.iloc[:, 0].astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=0)

# Feature scaling: statistics are learned on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert labels to one-hot encoding for the neural network.
# The encoding maps label k to column k - 1.
# NOTE(review): assumes labels are the contiguous integers 1..num_classes - confirm.
num_classes = y.nunique()
# Passing num_classes fixes the width of both encodings, so they match each
# other and the network's output layer even if a split lacks the highest label.
y_train_encoded = to_categorical(y_train - 1, num_classes=num_classes)
y_test_encoded = to_categorical(y_test - 1, num_classes=num_classes)

# Assemble the feed-forward classifier one layer at a time
nn_model = Sequential()
nn_model.add(Input(shape=(X_train.shape[1],)))
# Three hidden stages of decreasing width; each is followed by batch
# normalisation and 50% dropout
for hidden_units in (128, 64, 32):
    nn_model.add(Dense(hidden_units, activation='relu'))
    nn_model.add(BatchNormalization())
    nn_model.add(Dropout(0.5))
# Softmax output with one unit per class
nn_model.add(Dense(num_classes, activation='softmax'))

# Configure optimisation: Adam on categorical cross-entropy, tracking accuracy
nn_model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit the network on the scaled training data; 20% of it is held out for validation
nn_model.fit(
    x=X_train_scaled,
    y=y_train_encoded,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
)

# Build and fit the gradient boosting classifier on the same scaled features,
# using the original (not one-hot) labels
gb_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=0,
)
gb_model.fit(X=X_train_scaled, y=y_train)

# Combine both models into a two-member ensemble
class HybridModel:
    """Ensemble of a softmax neural network and a gradient boosting classifier.

    Both members predict a class for every sample. Where they agree, that class
    is returned. Where they disagree, the prediction of the member that is more
    confident (higher maximum class probability) is returned.

    A plain majority vote cannot separate two voters: counting the votes with
    ``np.bincount`` and taking ``argmax`` always resolves a disagreement to the
    smaller label, and ``np.bincount`` raises ``TypeError`` when a label is a
    float.
    """

    def __init__(self, nn_model, gb_model):
        """Store the two fitted models.

        Args:
            nn_model: Object whose ``predict(X)`` returns one row of class
                probabilities per sample; column ``i`` stands for label ``i + 1``.
            gb_model: Object exposing ``predict(X)`` (class labels) and
                ``predict_proba(X)`` (one row of class probabilities per sample).
        """
        self.nn_model = nn_model
        self.gb_model = gb_model

    def predict(self, X):
        """Return one integer class label per row of ``X``.

        Args:
            X: Feature matrix accepted by both underlying models.

        Returns:
            1-D integer numpy array of predicted labels.
        """
        # Neural network: most probable column, shifted to 1-based labels
        nn_proba = np.asarray(self.nn_model.predict(X))
        nn_pred = nn_proba.argmax(axis=1) + 1
        nn_conf = nn_proba.max(axis=1)

        # Gradient boosting: labels may come back as floats, so cast them to int
        gb_pred = np.asarray(self.gb_model.predict(X)).astype(int)
        gb_conf = np.asarray(self.gb_model.predict_proba(X)).max(axis=1)

        # When both agree either branch yields the same label; on disagreement
        # the more confident member wins, with exact ties going to gradient boosting.
        return np.where(nn_conf > gb_conf, nn_pred, gb_pred)

# Wrap the two fitted models in the hybrid ensemble
hybrid_model = HybridModel(nn_model=nn_model, gb_model=gb_model)

# Predict labels for the scaled test features
y_pred = hybrid_model.predict(X=X_test_scaled)

# Report per-class precision, recall and F1 against the true test labels
print(classification_report(y_true=y_test, y_pred=y_pred))

# Project the scaled test features onto two principal components.
# Note that the PCA is fitted on the test split itself.
pca = PCA(n_components=2)
X_test_pca = pca.fit_transform(X=X_test_scaled)

# Scatter-plot a 2-D projection and write the figure to disk
def plot_pca(X_pca, y, y_pred, title, filename):
    """Save a scatter plot of the first two columns of ``X_pca``.

    Args:
        X_pca: 2-D array; column 0 is drawn on the x axis, column 1 on the y axis.
        y: Per-point values that drive the marker colour and the colorbar.
        y_pred: Accepted for interface compatibility; not used by the body.
        title: Figure title.
        filename: Path the figure is saved to.
    """
    plt.figure(figsize=(12, 10))

    points = plt.scatter(
        X_pca[:, 0],
        X_pca[:, 1],
        c=y,
        cmap='viridis',
        alpha=0.7,
    )
    plt.colorbar(points, label='Batch')

    plt.xlabel('PCA1')
    plt.ylabel('PCA2')
    plt.title(title)

    plt.tight_layout()
    plt.savefig(filename)
    # Close the figure so repeated calls do not accumulate open figures
    plt.close()

# PCA projection coloured by the true test labels
plot_pca(
    X_test_pca,
    y_test,
    None,
    title='PCA of Original Test Data (All Batches)',
    filename='pca_original_all_batches.png',
)

# Same projection coloured by the hybrid model's predictions
plot_pca(
    X_test_pca,
    y_pred,
    None,
    title='PCA of Predicted Test Data (All Batches)',
    filename='pca_predicted_all_batches.png',
)

# Cross-tabulate actual against predicted labels and save the counts as a heatmap
conf_matrix = pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=['Actual'],
    colnames=['Predicted'],
)
plt.figure(figsize=(12, 10))
sns.heatmap(conf_matrix, cmap='Blues', annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.savefig('confusion_matrix.png')
plt.close()

print("Figures have been saved: pca_original_all_batches.png, pca_predicted_all_batches.png, and confusion_matrix.png")


2024-07-15 07:19:46.302366: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-15 07:19:46.302491: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-15 07:19:46.448697: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Epoch 1/20
[1m 83/331[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 2ms/step - accuracy: 0.2577 - loss: 2.2518

I0000 00:00:1721028006.357181     100 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m331/331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 20ms/step - accuracy: 0.4220 - loss: 1.6749 - val_accuracy: 0.8543 - val_loss: 0.5188
Epoch 2/20
[1m331/331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7369 - loss: 0.7324 - val_accuracy: 0.9047 - val_loss: 0.3206
Epoch 3/20
[1m331/331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8151 - loss: 0.5438 - val_accuracy: 0.9618 - val_loss: 0.2124
Epoch 4/20
[1m331/331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8568 - loss: 0.4398 - val_accuracy: 0.9610 - val_loss: 0.1729
Epoch 5/20
[1m331/331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8848 - loss: 0.3640 - val_accuracy: 0.9735 - val_loss: 0.1352
Epoch 6/20
[1m331/331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8989 - loss: 0.3192 - val_accuracy: 0.9686 - val_loss: 0.1401
Epoch 7/20
[1m331/331[0m [32m━━━━━