# TOPICS
-

# Step 1: Install dependencies and import libraries

In [8]:
# Install dependencies
!pip install tensorflow
import pandas as pd
import numpy as np
import nltk
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
import matplotlib.pyplot as plt

# Download punkt tokenizer for text cleaning
nltk.download('punkt')
nltk.download('punkt_tab')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# Step 2: Load the dataset and clean the data

In [25]:
# Load the dataset
df = pd.read_csv('/content/ROTC_DATASET.csv')

# Data Cleaning function
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = nltk.word_tokenize(text)  # Tokenize words
    text = [word for word in text if word.isalnum()]  # Remove non-alphanumeric tokens
    return ' '.join(text)

# Combine 'Pattern' and 'Attribute And Correlation' for input
df['Pattern'] = df['Pattern']

# Clean the 'Pattern' and 'Response' columns
df['Pattern'] = df['Pattern'].apply(clean_text)
df['Response'] = df['Response'].apply(clean_text)
df['Attribute'] = df['Attribute'].apply(clean_text)
df['Correlation'] = df['Correlation'].apply(clean_text)

# Remove fully duplicate rows
df = df.drop_duplicates()

# Handle duplicates in the 'Pattern' column by keeping the first occurrence
df = df.drop_duplicates(subset='Pattern', keep='first')

# Create a new column with formatted responses
df['Formatted_Response'] = df['Response'] + " | Attributes - " + df['Attribute'] + " | Correlation - " + df['Correlation']

# Save the cleaned dataset to a new file
df.to_csv('/content/ROTC_CLEANED_DATASET.csv', index=False)


# Step 3: Tokenize and pad the input data

In [14]:
# Tokenize the patterns
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Pattern'])
X = tokenizer.texts_to_sequences(df['Pattern'])

# Padding the sequences to ensure uniform input length
X_pad = pad_sequences(X, padding='post')

# Encode the responses as integers
y = np.array([i for i in range(len(df['Formatted_Response']))])

# Show tokenized input and padded sequences
X_pad[:5], y[:5]



(array([[  1,  11,   2, 152,   6,  10, 246,   7, 102,   0,   0,   0,   0,
           0,   0,   0],
        [  1,   3,   2,  57,   6,   7, 117,   0,   0,   0,   0,   0,   0,
           0,   0,   0],
        [  1,  11,   2, 424, 129,   4,   2,   7,   0,   0,   0,   0,   0,
           0,   0,   0],
        [ 19,  11, 129,  30,   4,   2,   7,   0,   0,   0,   0,   0,   0,
           0,   0,   0],
        [  1,   3,   7,  22,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0]], dtype=int32),
 array([0, 1, 2, 3, 4]))

# Step 4: Split the data into training and validation sets

In [15]:
# Train-Test Split
X_train, X_val, y_train, y_val = train_test_split(X_pad, y, test_size=0.2, random_state=42)

# Step 5: Build the LSTM model

In [20]:
# Check if a checkpoint exists and load the model if so
checkpoint_path = '/content/ROTC_chatbot_model_checkpoint.keras'
if os.path.exists(checkpoint_path):
    print("Checkpoint found! Loading the model...")
    model = load_model(checkpoint_path)  # Load the model from checkpoint
else:
    print("No checkpoint found. Creating a new model.")
    model = Sequential()
    model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=32, input_length=X_pad.shape[1]))
    model.add(LSTM(64, return_sequences=False))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(len(df['Formatted_Response']), activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Compile the model
model.summary()

No checkpoint found. Creating a new model.


# Step 6: Set up early stopping and model checkpoint callbacks

In [31]:
# Callbacks for early stopping and checkpoints
checkpoint_callback = ModelCheckpoint(
    checkpoint_path, save_best_only=True, monitor='loss', verbose=1
)

# Custom callback to stop training when accuracy reaches 0.95
class AccuracyThresholdCallback(Callback):
    def __init__(self, accuracy_threshold=0.98):
        super(AccuracyThresholdCallback, self).__init__()
        self.accuracy_threshold = accuracy_threshold

    def on_epoch_end(self, epoch, logs=None):
        accuracy = logs.get('accuracy')
        if accuracy is not None and accuracy >= self.accuracy_threshold:
            print(f"\nEpoch {epoch + 1}: Accuracy has reached {self.accuracy_threshold}, stopping training.")
            self.model.stop_training = True

# Initialize the callback
accuracy_threshold_callback = AccuracyThresholdCallback(accuracy_threshold=0.98)



# Step 7: Train the model

In [26]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100, batch_size=32, verbose=1,
    callbacks=[checkpoint_callback, accuracy_threshold_callback]
)


Epoch 1/100
[1m44/45[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 22ms/step - accuracy: 0.9318 - loss: 0.3412
Epoch 1: loss did not improve from 0.39280
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - accuracy: 0.9301 - loss: 0.3439 - val_accuracy: 0.0000e+00 - val_loss: 61.7286
Epoch 2/100
[1m42/45[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 13ms/step - accuracy: 0.8981 - loss: 0.3725
Epoch 2: loss did not improve from 0.39280
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.8947 - loss: 0.3794 - val_accuracy: 0.0000e+00 - val_loss: 61.9257
Epoch 3/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.8725 - loss: 0.4432
Epoch 3: loss did not improve from 0.39280
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.8718 - loss: 0.4445 - val_accuracy: 0.0000e+00 - val_loss: 61.8383
Epoch 4/100
[1m44/45[0m [32m━━━━━━━━━━━━━━━

# Step 8: Save the trained model

In [27]:
# Save the final trained model
model.save('/content/ROTC_chatbot_model_final.keras')

# Step 9: Error analysis on misclassified data

In [28]:
# Step 9: Error analysis on misclassified data

# Predict classes for validation data
y_pred = np.argmax(model.predict(X_val), axis=1)

# Get the indices of the validation data (use range based on the number of rows in X_val)
val_indices = range(len(X_val))

# Create a DataFrame to compare the true and predicted labels
misclassified_data = pd.DataFrame({
    'True Label': y_val,
    'Predicted Label': y_pred,
    'Text': df['Pattern'].iloc[val_indices].values  # Access the corresponding text based on indices
})

# Find rows where the model's prediction is incorrect
misclassified_data = misclassified_data[misclassified_data['True Label'] != misclassified_data['Predicted Label']]

# Display some of the misclassified data
print("Misclassified Data Sample:")
print(misclassified_data.head(10))  # Display the first 10 misclassified examples


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Misclassified Data Sample:
   True Label  Predicted Label  \
0        1600              388   
1         936             1762   
2         483              951   
3        1352             1339   
4        1252              699   
5         544              238   
6        1235             1516   
7        1521              568   
8         631             1698   
9        1176             1279   

                                                Text  
0   what are the qualities of a good military leader  
1             what is the purpose of military drills  
2           what are the core values in the military  
3           why are values important in the military  
4                        what is military discipline  
5               why is military discipline important  
6      why should rotc cadets learn military customs  
7             how does leadership impact rotc cadets  
8           what are military

# Step 10: Adjust model weights and retrain

In [30]:
# Step 10: Adjustments: Calculate Class Weights and Retrain

# Calculate class weights to handle class imbalance
unique_classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=unique_classes, y=y_train)
class_weights_dict = {i: weight for i, weight in enumerate(class_weights)}

# Retrain the model with class weights
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100, batch_size=32, verbose=1,
    callbacks=[checkpoint_callback, accuracy_threshold_callback],
    class_weight=class_weights_dict  # Apply class weights
)

# Save the adjusted model
model.save('/content/ROTC_chatbot_model_adjusted.keras')


Epoch 1/100
[1m43/45[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 22ms/step - accuracy: 0.9645 - loss: 0.2479
Epoch 1: loss improved from inf to 0.26808, saving model to /content/ROTC_chatbot_model_checkpoint.keras
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 27ms/step - accuracy: 0.9635 - loss: 0.2492 - val_accuracy: 0.0000e+00 - val_loss: 60.4911
Epoch 2/100
[1m43/45[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 14ms/step - accuracy: 0.9635 - loss: 0.2230
Epoch 2: loss did not improve from 0.26808
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.9620 - loss: 0.2262 - val_accuracy: 0.0000e+00 - val_loss: 60.6624
Epoch 3/100
[1m44/45[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - accuracy: 0.9700 - loss: 0.2221
Epoch 3: loss did not improve from 0.26808
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.9690 - loss: 0.2241 - val_accuracy: 0.0000e+00 - va

# Step 11: Function to predict responses


In [32]:
# Function to predict responses
def get_response(text):
    cleaned_text = clean_text(text)  # Ensure the text is preprocessed
    seq = tokenizer.texts_to_sequences([cleaned_text])
    padded = pad_sequences(seq, maxlen=X_pad.shape[1], padding='post')
    pred = model.predict(padded)
    response_idx = np.argmax(pred)
    # Extract formatted response (includes Attribute and Correlation)
    formatted_response = df['Formatted_Response'].iloc[response_idx]
    # Split into main response, attributes, and correlation
    main_response = formatted_response.split(" | ")[0]  # Main response (before 'Attributes')
    attributes = formatted_response.split(" | ")[1].replace("Attributes - ", "")  # Extract attributes
    correlation = formatted_response.split(" | ")[2].replace("Correlation - ", "")  # Extract correlation
    return f"Chatbot: {main_response}\nAttributes: {attributes}\nCorrelation: {correlation}"



# Step 13: Chatbot loop for testing

In [48]:
# Chatbot Loop
print("HELLO! ASK ME ABOUT ROTC KNOWLEDGE. TYPE 'EXIT' TO END THE CONVERSATION.")
while True:
    user_input = input("YOU: ")
    if user_input.lower() == 'exit':
        print("GOODBYE!")
        break
    response = get_response(user_input)
    print(response.upper())  # Convert response to uppercase


HELLO! ASK ME ABOUT ROTC KNOWLEDGE. TYPE 'EXIT' TO END THE CONVERSATION.
YOU: how did the philippine army contribute to history
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
CHATBOT: THE PHILIPPINE ARMY PLAYED A CRUCIAL ROLE IN DEFENDING THE NATION
ATTRIBUTES: PHILIPPINE ARMY
CORRELATION: FOUGHT IN MAJOR WARS AND BATTLES FOR THE COUNTRY S SOVEREIGNTY
YOU: what role does the philippine navy play in history
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
CHATBOT: THE NAVY PROTECTS THE COUNTRY WATERS AND MAINTAINS NATIONAL SECURITY
ATTRIBUTES: PHILIPPINE NAVY
CORRELATION: VITAL FOR MARITIME DEFENSE AND TERRITORIAL INTEGRITY
YOU: what is a combat helmet designed for
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
CHATBOT: A COMBAT HELMET PROTECTS THE HEAD FROM SHRAPNEL BULLETS AND BLUNT FORCE IMPACTS
ATTRIBUTES: WEAPONS AND EQUIPMENT
CORRELATION: HELMETS ARE ESSENTIAL FOR MINIMIZING HEAD INJURIES IN COMBAT SITUATIONS