In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Load the raw tweet dataset and derive aligned text / label lists.
#
# Produces:
#   X_train - list of raw tweet strings
#   y_train - list of binary labels (1 = positive, 0 = negative or neutral)
# Both lists are taken from the SAME filtered DataFrame, so X_train[k] and
# y_train[k] always describe the same tweet.
df = pd.read_csv("Tweets.csv")  # Update with correct file path

# Print available columns for debugging
print(f"Available columns: {df.columns.tolist()}")

# Automatically detect sentiment column (first candidate that exists wins)
possible_sentiment_cols = ["airline_sentiment", "sentiment", "label"]
sentiment_col = next((col for col in possible_sentiment_cols if col in df.columns), None)

if sentiment_col is None:
    raise ValueError("No suitable sentiment column found in the dataset!")
print(f"Using column '{sentiment_col}' for sentiment labels.")

if "text" not in df.columns:
    raise ValueError("No 'text' column found in the dataset!")

# Drop rows without text BEFORE extracting labels; doing it afterwards would
# leave y_train longer than X_train and shift every label after the first
# dropped row.
df = df.dropna(subset=["text"]).copy()

# Neutral tweets are folded into the negative class (binary task).
sentiment_mapping = {'positive': 1, 'negative': 0, 'neutral': 0}
df[sentiment_col] = df[sentiment_col].astype(str).str.lower()
labels = df[sentiment_col].map(sentiment_mapping)

# Labels outside the mapping (including missing values, which astype(str)
# turns into "nan") map to NaN and would turn the BCE loss into NaN.
unknown_label = labels.isna()
if unknown_label.any():
    print(f"Dropping {int(unknown_label.sum())} rows with unrecognized sentiment labels.")
    df = df[~unknown_label].copy()
    labels = labels[~unknown_label]

y_train = labels.astype(int).tolist()
X_train = df["text"].tolist()

print(f"Loaded {len(X_train)} tweets successfully.")

def preprocess_text(text):
    """Normalize a tweet and split it into word tokens.

    The text is lower-cased, every character that is neither an ASCII
    letter a-z nor whitespace is deleted (not replaced by a space, so
    "it's" becomes "its"), and the remainder is split on whitespace.

    Args:
        text: The raw tweet string.

    Returns:
        A list of lower-case alphabetic tokens; empty if nothing survives.
    """
    return re.sub(r"[^a-z\s]", "", text.lower()).split()

# Tokenize every tweet once; the token lists are reused for the vocabulary
# and for the id encoding further down.
tokenized_tweets = [preprocess_text(tweet) for tweet in X_train]

# Map each distinct word to a dense integer id in [0, len(vocab)).
# The words are sorted first: iterating a set of strings yields a different
# order on every interpreter run (hash randomization), which would make the
# token ids, and anything derived from them, irreproducible.
unique_words = sorted({word for tweet in tokenized_tweets for word in tweet})
vocab = {word: idx for idx, word in enumerate(unique_words)}

# Hand-written polarity lexicon for the rule-based ("symbolic") classifier:
# each listed word contributes +1 (positive) or -1 (negative) to a tweet's
# score; words that are not listed contribute nothing.
_POSITIVE_WORDS = ("good", "great", "excellent", "happy")
_NEGATIVE_WORDS = ("bad", "terrible", "sad", "angry")

sentiment_lexicon = {
    **dict.fromkeys(_POSITIVE_WORDS, 1),
    **dict.fromkeys(_NEGATIVE_WORDS, -1),
}

def symbolic_reasoning(tokens):
    """Classify a tokenized tweet with the polarity lexicon.

    Adds up the `sentiment_lexicon` score of every token (tokens missing
    from the lexicon count as 0).

    Args:
        tokens: Iterable of word strings.

    Returns:
        1 if the summed score is strictly positive, otherwise 0. A score of
        exactly zero (e.g. no lexicon words at all) therefore yields 0.
    """
    polarity = 0
    for token in tokens:
        polarity += sentiment_lexicon.get(token, 0)
    if polarity > 0:
        return 1
    return 0

# Encode each tokenized tweet as a list of vocabulary ids. A tweet whose
# tokens were all stripped by preprocessing becomes an empty list.
X_train_tokens = []
for tweet in tokenized_tweets:
    X_train_tokens.append([vocab[word] for word in tweet if word in vocab])

# Hold out 20% of the tweets for validation; the fixed seed keeps the split
# reproducible. From here on X_train_tokens / y_train refer to the training
# portion only.
X_train_tokens, X_val_tokens, y_train, y_val = train_test_split(
    X_train_tokens,
    y_train,
    test_size=0.2,
    random_state=42,
)

class SentimentNN(nn.Module):
    """Bag-of-embeddings binary classifier.

    Token embeddings are averaged over dimension 1, passed through one
    ReLU hidden layer, and squashed to a single sigmoid output.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Width of each embedding vector.
        hidden_size: Width of the hidden layer.
    """

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.fc = nn.Linear(in_features=embed_size, out_features=hidden_size)
        self.output = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        """Return the sigmoid output for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids. The mean over dim 1 implies a
                (batch, seq_len) layout, which is what the callers in this
                file pass.

        Returns:
            Tensor with a trailing dimension of size 1 holding values in
            (0, 1); for (batch, seq_len) input the shape is (batch, 1).
        """
        pooled = self.embedding(x).mean(dim=1)
        hidden = F.relu(self.fc(pooled))
        return torch.sigmoid(self.output(hidden))

class FusionAgent:
    """Tabular Q-learning agent with two actions per state.

    Q-values live in a defaultdict, so any hashable state that has not been
    seen yet starts with the values [0, 0] (and is inserted on first
    access, including from `choose_action`).

    Args:
        alpha: Learning rate of the Q-value update.
        gamma: Discount factor applied to the best next-state value.
    """

    def __init__(self, alpha=0.1, gamma=0.9):
        self.q_table = defaultdict(lambda: np.zeros(2))
        self.alpha = alpha
        self.gamma = gamma

    def choose_action(self, state):
        """Return the greedy action (index of the largest Q-value).

        Ties resolve to the lowest index, so an unseen state yields 0.
        """
        return self.q_table[state].argmax()

    def update(self, state, action, reward, next_state):
        """Apply one Q-learning step to Q[state][action].

        Q[s][a] += alpha * (reward + gamma * max(Q[next_state]) - Q[s][a])
        """
        next_values = self.q_table[next_state]
        state_values = self.q_table[state]
        td_target = reward + self.gamma * next_values.max()
        state_values[action] += self.alpha * (td_target - state_values[action])

# Hyperparameters. The embedding table needs one row per vocabulary id.
vocab_size = len(vocab)
embed_size = 8
hidden_size = 16

# Neural classifier, its loss and its optimizer. BCELoss expects
# probabilities, which matches the sigmoid applied inside SentimentNN.
model = SentimentNN(
    vocab_size=vocab_size,
    embed_size=embed_size,
    hidden_size=hidden_size,
)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Q-learning agent that decides, per tweet, whether to use the neural or
# the symbolic prediction.
fusion_agent = FusionAgent()

# Joint training of the neural classifier and the fusion agent.
#
# For every non-empty training tweet:
#   1. the network predicts a probability (thresholded at 0.5),
#   2. the lexicon rule predicts a label from the decoded words,
#   3. the fusion agent picks which prediction to trust
#      (action 0 = neural, action 1 = symbolic) and is rewarded +1 / -1
#      depending on whether the chosen prediction matches the label,
#   4. the network takes one Adam step on the BCE loss (batch size 1).

# Reverse lookup (token id -> word), built once. vocab assigns ids with
# enumerate(), so position t in its key order is the word with id t.
# Rebuilding list(vocab.keys()) for every single token, as was done before,
# costs O(len(vocab)) per token and dominated the running time.
index_to_word = list(vocab)

model.train()
for epoch in range(10):
    total_loss = 0
    for i, tokens in enumerate(X_train_tokens):
        # Tweets with no surviving tokens cannot be embedded (mean over an
        # empty sequence), so they are skipped.
        if not tokens:
            continue
        input_tensor = torch.tensor([tokens], dtype=torch.long)
        neural_output = model(input_tensor)
        neural_prediction = 1 if neural_output.item() > 0.5 else 0
        symbolic_prediction = symbolic_reasoning([index_to_word[t] for t in tokens])

        # The agent's state is the exact token-id sequence of the tweet.
        state = tuple(tokens)
        action = fusion_agent.choose_action(state)
        final_prediction = neural_prediction if action == 0 else symbolic_prediction
        reward = 1 if final_prediction == y_train[i] else -1
        # The same state is passed as next_state, so the update bootstraps
        # from this tweet's own Q-values.
        fusion_agent.update(state, action, reward, state)

        target = torch.tensor([[y_train[i]]], dtype=torch.float)
        loss = criterion(neural_output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the summed (not averaged) loss every second epoch.
    if epoch % 2 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

print("Training complete.")

# Evaluate the fused classifier on the validation split.
#
# Collects, for every non-empty validation tweet:
#   all_predictions   - fused prediction (neural or symbolic, per the agent)
#   all_actual_labels - ground-truth label
#   all_comments      - raw text of that same tweet
all_predictions = []
all_actual_labels = []
all_comments = []

# Reverse lookup (token id -> word), built once instead of once per token.
# vocab assigns ids with enumerate(), so position t is the word with id t.
index_to_word = list(vocab)

# X_train is still the full, unsplit text list, so X_train[i] is NOT the i-th
# validation tweet. Re-running the split with the same test_size and
# random_state on a list of the same length selects the same rows, which
# recovers the raw texts in validation order.
_, X_val_texts = train_test_split(X_train, test_size=0.2, random_state=42)
if len(X_val_texts) != len(X_val_tokens):
    raise RuntimeError("Validation texts and validation tokens are out of sync.")

# Inference only: no gradients are needed.
model.eval()
with torch.no_grad():
    for i, tokens in enumerate(X_val_tokens):
        if not tokens:
            continue
        input_tensor = torch.tensor([tokens], dtype=torch.long)
        neural_output = model(input_tensor)
        neural_prediction = 1 if neural_output.item() > 0.5 else 0
        symbolic_prediction = symbolic_reasoning([index_to_word[t] for t in tokens])
        state = tuple(tokens)
        # A token sequence the agent never saw in training has all-zero
        # Q-values, so the greedy choice is action 0 (the neural prediction).
        action = fusion_agent.choose_action(state)
        final_prediction = neural_prediction if action == 0 else symbolic_prediction
        all_predictions.append(final_prediction)
        all_actual_labels.append(y_val[i])
        all_comments.append(X_val_texts[i])

# Overall validation accuracy: share of evaluated tweets whose fused
# prediction equals the true label.
num_correct = sum(1 for p, a in zip(all_predictions, all_actual_labels) if p == a)
accuracy = num_correct / len(all_actual_labels)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

# Confusion matrix (rows = actual label, columns = predicted label),
# rendered as an annotated heatmap.
cm = confusion_matrix(all_actual_labels, all_predictions)
heatmap_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
heatmap_ax.set_title('Confusion Matrix')
plt.show()

# Per-class accuracy: among tweets whose TRUE label is the given class, the
# fraction predicted correctly. A class with no examples scores 0.
labelled_pairs = list(zip(all_predictions, all_actual_labels))
positive_total = all_actual_labels.count(1)
negative_total = all_actual_labels.count(0)

if positive_total > 0:
    positive_hits = sum(p == a for p, a in labelled_pairs if a == 1)
    positive_accuracy = positive_hits / positive_total
else:
    positive_accuracy = 0

if negative_total > 0:
    negative_hits = sum(p == a for p, a in labelled_pairs if a == 0)
    negative_accuracy = negative_hits / negative_total
else:
    negative_accuracy = 0

categories = ['Positive', 'Negative']
accuracy_scores = [positive_accuracy, negative_accuracy]

# Bar chart of the two per-class scores on a fixed 0..1 axis.
bar_ax = plt.gca()
bar_ax.bar(categories, accuracy_scores, color=['green', 'red'])
bar_ax.set_xlabel('Sentiment Category')
bar_ax.set_ylabel('Accuracy')
bar_ax.set_title('Accuracy per Sentiment Category')
bar_ax.set_ylim([0, 1])
plt.show()


Available columns: ['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence', 'negativereason', 'negativereason_confidence', 'airline', 'airline_sentiment_gold', 'name', 'negativereason_gold', 'retweet_count', 'text', 'tweet_coord', 'tweet_created', 'tweet_location', 'user_timezone']
Using column 'airline_sentiment' for sentiment labels.
Loaded 14640 tweets successfully.
Epoch 0, Loss: 3368.7166
