In [3]:
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import pandas as pd
import numpy as np

# Set seeds for reproducibility
def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)

# Loading dataset
df = pd.read_csv('data.csv')

# Step 1: Preprocessing the data

# Encoding the 'tail' column (yes -> 1, no -> 0)
df['tail_encoded'] = df['tail'].map({'yes': 1, 'no': 0})

# Encoding the 'species' column using LabelEncoder
label_encoder = LabelEncoder()
df['species_encoded'] = label_encoder.fit_transform(df['species'])

# Standardizing the 'fingers' column
scaler = StandardScaler()
df['fingers_scaled'] = scaler.fit_transform(df[['fingers']])

# Converting 'message' column to TF-IDF features
tfidf_vectorizer = TfidfVectorizer(max_features=100)
tfidf_features = tfidf_vectorizer.fit_transform(df['message']).toarray()

# Combining all features into a single DataFrame
X = pd.DataFrame(tfidf_features, columns=[f'tfidf_{i}' for i in range(tfidf_features.shape[1])])
X['tail_encoded'] = df['tail_encoded']
X['fingers_scaled'] = df['fingers_scaled']

# Target variable
y = df['species_encoded']

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Train Logistic Regression model
logreg_model = LogisticRegression(max_iter=1000)
logreg_model.fit(X_train, y_train)

# Train Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# BERT Model Definition
class CustomBERTModel(nn.Module):
    def __init__(self, bert_model_name, num_labels, dropout_rate):
        super(CustomBERTModel, self).__init__()
        self.bert = BertForSequenceClassification.from_pretrained(bert_model_name, num_labels=num_labels)

        self.fc1 = nn.Linear(2, 16)  # 2 features: tail and fingers
        self.dropout_fc1 = nn.Dropout(p=dropout_rate)
        self.fc2 = nn.Linear(768 + 16, 256)  # 768 (BERT output) + 16 (tabular data output from fc1)
        self.dropout_fc2 = nn.Dropout(p=dropout_rate)
        self.fc3 = nn.Linear(256, num_labels)
        self.dropout_fc3 = nn.Dropout(p=dropout_rate)

    def forward(self, input_ids, attention_mask, tail_encoded, fingers_scaled):
        bert_outputs = self.bert.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = bert_outputs[1]  # CLS token output (768 features)

        tabular_data = torch.cat((tail_encoded.unsqueeze(1), fingers_scaled.unsqueeze(1)), dim=1)
        tabular_features = torch.relu(self.fc1(tabular_data))
        tabular_features = self.dropout_fc1(tabular_features)

        combined = torch.cat((pooled_output, tabular_features), dim=1)  # Shape will be 768 (BERT) + 16 (tabular)
        combined = torch.relu(self.fc2(combined))  # Input to fc2 is now 768 + 16 = 784
        combined = self.dropout_fc2(combined)

        logits = self.fc3(combined)
        logits = self.dropout_fc3(logits)

        return logits

# Initialize BERT model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = CustomBERTModel('bert-base-uncased', num_labels=len(df['species'].unique()), dropout_rate=0.3)

# Tokenize the text data for BERT
train_tokenized_inputs = tokenizer(
    df.loc[X_train.index, 'message'].tolist(),
    padding=True,
    truncation=True,
    max_length=32,
    return_tensors='pt'
)
test_tokenized_inputs = tokenizer(
    df.loc[X_test.index, 'message'].tolist(),
    padding=True,
    truncation=True,
    max_length=32,
    return_tensors='pt'
)

# Converting other features and labels to tensors
train_tail_encoded = torch.tensor(df.loc[X_train.index, 'tail_encoded'].tolist(), dtype=torch.float32)
train_fingers_scaled = torch.tensor(df.loc[X_train.index, 'fingers_scaled'].tolist(), dtype=torch.float32)
train_labels = torch.tensor(y_train.tolist(), dtype=torch.long)

test_tail_encoded = torch.tensor(df.loc[X_test.index, 'tail_encoded'].tolist(), dtype=torch.float32)
test_fingers_scaled = torch.tensor(df.loc[X_test.index, 'fingers_scaled'].tolist(), dtype=torch.float32)
test_labels = torch.tensor(y_test.tolist(), dtype=torch.long)

# Creating DataLoader for BERT
train_dataset = TensorDataset(
    train_tokenized_inputs['input_ids'], train_tokenized_inputs['attention_mask'],
    train_tail_encoded, train_fingers_scaled, train_labels
)
test_dataset = TensorDataset(
    test_tokenized_inputs['input_ids'], test_tokenized_inputs['attention_mask'],
    test_tail_encoded, test_fingers_scaled, test_labels
)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Training the BERT model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model.to(device)
optimizer = Adam(bert_model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

num_epochs = 10
bert_model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_loader:
        input_ids, attention_mask, tail_encoded, fingers_scaled, labels = [x.to(device) for x in batch]
        optimizer.zero_grad()
        logits = bert_model(input_ids, attention_mask, tail_encoded, fingers_scaled)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}")

# Making predictions with BERT
bert_model.eval()
bert_preds = []
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, tail_encoded, fingers_scaled, labels = [x.to(device) for x in batch]
        logits = bert_model(input_ids, attention_mask, tail_encoded, fingers_scaled)
        bert_preds.append(torch.argmax(logits, dim=-1).cpu().numpy())

bert_preds = np.concatenate(bert_preds)

# Step 4: Voting Classifier (BERT + Logistic Regression + Random Forest)
# We have predictions from BERT, Logistic Regression, and Random Forest

# Logistic Regression predictions
logreg_preds = logreg_model.predict(X_test)

# Random Forest predictions
rf_preds = rf_model.predict(X_test)

# Voting Function
def majority_vote(bert_preds, logreg_preds, rf_preds):
    preds = np.stack([bert_preds, logreg_preds, rf_preds], axis=1)
    final_preds = [np.bincount(row).argmax() for row in preds]
    return np.array(final_preds)

# Combining predictions via majority voting
final_preds = majority_vote(bert_preds, logreg_preds, rf_preds)

# Evaluating the final predictions
final_accuracy = accuracy_score(y_test, final_preds)
print(f"Final Ensemble Accuracy: {final_accuracy}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 2.2592720127105714
Epoch 2, Loss: 1.9062764167785644
Epoch 3, Loss: 1.6359659790992738
Epoch 4, Loss: 1.4986241936683655
Epoch 5, Loss: 1.448446787595749
Epoch 6, Loss: 1.3838802039623261
Epoch 7, Loss: 1.293114334344864
Epoch 8, Loss: 1.3277151918411254
Epoch 9, Loss: 1.2641692924499512
Epoch 10, Loss: 1.2238049006462097
Final Ensemble Accuracy: 0.89


In [4]:
# Loading the test dataset
new_df = pd.read_csv('/content/test.csv')

# Step 1: Preprocessing the new data
# Encoding the 'tail' column (yes -> 1, no -> 0)
new_df['tail_encoded'] = new_df['tail'].map({'yes': 1, 'no': 0})


# Standardizimg the 'fingers' column
new_df['fingers_scaled'] = scaler.transform(new_df[['fingers']])

# Converting 'message' column to TF-IDF features using the same TfidfVectorizer
new_tfidf_features = tfidf_vectorizer.transform(new_df['message']).toarray()

# Combining all features into a single DataFrame for the new data
X_new = pd.DataFrame(new_tfidf_features, columns=[f'tfidf_{i}' for i in range(new_tfidf_features.shape[1])])
X_new['tail_encoded'] = new_df['tail_encoded']
X_new['fingers_scaled'] = new_df['fingers_scaled']


In [5]:
# Logistic Regression predictions for the new data
logreg_new_preds = logreg_model.predict(X_new)

In [6]:
# Random Forest predictions for the new data
rf_new_preds = rf_model.predict(X_new)

In [7]:
# Tokenizing the text data for BERT
new_tokenized_inputs = tokenizer(
    new_df['message'].tolist(),
    padding=True,
    truncation=True,
    max_length=32,
    return_tensors='pt'
)

# Converting the other features (tail_encoded, fingers_scaled) to tensors
new_tail_encoded = torch.tensor(new_df['tail_encoded'].tolist(), dtype=torch.float32)
new_fingers_scaled = torch.tensor(new_df['fingers_scaled'].tolist(), dtype=torch.float32)

# Creating DataLoader for the new data
new_dataset = TensorDataset(
    new_tokenized_inputs['input_ids'], new_tokenized_inputs['attention_mask'],
    new_tail_encoded, new_fingers_scaled
)

new_loader = DataLoader(new_dataset, batch_size=8, shuffle=False)

# BERT predictions
bert_model.eval()
bert_new_preds = []
with torch.no_grad():
    for batch in new_loader:
        input_ids, attention_mask, tail_encoded, fingers_scaled = [x.to(device) for x in batch]
        logits = bert_model(input_ids, attention_mask, tail_encoded, fingers_scaled)
        bert_new_preds.append(torch.argmax(logits, dim=-1).cpu().numpy())

bert_new_preds = np.concatenate(bert_new_preds)

In [8]:
# Voting function (already defined)
def majority_vote(bert_preds, logreg_preds, rf_preds):
    preds = np.stack([bert_preds, logreg_preds, rf_preds], axis=1)
    final_preds = [np.bincount(row).argmax() for row in preds]
    return np.array(final_preds)

# Applying majority voting for the final predictions
final_new_preds = majority_vote(bert_new_preds, logreg_new_preds, rf_new_preds)

In [9]:
# Converting the integer predictions back to species names
final_new_species = label_encoder.inverse_transform(final_new_preds)

# Saving the predictions to a new CSV
new_df['predicted_species'] = final_new_species
new_df.to_csv('new_predictions.csv', index=False)

print("Predictions saved to 'new_predictions.csv'")

Predictions saved to 'new_predictions.csv'
