In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim



In [2]:
### TFIDF

In [3]:
# Load training data
df_domain1 = pd.read_json('domain1_train_data.json', lines=True)
df_domain2 = pd.read_json('domain2_train_data.json', lines=True)
df_combined = pd.concat([df_domain1, df_domain2], ignore_index=True)

# Convert tokenized text to string format for TF-IDF
df_combined['text_str'] = df_combined['text'].apply(lambda x: ' '.join([str(token) for token in x]))

# Apply TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=10000)
X_tfidf = vectorizer.fit_transform(df_combined['text_str']).toarray()
y = df_combined['label'].values

# Split the data
X_train, X_val, y_train, y_val = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)



In [5]:
# Define the model
class DenseClassifier(nn.Module):
    def __init__(self, input_dim):
        super(DenseClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, 512)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(512, 2)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x



In [6]:
# Initialize the model, loss, and optimizer
model = DenseClassifier(input_dim=10000)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Convert arrays to PyTorch tensors and create dataloaders
train_dataset = TensorDataset(torch.FloatTensor(X_train), torch.LongTensor(y_train))
val_dataset = TensorDataset(torch.FloatTensor(X_val), torch.LongTensor(y_val))
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)



In [7]:
# Training the model
epochs = 5
for epoch in range(epochs):
    model.train()
    for data, target in train_loader:
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')



Epoch 1/5, Loss: 0.2982042133808136
Epoch 2/5, Loss: 0.2791811525821686
Epoch 3/5, Loss: 0.20221228897571564
Epoch 4/5, Loss: 0.049922168254852295
Epoch 5/5, Loss: 0.017867418006062508


In [8]:
# Load test data
df_test = pd.read_json('test_data.json', lines=True)  # Update the path accordingly
df_test['text_str'] = df_test['text'].apply(lambda x: ' '.join([str(token) for token in x]))
X_test_tfidf = vectorizer.transform(df_test['text_str']).toarray()

# Make predictions
test_dataset = TensorDataset(torch.FloatTensor(X_test_tfidf))
test_loader = DataLoader(test_dataset, batch_size=64)

predictions = []
model.eval()
with torch.no_grad():
    for data in test_loader:
        data = data[0]  # Extract tensor from the DataLoader batch
        output = model(data)
        _, predicted = torch.max(output, 1)
        predictions.extend(predicted.tolist())

# Saving predictions
df_test['predicted_label'] = predictions
df_test[['predicted_label']].to_csv('test_predictions.csv', index=False)



In [9]:
# Method 2

In [29]:
import pandas as pd
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from torch.nn.utils.rnn import pad_sequence

In [30]:
# Load data
def load_data():
    domain1_df = pd.read_json('domain1_train_data.json', lines=True)
    domain2_df = pd.read_json('domain2_train_data.json', lines=True)
    # Combine and label data
    combined_df = pd.concat([domain1_df.assign(label=1), domain2_df.assign(label=0)])
    return combined_df

combined_df = load_data()

In [31]:
class CustomDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

In [33]:
# Preprocessing and Data Loader
def preprocess_and_load(data, split_ratio=0.8):
    sequences = pad_sequence([torch.tensor(x) for x in data['text']], batch_first=True, padding_value=0)
    labels = torch.tensor(data['label'].values)
    
    # Splitting the dataset
    dataset_size = len(data)
    train_size = int(split_ratio * dataset_size)
    val_size = dataset_size - train_size
    
    dataset = CustomDataset(sequences, labels)
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
    
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32)
    return train_loader, val_loader

train_loader, val_loader = preprocess_and_load(combined_df)

In [43]:
class Autoencoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Linear(input_size, hidden_size)
        self.decoder = nn.Linear(hidden_size, input_size)

    def forward(self, x):
        encoded = torch.relu(self.encoder(x))
        decoded = self.decoder(encoded)
        return decoded


In [44]:
# Assuming a fixed input size for simplification; adjust based on your data
input_size = max(len(x) for x in combined_df['text'])  # Adjust based on actual max sequence length
hidden_size = 128  # Example size, adjust as needed

model = Autoencoder(input_size, hidden_size)
criterion = nn.MSELoss()  # Switched to MSELoss
optimizer = optim.Adam(model.parameters(), lr=1e-3)


In [45]:
def train(model, train_loader, criterion, optimizer, epochs=5):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for sequences, _ in train_loader:  # Labels are not used in autoencoding
            sequences = sequences.float()  # Ensure sequences are floats
            optimizer.zero_grad()
            outputs = model(sequences)
            loss = criterion(outputs, sequences)  # Compare output to input
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch+1}/{epochs}, Average Loss: {avg_loss:.4f}')


In [46]:
train(model, train_loader, criterion, optimizer)


Epoch 1/5, Average Loss: 6023378.4103
Epoch 2/5, Average Loss: 5055552.0472
Epoch 3/5, Average Loss: 4748511.1292
Epoch 4/5, Average Loss: 4590816.1868
Epoch 5/5, Average Loss: 4526051.5033


In [47]:
def calculate_reconstruction_error(model, dataloader):
    model.eval()
    errors = []
    with torch.no_grad():
        for sequences, _ in dataloader:
            sequences = sequences.float()
            outputs = model(sequences)
            loss = ((sequences - outputs) ** 2).mean(dim=1)
            errors.extend(loss.tolist())
    return errors

validation_errors = calculate_reconstruction_error(model, val_loader)


In [48]:
# Example: Using the median as a simple threshold
threshold = np.median(validation_errors)
print("Threshold for classification:", threshold)


Threshold for classification: 849738.0


In [49]:
y_pred = [1 if error > threshold else 0 for error in validation_errors]
y_true = [label.item() for _, label in val_dataset]

accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average='binary')
roc_auc = roc_auc_score(y_true, y_pred)

print(f"Accuracy: {accuracy}\nF1 Score: {f1}\nROC AUC: {roc_auc}")


Accuracy: 0.49944444444444447
F1 Score: 0.3582621082621083
ROC AUC: 0.49931106701940037


In [54]:
from torch.nn.utils.rnn import pad_sequence

def pad_sequences(sequences, max_len=None):
    if not max_len:
        max_len = max(len(seq) for seq in sequences)
    sequences_padded = pad_sequence([torch.tensor(seq)[:max_len] for seq in sequences], batch_first=True, padding_value=0)
    return sequences_padded

# Determine max length from your training/validation data
max_len = max(len(seq) for seq in combined_df['text'])

# Apply padding to the test sequences
test_sequences_padded = pad_sequences(test_df['text'].tolist(), max_len=max_len)
test_dataset = CustomDataset(test_sequences_padded, torch.zeros(len(test_sequences_padded)))  # Dummy labels for test data
test_loader = DataLoader(test_dataset, batch_size=32)

# test_errors = calculate_reconstruction_error(model, test_loader)
# test_predictions = [1 if error > threshold else 0 for error in test_errors]

In [None]:
# Ensemble

In [6]:
from sklearn.ensemble import BaggingClassifier, StackingClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd


In [7]:
# Load training data
df_domain1 = pd.read_json('domain1_train_data.json', lines=True)
df_domain2 = pd.read_json('domain2_train_data.json', lines=True)
df_combined = pd.concat([df_domain1, df_domain2], ignore_index=True)

# Convert tokenized text to string format for TF-IDF
df_combined['text_str'] = df_combined['text'].apply(lambda x: ' '.join([str(token) for token in x]))

# Apply TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=10000)
X = vectorizer.fit_transform(df_combined['text_str']).toarray()
y = df_combined['label'].values

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)



In [8]:
# Initialize models for bagging, stacking, and boosting
bagging_cls = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=100, random_state=42)
boosting_cls = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=42)
stacking_cls = StackingClassifier(estimators=[('dt', DecisionTreeClassifier(random_state=42)), 
                                              ('svc', SVC(probability=True, random_state=42)), 
                                              ('lr', LogisticRegression(random_state=42)), 
                                              ('knn', KNeighborsClassifier())], final_estimator=RandomForestClassifier(n_estimators=100, random_state=42), cv=5)


In [9]:
# Train models
bagging_cls.fit(X_train, y_train)
boosting_cls.fit(X_train, y_train)
stacking_cls.fit(X_train, y_train)


In [10]:
# Make predictions on the validation set
pred_bagging = bagging_cls.predict(X_val)
pred_boosting = boosting_cls.predict(X_val)
pred_stacking = stacking_cls.predict(X_val)


In [11]:
# Evaluate models
print("Bagging Accuracy:", accuracy_score(y_val, pred_bagging))
print("Boosting Accuracy:", accuracy_score(y_val, pred_boosting))
print("Stacking Accuracy:", accuracy_score(y_val, pred_stacking))


Bagging Accuracy: 0.8347222222222223
Boosting Accuracy: 0.8002777777777778
Stacking Accuracy: 0.8755555555555555


In [20]:
print("Bagging F1:", f1_score(y_val, pred_bagging))
print("Boosting F1:", f1_score(y_val, pred_boosting))
print("Stacking F1:", f1_score(y_val, pred_stacking))


Bagging F1: 0.5311268715524035
Boosting F1: 0.4573584905660377
Stacking F1: 0.7210460772104608


In [21]:
print("Bagging ROC-AUC:", roc_auc_score(y_val, pred_bagging))
print("Boosting ROC-AUC:", roc_auc_score(y_val, pred_boosting))
print("Stacking ROC-AUC:", roc_auc_score(y_val, pred_stacking))

Bagging ROC-AUC: 0.6873364753931417
Boosting ROC-AUC: 0.6499522603269217
Stacking ROC-AUC: 0.8222096687050217


In [15]:
# Load test data
df_test = pd.read_json('test_data.json', lines=True)  # Update the path accordingly
df_test['text_str'] = df_test['text'].apply(lambda x: ' '.join([str(token) for token in x]))
X_test = vectorizer.transform(df_test['text_str']).toarray()


In [16]:
# Make predictions on test data
test_pred_bagging = bagging_cls.predict(X_test)
test_pred_boosting = boosting_cls.predict(X_test)
test_pred_stacking = stacking_cls.predict(X_test)


In [18]:
# Majority voting
final_predictions = np.round((test_pred_bagging + test_pred_boosting + test_pred_stacking) / 3).astype(int)

# Save to CSV
df_test['Predicted_Label'] = final_predictions
df_test.to_csv('final_predictions.csv', index=False)

In [19]:
# Save to CSV
df_test['class'] = test_pred_stacking
df_test.to_csv('stacking_predictions.csv', index=False)

In [None]:
# Stacking

In [35]:
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


In [23]:
# Base learners with a diverse set
base_learners = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('svc', SVC(probability=True, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42))
]

# Meta-learner
meta_learner = LogisticRegression()

# Stacking classifier
stacking_cls = StackingClassifier(estimators=base_learners, final_estimator=meta_learner, cv=5)



In [24]:
# Hyperparameter tuning for the meta-learner
param_grid = {
    'final_estimator__C': [0.1, 1.0, 10.0],
    'stack_method': ['auto', 'predict_proba']
}

# Grid search
grid = GridSearchCV(estimator=stacking_cls, param_grid=param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)
print("Best cross-validation score:", grid.best_score_)

KeyboardInterrupt: 

In [36]:
# Define models
rf = RandomForestClassifier(n_estimators=100, random_state=42)
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
svc = make_pipeline(StandardScaler(with_mean=False), SVC(gamma='auto', probability=True))

# Ensemble: Voting Classifier
voting_clf = VotingClassifier(
    estimators=[('rf', rf), ('gb', gb), ('svc', svc)],
    voting='soft'
)

# Train models
for clf in (rf, gb, svc, voting_clf):
    clf_name = clf.__class__.__name__
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    print(f"{clf_name} accuracy: {scores.mean():.2f} (+/- {scores.std() * 2:.2f})")

# Train final model on full training data and make predictions on test data
voting_clf.fit(X_train, y_train)
test_pred = voting_clf.predict(X_test)

# Save to CSV
submission_df = pd.DataFrame({'predicted_label': test_pred})
submission_df.to_csv('test_predictions.csv', index=False)

RandomForestClassifier accuracy: 0.83 (+/- 0.01)
GradientBoostingClassifier accuracy: 0.82 (+/- 0.01)
Pipeline accuracy: 0.83 (+/- 0.00)
VotingClassifier accuracy: 0.84 (+/- 0.01)


NameError: name 'y_test' is not defined