In [None]:
import numpy as np 
import pandas as pd 

import seaborn as sns
import matplotlib.pyplot as plt
import re

import torch

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import optuna

import os
# Lazily enumerate the full path of every file below the Kaggle input
# directory and print each one, so the available datasets show up in the output.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

In [None]:
# Locations of the two competition CSV files.
train_csv_path = '/kaggle/input/nlp-getting-started/train.csv'
test_csv_path = '/kaggle/input/nlp-getting-started/test.csv'

# `df` holds the contents of train.csv, `test` the contents of test.csv.
df = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [None]:
# Preview the first five rows of the training frame.
df.head(n=5)

In [None]:
# Preview the first five rows of the test frame.
test.head(n=5)

# Basic Cleaning

In [None]:
def _lowercase_and_collapse(text):
    """Lowercase each whitespace-separated token and rejoin with single spaces.

    The value is passed through ``str()`` first, so non-string cell values are
    converted to their string form before being split.
    """
    tokens = str(text).split()
    return " ".join(token.lower() for token in tokens)


# Normalise the text column of both frames in the same way.
df['text'] = df['text'].apply(_lowercase_and_collapse)
test['text'] = test['text'].apply(_lowercase_and_collapse)

# Helpers for clean(), built once instead of on every call.
# The dot in "t.co" is escaped so that only the literal t.co host matches.
_TCO_URL_RE = re.compile(r"https?://t\.co/[A-Za-z0-9]+")
_SPECIAL_CHARS = '@#!?+&*[]-%:/()$=><|{}^'
_STRIP_SPECIAL_TABLE = str.maketrans('', '', _SPECIAL_CHARS)


def clean(tweet):
    """Strip t.co links and a fixed set of punctuation characters from a tweet.

    Parameters
    ----------
    tweet : str
        The tweet text to clean.

    Returns
    -------
    str
        The text with every ``http(s)://t.co/<alphanumerics>`` link removed and
        every character listed in ``_SPECIAL_CHARS`` deleted. Whitespace is
        left untouched, so removing a link between two spaces leaves both
        spaces in place.
    """
    # Links must go first: they contain ':' and '/', which the character
    # stripping below would otherwise remove, breaking the URL pattern.
    without_links = _TCO_URL_RE.sub("", tweet)

    # Delete all special characters in a single pass.
    return without_links.translate(_STRIP_SPECIAL_TABLE)

# Run the cleaner over the text column of both the training and the test frame.
for frame in (df, test):
    frame['text'] = frame['text'].apply(clean)

# BERT embedding

In [None]:
%%time

from transformers import AutoModel, BertTokenizerFast, AutoTokenizer
import torch.nn.functional as F

# The tokenizer and the encoder are loaded from the same pretrained checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = AutoModel.from_pretrained(BERT_CHECKPOINT)

def get_sentence_embedding(sentence):
    """Embed ``sentence`` using the module-level ``tokenizer`` and ``model``.

    The input is tokenized (with padding and truncation) into PyTorch tensors
    and run through ``model`` with gradient tracking disabled.

    Returns the slice of ``last_hidden_state`` at sequence position 0 -
    presumably the [CLS] token for this tokenizer; confirm against the
    tokenizer documentation.
    """
    encoded = tokenizer(sentence, padding=True, truncation=True, return_tensors="pt")
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        first_token_state = model(**encoded).last_hidden_state[:, 0, :]
    return first_token_state

def _to_flat_numpy(embedding):
    """Flatten an embedding tensor and convert it to a NumPy array."""
    return embedding.flatten().numpy()


# Embed every row of both frames; each row triggers its own model call.
for frame in (df, test):
    frame["text_emb"] = frame["text"].apply(get_sentence_embedding)

# Only the training frame is flattened in this cell.
df["flattened_embedding"] = df["text_emb"].apply(_to_flat_numpy)

# XGBoost Classification

In [None]:
import xgboost as xgb

# Expand the per-row embedding arrays into a feature table: one row per
# training example, one column per embedding component.
embedding_rows = df["flattened_embedding"].tolist()
X = pd.DataFrame(embedding_rows)
y = df["target"]

# NOTE: this rebinds the global name `model`, which until now referred to the
# BERT encoder that get_sentence_embedding() calls. That function must not be
# called again after this cell unless the encoder is reloaded first.
model = xgb.XGBClassifier()
model.fit(X, y)

# 2D representation

In [None]:
from sklearn.manifold import TSNE

# Project the feature table down to two dimensions (seeded for repeatability).
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X)

# Scatter the 2-D points, colouring each one by its label in y.
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y)
fig.colorbar(points, ax=ax)
ax.set_title("t-SNE Visualization of BERT Embeddings")
ax.set_xlabel("Dimension 1")
ax.set_ylabel("Dimension 2")
plt.show()

# Evaluation on the training data

In [None]:
# Score the model on X, the same rows it is asked to predict here being the
# training features, so these are training-set metrics.
ypred_train = model.predict(X)

# Unpacking into four counts requires a 2x2 confusion matrix.
tn, fp, fn, tp = confusion_matrix(y, ypred_train).ravel()

# Despite its name, `specificity` stores 1 - specificity, i.e. fp / (tn + fp).
specificity = 1 - tn / (tn + fp)

print('Train:')
print('tn, fp, fn, tp', tn, fp, fn, tp)
print('1- specificity', specificity)
print(classification_report(y, ypred_train))

# Tuning using Optuna

In [None]:
# Hold out 20% of the rows for scoring Optuna trials; the fixed seed keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Define the objective function for Optuna
def objective(trial):
    """Optuna objective: hold-out accuracy of an XGBoost classifier.

    Samples one hyperparameter configuration from ``trial``, fits an
    ``XGBClassifier`` on the module-level ``X_train`` / ``y_train`` and scores
    it on the module-level ``X_test`` / ``y_test``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        The trial used to sample the hyperparameters.

    Returns
    -------
    float
        Accuracy on the held-out split (the study maximises this value).
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 200),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        # Sampled on a log scale. suggest_float(..., log=True) is the supported
        # replacement for the deprecated suggest_loguniform.
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 0.9),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 0.9),
        "gamma": trial.suggest_float("gamma", 0, 5),
    }

    # Use a local name so the candidate is never confused with the global `model`.
    candidate = xgb.XGBClassifier(**params)
    candidate.fit(X_train, y_train)

    return accuracy_score(y_test, candidate.predict(X_test))

# Seed the sampler so that a fresh run explores the same sequence of trials and
# reports the same best hyperparameters. TPESampler is the sampler Optuna uses
# by default for a single-objective study, so only the seed changes.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

best_params = study.best_params
print("Best Hyperparameters:", best_params)

# Refit a classifier with the best configuration on the training split only.
final_model = xgb.XGBClassifier(**best_params)
final_model.fit(X_train, y_train)

In [None]:
# final_model was fitted on X_train only, while X also contains the 20% of rows
# that train_test_split held out, so this report mixes seen and unseen rows.
ypred_train = final_model.predict(X)

# Unpacking into four counts requires a 2x2 confusion matrix.
tn, fp, fn, tp = confusion_matrix(y, ypred_train).ravel()

# Despite its name, `specificity` stores 1 - specificity, i.e. fp / (tn + fp).
specificity = 1 - tn / (tn + fp)

print('Train:')
print('tn, fp, fn, tp', tn, fp, fn, tp)
print('1- specificity', specificity)
print(classification_report(y, ypred_train))

In [None]:
# Flatten the test-set embeddings into one NumPy vector per row.
test["flattened_embedding"] = test["text_emb"].apply(lambda x: x.flatten().numpy())

# Keep the competition features under their own name. Reusing `X_test` here
# would overwrite the validation split and leave it out of step with `y_test`,
# so any later call of objective() would fail on mismatched lengths.
X_submission = pd.DataFrame(test["flattened_embedding"].to_list())
ypred_test = final_model.predict(X_submission)

In [None]:
# Build the submission as a new frame. Adding a column to the bare
# `test[['id']]` slice is chained assignment, which makes pandas emit a
# SettingWithCopyWarning; `.assign` returns a fresh frame with both columns.
submission = test[['id']].assign(target=ypred_test)

In [None]:
# Save without the DataFrame index so the file holds only the submission columns.
submission_path = "submission.csv"
submission.to_csv(submission_path, index=False)