In [2]:
!pip install -q transformers datasets torch flask pyngrok

In [5]:
# Import libraries
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from flask import Flask, request, render_template_string
from pyngrok import ngrok

In [6]:
# Step 2: Load the dataset
# Using 'tweet_eval' dataset with 'hate' subset from Hugging Face.
# Labels: 0 - non-hate, 1 - hate
dataset = load_dataset("tweet_eval", "hate")

# For demo purposes, use a small subset to speed up training
train_dataset = dataset['train'].shuffle(seed=42).select(range(2000))  # 2000 samples for training
eval_dataset = dataset['validation'].shuffle(seed=42).select(range(500))  # 500 for evaluation

In [21]:
# Step 3: Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Preprocess function
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding=True, max_length=128)

# Tokenize datasets
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_eval = eval_dataset.map(preprocess_function, batched=True)

# Step 4: Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=20,  # Adjust epochs as needed
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",  # Disable logging to external services
)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [22]:
# Define compute metrics function
def compute_metrics(eval_pred):
    from sklearn.metrics import accuracy_score
    logits, labels = eval_pred
    predictions = torch.argmax(torch.from_numpy(logits), dim=-1)
    return {"accuracy": accuracy_score(labels, predictions)}

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

# Step 5: Train the model
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.537113,0.712
2,No log,0.588254,0.75
3,No log,0.844259,0.746
4,0.287800,1.571392,0.702
5,0.287800,1.415574,0.738
6,0.287800,1.990414,0.706
7,0.287800,2.06246,0.722
8,0.022300,2.164899,0.718
9,0.022300,2.281201,0.73
10,0.022300,2.353287,0.716


TrainOutput(global_step=2500, training_loss=0.06230036727376282, metrics={'train_runtime': 1093.213, 'train_samples_per_second': 36.589, 'train_steps_per_second': 2.287, 'total_flos': 1891110710400000.0, 'train_loss': 0.06230036727376282, 'epoch': 20.0})

In [24]:
# Step 6: Save the model
model_save_path = "/content/drive/MyDrive/HATE-Application/hate_speech_bert"
trainer.save_model(model_save_path)
print(f"Model saved to {model_save_path}")

# Step 7: Deploy with Flask
# Create Flask app
app = Flask(__name__)

# Load the saved model and tokenizer for inference
inference_model = AutoModelForSequenceClassification.from_pretrained(model_save_path)
inference_tokenizer = AutoTokenizer.from_pretrained(model_save_path)
inference_model.eval()  # Set to evaluation mode

Model saved to /content/drive/MyDrive/HATE-Application/hate_speech_bert


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [26]:
# Step 7: Deploy with Flask
from flask import Flask, request, render_template_string
from pyngrok import ngrok
import torch

app = Flask(__name__)

# Load the saved model and tokenizer for inference
inference_model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/HATE-Application/hate_speech_bert")
inference_tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/HATE-Application/hate_speech_bert")
inference_model.eval()

HTML_TEMPLATE = '''
<!doctype html>
<html>
<head>
    <title>Hate Speech Detector</title>
    <style>
        body {
            background-color: #f0f0f0; /* Light gray background; change to any color, e.g., lightblue, #87CEEB, etc. */
            font-family: Arial, sans-serif; /* Optional: improves readability */
            margin: 20px; /* Optional: adds padding around content */
        }
        h1 {
            text-align: center; /* Centers the title */
            color: #333; /* Optional: darker text color for contrast */
        }
        form {
            display: flex;
            flex-direction: column;
            align-items: center; /* Centers the form content */
        }
        textarea {
            width: 100%;
            max-width: 500px; /* Limits textarea width for better appearance */
            margin-bottom: 10px; /* Space between textarea and button */
        }
        input[type="submit"] {
            padding: 10px 20px;
            background-color: #4CAF50; /* Green button; change as desired */
            color: white;
            border: none;
            cursor: pointer;
        }
        input[type="submit"]:hover {
            background-color: #45a049; /* Darker green on hover */
        }
        h2 {
            text-align: center; /* Centers the prediction output */
            color: #333;
        }
    </style>
</head>
<body>
    <h1>Hate Speech Detector</h1>
    <form method="POST">
        <textarea name="text" placeholder="Enter tweet text here" rows="4" cols="50"></textarea><br>
        <input type="submit" value="Predict">
    </form>
    {% if prediction %}
    <h2>Prediction: {{ prediction }}</h2>
    {% endif %}
</body>
</html>
'''

@app.route('/', methods=['GET', 'POST'])
def predict():
    prediction = None
    if request.method == 'POST':
        text = request.form['text']
        inputs = inference_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
        with torch.no_grad():
            outputs = inference_model(**inputs)
        logits = outputs.logits
        pred_label = torch.argmax(logits, dim=1).item()
        prediction = "Hate Speech" if pred_label == 1 else "Non-Hate Speech"
    return render_template_string(HTML_TEMPLATE, prediction=prediction)

# Step 8: Set up ngrok and Flask
ngrok.set_auth_token("319KtWC2OCOIMXztshFk3qRIofd_2ubbePQhUaEvZCAqznCLR")  # Replace with your ngrok token
ngrok.kill()  # Ensure no stale tunnels
public_url = ngrok.connect(5000)
print(f"Flask app is running at: {public_url}")

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)

Flask app is running at: NgrokTunnel: "https://384d5c9ca8b8.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [11/Aug/2025 18:15:26] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [11/Aug/2025 18:15:27] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
