In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv
/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV



In [3]:
train_data = pd.read_csv("/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv",encoding= 'ISO-8859-1')
test_data = pd.read_csv("/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv",encoding= 'ISO-8859-1')

print('Train data shape',train_data.shape)
print('Test data shape',test_data.shape)

Train data shape (41157, 6)
Test data shape (3798, 6)


In [4]:
train_data.isnull().sum()

UserName            0
ScreenName          0
Location         8590
TweetAt             0
OriginalTweet       0
Sentiment           0
dtype: int64

In [5]:
test_data.isnull().sum()

UserName           0
ScreenName         0
Location         834
TweetAt            0
OriginalTweet      0
Sentiment          0
dtype: int64

In [7]:
train_data = train_data[['OriginalTweet','Sentiment']]
test_data = test_data[['OriginalTweet','Sentiment']]

In [8]:
train_data['Sentiment'].nunique()

5

There are 5 unique types of tweets.

In [9]:
label_encoder = LabelEncoder()
train_data['Sentiment'] = label_encoder.fit_transform(train_data['Sentiment'])
test_data['Sentiment'] = label_encoder.transform(test_data['Sentiment'])

Extremely Negative : 0, Extremely Positive : 1, Negative : 2, Neutral : 3, Positive : 4

In [10]:
# Download NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')

# Define a function for text preprocessing
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text.lower())
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    
    # Stemming (optional)
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    
    # Rejoin tokens into a processed text
    processed_text = ' '.join(stemmed_tokens)
    
    return processed_text

# Apply text preprocessing to the 'OriginalTweet' column
train_data['ProcessedTweet'] = train_data['OriginalTweet'].apply(preprocess_text)
test_data['ProcessedTweet'] = test_data['OriginalTweet'].apply(preprocess_text)

# Train and test sets
X_train = train_data['ProcessedTweet'] 
X_test = test_data['ProcessedTweet'] 
y_train = train_data['Sentiment'] 
y_test = test_data['Sentiment']

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train a classifier (e.g., Multinomial Naive Bayes)
clf = MultinomialNB()

# Define a range of hyperparameter values to search
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 2.0, 5.0],  # Range of alpha values to try
    'fit_prior': [True, False]  # Whether to use prior probabilities
}

# Create a GridSearchCV object with cross-validation (e.g., 5-fold cross-validation)
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')

# Fit the GridSearchCV to the training data
grid_search.fit(X_train_tfidf, y_train)

# Get the best hyperparameter values from the grid search
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Use the best hyperparameters to train the final model
best_clf = MultinomialNB(alpha=best_params['alpha'], fit_prior=best_params['fit_prior'])
best_clf.fit(X_train_tfidf, y_train)

# Make predictions on the test data
y_pred = best_clf.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print classification report
print(classification_report(y_test, y_pred))

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Best Hyperparameters: {'alpha': 2.0, 'fit_prior': False}
Accuracy: 48.05%
              precision    recall  f1-score   support

           0       0.51      0.50      0.51       592
           1       0.56      0.54      0.55       599
           2       0.45      0.45      0.45      1041
           3       0.52      0.55      0.53       619
           4       0.42      0.43      0.42       947

    accuracy                           0.48      3798
   macro avg       0.49      0.49      0.49      3798
weighted avg       0.48      0.48      0.48      3798



In [13]:
# Check if a GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load and preprocess your data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(list(train_data['OriginalTweet']), truncation=True, padding=True, max_length=128, return_tensors='pt')
test_encodings = tokenizer(list(test_data['OriginalTweet']), truncation=True, padding=True, max_length=128, return_tensors='pt')

train_labels = torch.tensor(list(train_data['Sentiment']))
test_labels = torch.tensor(list(test_data['Sentiment']))

# Create data loaders
batch_size = 8
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Load and configure the model for training
num_classes = 5
model_name = 'bert-base-uncased'

model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)
model.to(device)  # Move the model to the GPU if available

# Define optimizer and loss
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

# Training loop
num_epochs = 1
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)  # Move data to GPU
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print(loss)
        
# Evaluate the model on the test set
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)  # Move data to GPU
        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.logits, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f'Accuracy on the test set: {accuracy * 100:.2f}%')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor(0.5384, device='cuda:0', grad_fn=<NllLossBackward0>)
Accuracy on the test set: 83.62%
