Project Of RIO125 - Automate detection and recognition of grammatical error.
Devloped By :- Vishwajeet Vijay Bhosale

In [1]:
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
import nltk
nltk.download('punkt')


# Load the dataset
df = pd.read_csv("/content/sample_data/input_data.csv")  # Replace "your_dataset.csv" with the path to your dataset

# Data Preprocessing
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join tokens back into a string
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

# Apply preprocessing to the text column
df['processed_text'] = df['text'].apply(preprocess_text)

# Split the dataset into training, validation, and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(test_df, test_size=0.5, random_state=42)

# Save preprocessed datasets
train_df.to_csv("train_dataset.csv", index=False)
val_df.to_csv("val_dataset.csv", index=False)
test_df.to_csv("test_dataset.csv", index=False)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load preprocessed datasets
train_df = pd.read_csv("train_dataset.csv")
val_df = pd.read_csv("val_dataset.csv")
test_df = pd.read_csv("test_dataset.csv")

# Handle missing values
train_df.dropna(subset=['processed_text'], inplace=True)
val_df.dropna(subset=['processed_text'], inplace=True)
test_df.dropna(subset=['processed_text'], inplace=True)

# Print the head of updated datasets
print("Head of Train Dataset:")
print(train_df.head())

print("\nHead of Validation Dataset:")
print(val_df.head())

print("\nHead of Test Dataset:")
print(test_df.head())

# Feature Engineering
def extract_features(df):
    # TF-IDF Vectorization
    tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Adjust max_features as needed
    tfidf_features = tfidf_vectorizer.fit_transform(df['processed_text']).toarray()

    return tfidf_features

# Extract features for training, validation, and testing sets
train_features = extract_features(train_df)
val_features = extract_features(val_df)
test_features = extract_features(test_df)

# Save extracted features
pd.DataFrame(train_features).to_csv("train_features.csv", index=False, header=False)
pd.DataFrame(val_features).to_csv("val_features.csv", index=False, header=False)
pd.DataFrame(test_features).to_csv("test_features.csv", index=False, header=False)


Head of Train Dataset:
                                                text  label  \
0                 I'll write something about myself.      1   
1      To use English well, I register on this site.      1   
2  I don't know how much money it would cost, or ...      1   
3                 because it is a very good website.      1   
4  KanJi is japanese character and have variety o...      0   

                                      processed_text  
0                                ill write something  
1                     use english well register site  
2  dont know much money would cost much time woul...  
3                                       good website  
4           kanji japanese character variety meaning  

Head of Validation Dataset:
                                                text  label  \
0                               I'm college student.      0   
1                             be hired as consultant      0   
2                                It’s sunny mornin

In [3]:
import pandas as pd
from transformers import BertTokenizer

# Load preprocessed dataset
train_df = pd.read_csv("train_dataset.csv")
val_df = pd.read_csv("val_dataset.csv")
test_df = pd.read_csv("test_dataset.csv")

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_text(df):
    # Drop rows with missing or empty 'processed_text'
    df = df.dropna(subset=['processed_text'])
    df = df[df['processed_text'] != '']

    # Tokenize the text data
    tokenized_data = tokenizer(df['processed_text'].tolist(), padding=True, truncation=True, return_tensors='pt')

    return tokenized_data

train_tokenized = tokenize_text(train_df)
val_tokenized = tokenize_text(val_df)
test_tokenized = tokenize_text(test_df)

# Print tokenized data shapes
print("Train Tokenized Shape:", train_tokenized['input_ids'].shape)
print("Validation Tokenized Shape:", val_tokenized['input_ids'].shape)
print("Test Tokenized Shape:", test_tokenized['input_ids'].shape)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Train Tokenized Shape: torch.Size([15989, 26])
Validation Tokenized Shape: torch.Size([1997, 23])
Test Tokenized Shape: torch.Size([2000, 20])


In [4]:
import torch
import torch.nn as nn
from transformers import BertModel

class BertClassifier(nn.Module):
    def __init__(self, num_classes):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)  # Adjust dropout probability as needed
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits


In [5]:
print("Input IDs Shape:", train_tokenized['input_ids'].shape)
print("Attention Mask Shape:", train_tokenized['attention_mask'].shape)
print("Labels Shape:", torch.tensor(train_df['label'].tolist()).shape)

Input IDs Shape: torch.Size([15989, 26])
Attention Mask Shape: torch.Size([15989, 26])
Labels Shape: torch.Size([15998])


In [6]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer

# Assuming train_df, val_df, and test_df are your DataFrames containing the data
# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to tokenize text data
def tokenize_text(df, tokenizer):
    tokenized_data = tokenizer(df['text'].tolist(), padding=True, truncation=True, return_tensors='pt', max_length=128, return_attention_mask=True)
    return tokenized_data

# Load the data into DataFrames using the correct file paths
train_df = pd.read_csv('train_dataset.csv')
val_df = pd.read_csv('val_dataset.csv')
test_df = pd.read_csv('test_dataset.csv')


# Check the first few rows of each DataFrame
print("Sample of train_df:")
print(train_df.head())

print("\nSample of val_df:")
print(val_df.head())

print("\nSample of test_df:")
print(test_df.head())
# Tokenize text data for train, validation, and test sets
train_tokenized = tokenize_text(train_df, tokenizer)
val_tokenized = tokenize_text(val_df, tokenizer)
test_tokenized = tokenize_text(test_df, tokenizer)

# Filter the labels to match the number of input samples
filtered_labels_train = train_df['label'][:len(train_tokenized['input_ids'])]
filtered_labels_val = val_df['label'][:len(val_tokenized['input_ids'])]
filtered_labels_test = test_df['label'][:len(test_tokenized['input_ids'])]

# Create TensorDataset for train, validation, and test sets
train_dataset = TensorDataset(train_tokenized['input_ids'], train_tokenized['attention_mask'], torch.tensor(filtered_labels_train.tolist()))
val_dataset = TensorDataset(val_tokenized['input_ids'], val_tokenized['attention_mask'], torch.tensor(filtered_labels_val.tolist()))
test_dataset = TensorDataset(test_tokenized['input_ids'], test_tokenized['attention_mask'], torch.tensor(filtered_labels_test.tolist()))

# Create DataLoader for train, validation, and test sets
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


Sample of train_df:
                                                text  label  \
0                 I'll write something about myself.      1   
1      To use English well, I register on this site.      1   
2  I don't know how much money it would cost, or ...      1   
3                 because it is a very good website.      1   
4  KanJi is japanese character and have variety o...      0   

                                      processed_text  
0                                ill write something  
1                     use english well register site  
2  dont know much money would cost much time woul...  
3                                       good website  
4           kanji japanese character variety meaning  

Sample of val_df:
                                                text  label  \
0                               I'm college student.      0   
1                             be hired as consultant      0   
2                                It’s sunny morning.      0   


In [7]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to tokenize text data
def tokenize_text(df, tokenizer):
    tokenized_data = tokenizer(df['text'].tolist(), padding=True, truncation=True, return_tensors='pt', max_length=128, return_attention_mask=True)
    return tokenized_data

# Tokenize text data for train and validation sets
train_tokenized = tokenize_text(train_df, tokenizer)
val_tokenized = tokenize_text(val_df, tokenizer)

# Filter the labels to match the number of input samples
filtered_labels_train = train_df['label'][:len(train_tokenized['input_ids'])]
filtered_labels_val = val_df['label'][:len(val_tokenized['input_ids'])]

# Create TensorDataset for train and validation sets
train_dataset = TensorDataset(train_tokenized['input_ids'], train_tokenized['attention_mask'], torch.tensor(filtered_labels_train.tolist()))
val_dataset = TensorDataset(val_tokenized['input_ids'], val_tokenized['attention_mask'], torch.tensor(filtered_labels_val.tolist()))

# Create DataLoader for train and validation sets
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Check the lengths of the datasets
print("Length of Train Dataset:", len(train_dataset))
print("Length of Validation Dataset:", len(val_dataset))


Length of Train Dataset: 15998
Length of Validation Dataset: 2000


In [8]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

# Check for GPU availability
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU is not available, using CPU instead")

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to tokenize text data
def tokenize_text(df, tokenizer):
    tokenized_data = tokenizer(df['text'].tolist(), padding=True, truncation=True, return_tensors='pt', max_length=128, return_attention_mask=True)
    return tokenized_data

# Load the data into DataFrames using the correct file paths
train_df = pd.read_csv('train_dataset.csv')
val_df = pd.read_csv('val_dataset.csv')
test_df = pd.read_csv('test_dataset.csv')

# Tokenize text data for train, validation, and test sets
train_tokenized = tokenize_text(train_df, tokenizer)
val_tokenized = tokenize_text(val_df, tokenizer)
test_tokenized = tokenize_text(test_df, tokenizer)

# Filter the labels to match the number of input samples
filtered_labels_train = train_df['label'][:len(train_tokenized['input_ids'])]
filtered_labels_val = val_df['label'][:len(val_tokenized['input_ids'])]
filtered_labels_test = test_df['label'][:len(test_tokenized['input_ids'])]

# Create TensorDataset for train, validation, and test sets
train_dataset = TensorDataset(train_tokenized['input_ids'], train_tokenized['attention_mask'], torch.tensor(filtered_labels_train.tolist()))
val_dataset = TensorDataset(val_tokenized['input_ids'], val_tokenized['attention_mask'], torch.tensor(filtered_labels_val.tolist()))
test_dataset = TensorDataset(test_tokenized['input_ids'], test_tokenized['attention_mask'], torch.tensor(filtered_labels_test.tolist()))

# Create DataLoader for train, validation, and test sets
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Initialize BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)

# Define the optimizer and loss function
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Evaluate the model on the validation set
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

            _, predicted = torch.max(outputs.logits, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {running_loss/len(train_loader)},  Validation Accuracy: {(correct/total)*100}%")


# Save the trained model
torch.save(model.state_dict(), 'bert_model.pth')

GPU is available


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Training Loss: 0.4483939927741885,  Validation Accuracy: 80.35%
Epoch 2/3, Training Loss: 0.24040161559917034,  Validation Accuracy: 81.89999999999999%
Epoch 3/3, Training Loss: 0.10999372425326147,  Validation Accuracy: 84.05%


In [9]:
import torch
from sklearn.metrics import classification_report, accuracy_score

# Function to evaluate the model
def evaluate_model(model, dataloader):

    model.eval()  # Set the model to evaluation mode
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, predicted = torch.max(outputs.logits, 1)

            # Convert tensors to numpy arrays and extend lists
            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return all_predictions, all_labels

# Evaluate the model on the validation set
val_predictions, val_labels = evaluate_model(model, val_loader)

# Calculate classification report
classification_rep = classification_report(val_labels, val_predictions)
print("Classification Report for Validation Set:")
print(classification_rep)

# Calculate accuracy
accuracy = accuracy_score(val_labels, val_predictions)
print("Accuracy on Validation Set:", accuracy)


Classification Report for Validation Set:
              precision    recall  f1-score   support

           0       0.86      0.82      0.84      1024
           1       0.82      0.86      0.84       976

    accuracy                           0.84      2000
   macro avg       0.84      0.84      0.84      2000
weighted avg       0.84      0.84      0.84      2000

Accuracy on Validation Set: 0.8405


In [17]:
import pandas as pd
import torch
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
from torch.utils.data import DataLoader

def classify_text_from_excel(input_file, output_file, model, tokenizer, device):
    # Read input data from Excel file
    df = pd.read_excel(input_file)

    # Select only the first 1000 rows
    df = df.head(1000)

    # Convert text column to a list of strings
    text_list = df['text'].astype(str).tolist()

    # Process text inputs
    tokenized_data = tokenizer(text_list, padding=True, truncation=True, return_tensors='pt', max_length=128, return_attention_mask=True)
    input_ids = tokenized_data['input_ids'].to(device)
    attention_mask = tokenized_data['attention_mask'].to(device)

    # Move model to device
    model.to(device)

    # Predict labels using the model
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    predictions = torch.argmax(outputs.logits, axis=1).cpu().numpy()

    # Add predictions to DataFrame
    df['predicted_label'] = predictions

    # Save DataFrame with predictions back to CSV file
    df.to_csv(output_file, index=False)

# Initialize tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Input and output filenames
input_filename = '/content/sample_data/test_data.xlsx'
output_filename = 'output_file_only_labeled.csv'

# Call the function to classify text from Excel
classify_text_from_excel(input_filename, output_filename, model, tokenizer, device)



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Initialize BERT tokenizer and load pretrained model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.load_state_dict(torch.load('bert_model.pth'))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Function to validate user sentence
def validate_sentence(sentence):
    # Tokenize the input sentence
    inputs = tokenizer(sentence, padding=True, truncation=True, return_tensors='pt').to(device)

    # Pass input through the model
    with torch.no_grad():
        outputs = model(**inputs)

    # Interpret predictions
    prediction = torch.argmax(outputs.logits).item()
    if prediction == 1:
        print("The sentence is grammatically correct.")
    else:
        print("The sentence contains grammar errors.")

# Get user input
user_sentence = input("Enter your sentence: ")

# Validate the user sentence
validate_sentence(user_sentence)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Enter your sentence: Jeff ran a mile and drops his keys.
The sentence contains grammar errors.


In [34]:
import pandas as pd
import torch
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
from torch.utils.data import DataLoader
from nltk import pos_tag, word_tokenize

# Download NLTK data
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Function to validate sentence for grammatical errors
def validate_sentence(sentence):
    # Ensure that the input is a string
    if not isinstance(sentence, str):
        return 1, "None"  # No error

    # Tokenize the input sentence
    tokens = word_tokenize(sentence)

    # Get Part-of-Speech tags for the tokens
    pos_tags = pos_tag(tokens)

    # Check for grammatical errors
    error_type = None
    for i in range(len(pos_tags)-1):
        # Rule 1: Subject-Verb Agreement
        if pos_tags[i][1].startswith('N') and pos_tags[i+1][1].startswith('VB'):
            error_type = "Subject-Verb Agreement"
            return 0, error_type  # Error found

        # Rule 2: Present Simple Tense
        if pos_tags[i][1] == 'VBP' and pos_tags[i][0].endswith('s'):
            error_type = "Present Simple Tense"
            return 0, error_type  # Error found

        # Rule 3: Present Continuous Tense
        if pos_tags[i][1] == 'VBG' and pos_tags[i][0] != 'am':
            error_type = "Present Continuous Tense"
            return 0, error_type  # Error found

        # Rule 4: Present Perfect Tense
        if pos_tags[i][1] == 'VBN' and pos_tags[i][0] != 'been':
            error_type = "Present Perfect Tense"
            return 0, error_type  # Error found

        # Rule 5: Present Perfect Continuous Tense
        if pos_tags[i][1] == 'VBG' and pos_tags[i][0] == 'been':
            error_type = "Present Perfect Continuous Tense"
            return 0, error_type  # Error found

        # Rule 6: Past Simple Tense
        if pos_tags[i][1] == 'VBD' and not pos_tags[i][0].endswith('ed'):
            error_type = "Past Simple Tense"
            return 0, error_type  # Error found

        # Rule 7: Past Continuous Tense
        if pos_tags[i][1] == 'VBD' and pos_tags[i][0] == 'were':
            error_type = "Past Continuous Tense"
            return 0, error_type  # Error found

        # Rule 8: Past Perfect Tense
        if pos_tags[i][1] == 'VBN' and pos_tags[i][0] != 'had':
            error_type = "Past Perfect Tense"
            return 0, error_type  # Error found

        # Rule 9: Past Perfect Continuous Tense
        if pos_tags[i][1] == 'VBN' and pos_tags[i][0] == 'had':
            error_type = "Past Perfect Continuous Tense"
            return 0, error_type  # Error found

        # Rule 10: Future Simple Tense
        if pos_tags[i][1] == 'MD' and pos_tags[i][0] not in ['will', 'shall']:
            error_type = "Future Simple Tense"
            return 0, error_type  # Error found

        # Rule 11: Future Continuous Tense
        if pos_tags[i][1] == 'MD' and pos_tags[i][0] in ['will', 'shall']:
            error_type = "Future Continuous Tense"
            return 0, error_type  # Error found

        # Rule 12: Future Perfect Tense
        if pos_tags[i][1] == 'MD' and pos_tags[i][0] in ['will', 'shall']:
            error_type = "Future Perfect Tense"
            return 0, error_type  # Error found

        # Rule 13: Future Perfect Continuous Tense
        if pos_tags[i][1] == 'MD' and pos_tags[i][0] in ['will', 'shall']:
            error_type = "Future Perfect Continuous Tense"
            return 0, error_type  # Error found

    return 1, "None"  # No error

# Function to classify text from Excel file
def classify_text_from_excel(input_file, output_file, model, tokenizer, device):
    # Read input data from Excel file
    df = pd.read_excel(input_file)


    df = df.sample(n=1000, random_state=42)

    # Convert text column to a list of strings
    text_list = df['text'].astype(str).tolist()

    # Process text inputs
    tokenized_data = tokenizer(text_list, padding=True, truncation=True, return_tensors='pt', max_length=128, return_attention_mask=True)
    input_ids = tokenized_data['input_ids'].to(device)
    attention_mask = tokenized_data['attention_mask'].to(device)

    # Move model to device
    model.to(device)

    # Predict labels using the model
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    predictions = torch.argmax(outputs.logits, axis=1).cpu().numpy()

    # Add columns for grammatical error detection and error type
    df['label'], df['error_type'] = zip(*df['text'].apply(validate_sentence))

    # Save DataFrame with predictions, error detection, and error type back to CSV file
    df.to_csv(output_file, index=False)

# Initialize tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Input and output filenames
input_filename = '/content/sample_data/test_data.xlsx'
output_filename = 'output_file.csv'

# Call the function to classify text from Excel
classify_text_from_excel(input_filename, output_filename, model, tokenizer, device)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
