In [None]:
import pickle
import pandas as pd
from nltk.tokenize import sent_tokenize
import nltk
import os

# Fetch the Punkt models that sent_tokenize relies on.
nltk.download('punkt')

input_directory = '/content/training_data'
output_directory = '/content/new_training_data'

# Create the output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# Convert every document-level .pkl in the input directory into a
# sentence-level .pkl (one row per sentence) in the output directory.
for filename in os.listdir(input_directory):
    if not filename.endswith('.pkl'):
        continue

    input_path = os.path.join(input_directory, filename)

    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only run this on pickles from a trusted source.
    with open(input_path, 'rb') as f:
        data = pickle.load(f)

    # Flat, parallel lists: one entry per sentence across all rows of this file.
    sentences = []
    types = []

    for index, row in data.iterrows():
        # Split the row's text into sentences and fetch its per-sentence labels.
        split_sentences = sent_tokenize(row['content'])
        sentence_types = row['sentence_type']

        # zip() stops at the shorter sequence, so a tokenizer/label count
        # mismatch would silently drop items (and may misalign the labels).
        # Surface it instead of hiding it; the pairing itself is unchanged.
        if len(split_sentences) != len(sentence_types):
            print(
                f"Warning: {filename} row {index}: {len(split_sentences)} sentences "
                f"but {len(sentence_types)} labels; unmatched items are dropped."
            )

        for sentence, sentence_type in zip(split_sentences, sentence_types):
            sentences.append(sentence)
            types.append(sentence_type)

    # Create a new dataframe
    new_df = pd.DataFrame({
        'sentence': sentences,
        'sentence_type': types
    })

    # Derive the output name from the extension only; str.replace would also
    # rewrite any '.pkl' that happens to appear earlier in the file name.
    stem, extension = os.path.splitext(filename)
    output_filename = f"{stem}_modified{extension}"
    output_path = os.path.join(output_directory, output_filename)

    with open(output_path, 'wb') as f:
        pickle.dump(new_df, f)

    print(f"Processed and saved {output_filename}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Processed and saved Sentence Type Training Dataset 4 - October 6, 1996 Presidential Debate Transcript_modified.pkl
Processed and saved Sentence Type Training Dataset 2 - October 13, 2004 Presidential Debate Transcript_modified.pkl
Processed and saved Sentence Type Training Dataset 1 - October 19, 1992 Presidential Debate Transcript_modified.pkl
Processed and saved Sentence Type Training Dataset 3 - October 9, 2016 Presidential Debate Transcript_modified.pkl


In [None]:
import os
import pandas as pd
import pickle
import numpy as np
from sklearn.metrics import accuracy_score
# Step 1: Load and Combine Dataset with Diagnostics
def load_pickles(directory):
    """Load every pickled DataFrame in ``directory`` and stack them into one frame.

    Files are visited in sorted name order so the combined row order is
    reproducible. Files that cannot be unpickled, or that do not contain a
    pandas DataFrame, are reported on stdout and skipped.

    Args:
        directory: Path of the folder to scan for ``*.pkl`` files.

    Returns:
        A DataFrame with the rows of all loaded frames and a fresh 0..n-1
        index. An empty DataFrame is returned (with a warning) when nothing
        could be loaded.

    Raises:
        OSError: If ``directory`` cannot be listed (e.g. it does not exist).
    """
    dataframes = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".pkl"):
            continue
        filepath = os.path.join(directory, filename)
        try:
            # SECURITY: pickle.load can execute arbitrary code embedded in the
            # file; only point this at pickles from a trusted source.
            with open(filepath, 'rb') as file:
                df = pickle.load(file)
        except Exception as e:
            # Best effort: report the unreadable file and keep going.
            print(f"Error reading {filename}: {e}")
            continue

        if not isinstance(df, pd.DataFrame):
            print(f"Warning: {filename} did not contain a DataFrame.")
            continue

        dataframes.append(df)
        print(f"Loaded {filename} with {len(df)} rows.")

    # pd.concat raises "No objects to concatenate" on an empty list; give the
    # caller a usable (empty) frame and an explicit diagnostic instead.
    if not dataframes:
        print(f"Warning: no DataFrames were loaded from {directory}.")
        return pd.DataFrame()

    combined_df = pd.concat(dataframes, ignore_index=True)
    print(f"Combined dataframe has {len(combined_df)} rows.")
    return combined_df

# Folder holding the sentence-level pickles that get merged below.
data_directory = '/content/new_training_data'
dataset = load_pickles(directory=data_directory)

# Render the combined frame as the cell output to eyeball the result.
dataset

Loaded Sentence Type Training Dataset 1 - October 19, 1992 Presidential Debate Transcript_modified.pkl with 1111 rows.
Loaded Sentence Type Training Dataset 3 - October 9, 2016 Presidential Debate Transcript_modified.pkl with 1234 rows.
Loaded Sentence Type Training Dataset 4 - October 6, 1996 Presidential Debate Transcript_modified.pkl with 1097 rows.
Loaded Sentence Type Training Dataset 2 - October 13, 2004 Presidential Debate Transcript_modified.pkl with 1079 rows.
Combined dataframe has 4521 rows.


Unnamed: 0,sentence,sentence_type
0,Good evening.,declarative
1,Welcome to this third and final debate among t...,declarative
2,"Governor Bill Clinton, the Democratic nominee,...",declarative
3,(APPLAUSE) I am Jim Lehrer of the MacNeil-Lehr...,declarative
4,"I will be the moderator for this debate, which...",declarative
...,...,...
4516,I want to wish both of you the very best of lu...,declarative
4517,That's it for us from Arizona State University...,declarative
4518,I'm Bob Schieffer at CBS News.,declarative
4519,"Goodnight, everyone.",declarative


In [None]:
# Read the imperative corpus, rename its text column to the shared
# 'sentence' name and tag every row with its class.
imperatives = (
    pd.read_csv('/content/Imperative Sentences.csv')
    .rename(columns={'Imperative Sentences': 'sentence'})
    .assign(sentence_type='imperative')
)
imperatives

Unnamed: 0,sentence,sentence_type
0,Please bring me a glass of water.,imperative
1,Don't forget to submit your assignment on time.,imperative
2,Turn off the lights when you leave.,imperative
3,Leave the building right now!,imperative
4,Consider your options carefully before making ...,imperative
...,...,...
994,Don’t forget to review the presentation before...,imperative
995,Remind to follow the schedule closely.,imperative
996,Meet at the entrance for the briefing.,imperative
997,Let know if additional resources are needed.,imperative


In [None]:
# Read the exclamatory corpus, rename its text column to the shared
# 'sentence' name and tag every row with its class.
exclamatory = (
    pd.read_csv('/content/Exclamatory Sentences.csv')
    .rename(columns={'Exclamatory Sentences': 'sentence'})
    .assign(sentence_type='exclamatory')
)
exclamatory

Unnamed: 0,sentence,sentence_type
0,How incredible it is that she won the award!,exclamatory
1,Wow! They’ve done an outstanding job!,exclamatory
2,How could she let that happen!,exclamatory
3,Oh! Rachel and David are absolutely beaming wi...,exclamatory
4,What a fantastic idea Chloe had!,exclamatory
...,...,...
995,Oh! I can’t believe we missed the train!,exclamatory
996,Oh! They’re so proud of what they’ve accomplis...,exclamatory
997,What a beautiful gesture that was!,exclamatory
998,Wow! That’s the most delicious cake I’ve ever ...,exclamatory


In [None]:
# Read the question corpus, rename 'question' to the shared 'sentence'
# name, tag every row with its class and keep only the first 5000 rows.
interrogative = (
    pd.read_csv('/content/interrogative_sentences.csv')
    .rename(columns={'question': 'sentence'})
    .assign(sentence_type='interrogative')
    .iloc[:5000]
)

In [None]:
# Seed for the shuffle so a re-run yields the same row order for the same input.
SHUFFLE_SEED = 42

# Merge the debate sentences with the three single-class corpora and shuffle.
#
# This cell reads `dataset` and writes `dataset`, so running it more than once
# feeds the previous result back in and would append the extra corpora again.
# Dropping exact (sentence, sentence_type) repeats keeps the set of rows the
# same no matter how often the cell runs, and stops identical examples from
# ending up on both sides of a later train/validation split.
dataset = (
    pd.concat([dataset, imperatives, exclamatory, interrogative], ignore_index=True)
    .drop_duplicates(subset=['sentence', 'sentence_type'])
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)
dataset

Unnamed: 0,sentence,sentence_type
0,Which area in Alaska is the nation's second hi...,interrogative
1,What a fantastic day we’ve had!,exclamatory
2,How exciting the atmosphere in Rio during Carn...,exclamatory
3,Oh! Sarah is so proud of what they accomplished!,exclamatory
4,Which village do 3rd and 4th Street extend into?,interrogative
...,...,...
28512,How many attacks comprised the Plymouth Blitz?,interrogative
28513,"Three weeks from now - 2 weeks from tomorrow, ...",declarative
28514,What group conquered southern France from Musl...,interrogative
28515,The Brooklyn Dodgers were created in what year?,interrogative


In [None]:
# Class balance: number of rows per sentence type.
dataset.value_counts(subset='sentence_type')

Unnamed: 0_level_0,count
sentence_type,Unnamed: 1_level_1
interrogative,16318
declarative,4143
imperative,4056
exclamatory,4000


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset

# Step 2: Preprocess Data
# Encode the labels (Imperative, Exclamatory, Declarative, Interrogative) into integers
label_mapping = {'imperative': 0, 'exclamatory': 1, 'declarative': 2, 'interrogative': 3}
dataset['label'] = dataset['sentence_type'].map(label_mapping)

# .map() produces NaN for any sentence_type missing from label_mapping.
# Fail here with the offending values rather than later inside training.
unmapped_types = dataset.loc[dataset['label'].isna(), 'sentence_type']
if not unmapped_types.empty:
    raise ValueError(
        f"sentence_type values without a label: {sorted(unmapped_types.astype(str).unique())}"
    )
dataset['label'] = dataset['label'].astype(int)

# Split data into training and validation sets.
# random_state makes the split reproducible; stratify keeps the class
# proportions the same in both parts, which matters because the classes
# are far from balanced.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    dataset['sentence'],
    dataset['label'],
    test_size=0.2,
    random_state=42,
    stratify=dataset['label'],
)

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize data: each split is truncated and padded independently.
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Step 3: Define Custom Dataset Class
class CustomDataset(Dataset):
    """Dataset that pairs tokenizer encodings with integer class labels.

    ``encodings`` is a mapping from feature name to a per-example sequence;
    ``labels`` is a sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, target under 'labels'."""
        example = {}
        for feature_name, feature_values in self.encodings.items():
            example[feature_name] = torch.tensor(feature_values[idx])
        # The label is stored as a long tensor.
        example['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return example

# Wrap each split's encodings and labels (converted to a plain list) for the Trainer.
train_dataset = CustomDataset(encodings=train_encodings, labels=train_labels.tolist())
val_dataset = CustomDataset(encodings=val_encodings, labels=val_labels.tolist())

# Step 4: Define Compute Metrics Function
def compute_metrics(p):
    """Compute accuracy for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores; argmax is taken
            along axis 1) and ``label_ids`` (the reference labels).

    Returns:
        A dict with the single key ``'accuracy'``.
    """
    # The highest-scoring class per example is the predicted label.
    predicted_classes = np.argmax(p.predictions, axis=1)
    accuracy = accuracy_score(p.label_ids, predicted_classes)
    return {'accuracy': accuracy}

# Step 5: Fine-Tune BERT Model
# The classification head gets one output per entry in label_mapping.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_mapping),
)

# Training configuration: checkpoints under ./results, logs under ./logs,
# evaluation on the validation set at the end of every epoch.
# NOTE(review): newer transformers releases may spell `evaluation_strategy`
# as `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy="epoch",
)

# compute_metrics supplies the accuracy reported alongside the loss.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Step 6: Train the Model
trainer.train()

# Step 7: Evaluate the Model
eval_results = trainer.evaluate()
validation_accuracy = eval_results['eval_accuracy']
validation_loss = eval_results['eval_loss']
print(f"Validation Accuracy: {validation_accuracy}")
print(f"Validation Loss: {validation_loss}")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.057487,0.987845
2,0.316000,0.048661,0.99116
3,0.053400,0.047381,0.99337


Validation Accuracy: 0.9933701657458563
Validation Loss: 0.04738080874085426


In [None]:
# Function to predict the label for a given input
def predict(input_text, classifier=None, text_tokenizer=None, labels=None):
    """Classify one sentence and return its sentence-type name.

    Args:
        input_text: The sentence to classify.
        classifier: Optional model to use; defaults to the notebook-level
            ``model``. Must be a ``torch.nn.Module`` whose output exposes
            ``.logits``.
        text_tokenizer: Optional tokenizer callable; defaults to the
            notebook-level ``tokenizer``.
        labels: Optional ``{name: class_index}`` mapping; defaults to the
            notebook-level ``label_mapping``.

    Returns:
        The name whose class index has the highest logit.

    Raises:
        NameError: If an argument is omitted and the corresponding
            notebook-level object has not been defined yet (run the
            tokenizer/model cells first).
    """
    # Fall back to the notebook-level objects only when no override is given.
    classifier = model if classifier is None else classifier
    text_tokenizer = tokenizer if text_tokenizer is None else text_tokenizer
    labels = label_mapping if labels is None else labels

    # Tokenize the input text
    inputs = text_tokenizer(input_text, return_tensors='pt', truncation=True, padding=True, max_length=512)

    # Move inputs to the device the model actually lives on. Choosing CUDA
    # merely because it is available breaks when the model is still on the CPU.
    first_param = next(classifier.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Run inference in eval mode (so train-only layers such as dropout are
    # inactive) and restore the previous mode afterwards.
    was_training = classifier.training
    classifier.eval()
    try:
        with torch.no_grad():
            logits = classifier(**inputs).logits
            preds = torch.argmax(logits, dim=1).item()  # Get the predicted label
    finally:
        classifier.train(was_training)

    # Map the predicted class index back to its name
    reverse_label_mapping = {v: k for k, v in labels.items()}
    return reverse_label_mapping[preds]

# Try the classifier on a sentence of your choice.
input_text = "What time does the meeting start?"
predicted_label = predict(input_text=input_text)
print(f"Predicted Label: {predicted_label}")

NameError: name 'tokenizer' is not defined