In [None]:
import pandas as pd

# Read the complex-sentence CSV, rename its text column to the shared
# 'Sentences' name, and tag every row with its class.
data_complex = (
    pd.read_csv('/content/unprocessed_data/Complex Sentences.csv')
    .rename(columns={'Complex Sentences': 'Sentences'})
    .assign(sentence_type='complex')
)
data_complex.head()

Unnamed: 0,Sentences,sentence_type
0,"Although it was raining, we decided to go for ...",complex
1,"When the sun sets, the sky turns a beautiful s...",complex
2,"Because she studied hard, she passed the exam ...",complex
3,"If you want to succeed, you must be willing to...",complex
4,"While the children played outside, the adults ...",complex


In [None]:
# Read the compound-sentence CSV, rename its text column to the shared
# 'Sentences' name, and tag every row with its class.
data_compound = (
    pd.read_csv('/content/unprocessed_data/Compound_sentences.csv')
    .rename(columns={'Compound Sentences': 'Sentences'})
    .assign(sentence_type='compound')
)
data_compound.head()

Unnamed: 0,Sentences,sentence_type
0,"The musician practiced every day, for he wante...",compound
1,"She stayed late at the office, for she had a m...",compound
2,"He saved diligently, for he dreamed of buying ...",compound
3,"The athlete trained hard, for the championship...",compound
4,"The teacher prepared detailed lesson plans, fo...",compound


In [None]:
# Read the compound-complex CSV, rename its text column to the shared
# 'Sentences' name, and tag every row with its class.
data_compound_complex = (
    pd.read_csv('/content/unprocessed_data/Complex_Compound_Sentences.csv')
    .rename(columns={'Compound-Complex Sentences': 'Sentences'})
    .assign(sentence_type='compound-complex')
)
data_compound_complex.head()

Unnamed: 0,Sentences,sentence_type
0,"If you need help moving, just call Mark, and h...",compound-complex
1,"I cherish our friendship, but it can sometimes...",compound-complex
2,"We planned a surprise party for Emma, so she w...",compound-complex
3,"Since Mia moved to another city, we catch up e...",compound-complex
4,"While I packed my bags, I realized I forgot my...",compound-complex


In [None]:
# Read the simple-sentence JSON file into a DataFrame, rename its text column
# to the shared 'Sentences' name, and tag every row with its class.
data_simple = (
    pd.read_json('/content/Simple_Sentences.json')
    .rename(columns={'sentence': 'Sentences'})
    .assign(sentence_type='simple')
)
data_simple.head()

Unnamed: 0,Sentences,sentence_type
0,She runs every morning.,simple
1,The kids and their parents built a treehouse a...,simple
10,The tourist and guide visited the museum and a...,simple
100,The singer and guitarist performed the song an...,simple
101,Isabella loves gardening.,simple


In [None]:
# Combine the four per-class dataframes into one
dataset = pd.concat([data_complex, data_compound, data_compound_complex, data_simple])

# Shuffle the rows so the classes are interleaved. The fixed random_state makes
# the ordering repeatable across runs (Restart & Run All gives the same frame);
# reset_index(drop=True) replaces the per-source index values with 0..n-1.
dataset = dataset.sample(frac=1, random_state=42).reset_index(drop=True)

# Display the first few rows of the shuffled dataframe
dataset.head()

Unnamed: 0,Sentences,sentence_type
0,"When the sun set, we lit a fire at the beach i...",complex
1,"When the meeting started at the CDC, everyone ...",compound-complex
2,"While painting in the park, I lost track of ti...",compound-complex
3,"She was nervous about the interview, yet she a...",compound
4,You can inquire if you have questions.,complex


In [None]:
# Number of rows per sentence class, to check how balanced the combined data is
dataset.value_counts(subset='sentence_type')

Unnamed: 0_level_0,count
sentence_type,Unnamed: 1_level_1
complex,1094
compound-complex,1060
compound,1008
simple,1000


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset

# Step 2: Preprocess Data
# Encode the sentence-structure labels (simple, compound, complex, compound-complex) into integers
label_mapping = {'simple': 0, 'compound': 1, 'complex': 2, 'compound-complex': 3}
dataset['label'] = dataset['sentence_type'].map(label_mapping)

# Series.map produces NaN for any value that is not a key of label_mapping.
# Stop here with a clear message rather than letting NaN labels reach training.
unmapped_types = dataset.loc[dataset['label'].isna(), 'sentence_type'].unique()
if len(unmapped_types) > 0:
    raise ValueError(f"sentence_type values missing from label_mapping: {unmapped_types.tolist()}")

# Split data into training and validation sets (80% / 20%).
# stratify keeps the class proportions the same in both splits, and the fixed
# random_state makes the split repeatable from run to run.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    dataset['Sentences'],
    dataset['label'],
    test_size=0.2,
    stratify=dataset['label'],
    random_state=42,
)

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize both splits (the tokenizer expects plain lists of strings)
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score
# Step 3: Define Custom Dataset Class
class CustomDataset(Dataset):
    """Dataset that pairs tokenizer encodings with integer class labels.

    `encodings` must expose `.items()` yielding (field name, indexable sequence)
    pairs; `labels` is an indexable sequence with one entry per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is taken from the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample `idx` as a dict of tensors, with the target under 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # The label is stored as a long tensor.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Wrap each split's encodings and labels (converted to a plain list) in a CustomDataset.
train_dataset, val_dataset = (
    CustomDataset(split_encodings, split_labels.tolist())
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
    )
)

# Step 4: Define Compute Metrics Function
def compute_metrics(p):
    """Compute accuracy for an evaluation result.

    `p` must provide `predictions` (per-class scores, one row per sample) and
    `label_ids` (the reference labels). The predicted class is the argmax along
    axis 1. Returns a dict with the single key 'accuracy'.
    """
    scores, references = p.predictions, p.label_ids
    predicted_classes = np.argmax(scores, axis=1)
    accuracy = accuracy_score(references, predicted_classes)
    return {'accuracy': accuracy}

# Step 5: Fine-Tune BERT Model
import inspect

# One output class per entry in label_mapping
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_mapping))

# Newer transformers releases rename `evaluation_strategy` to `eval_strategy`.
# Pick whichever keyword the installed TrainingArguments accepts so this cell
# runs on both older and newer versions.
_eval_strategy_arg = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments.__init__).parameters
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    # Log the training loss once per epoch so every row of the per-epoch results
    # table has a value (the recorded run showed "No log" for epoch 1).
    logging_strategy="epoch",
    # Evaluate on the validation set at the end of every epoch
    **{_eval_strategy_arg: "epoch"},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics  # Adds 'accuracy' to the evaluation results
)

# Step 6: Train the Model
trainer.train()

# Step 7: Evaluate the Model
# compute_metrics returns 'accuracy'; it is read back here as 'eval_accuracy'
eval_results = trainer.evaluate()
print(f"Validation Accuracy: {eval_results['eval_accuracy']}")
print(f"Validation Loss: {eval_results['eval_loss']}")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.003769,0.9988
2,0.356500,0.011911,0.996399
3,0.006000,0.000737,1.0


Validation Accuracy: 1.0
Validation Loss: 0.0007370041566900909


In [None]:
output_dir = '/content/sentence_structure_model'  # Directory that receives the fine-tuned model
trainer.save_model(output_dir)

# The Trainer above was created without a tokenizer, so also write the tokenizer
# files into the same directory; the saved folder can then be loaded for
# inference without fetching the tokenizer separately.
tokenizer.save_pretrained(output_dir)

In [None]:
import shutil

from google.colab import files

# Zip the saved model directory, then hand the resulting archive to the
# browser for download. make_archive returns the path of the file it wrote.
archive_path = shutil.make_archive('sentence_structure_model', 'zip', '/content/sentence_structure_model')
files.download(archive_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Function to predict the label for a given input
def predict(input_text):
    """Classify one sentence and return its sentence-structure label.

    Uses the module-level `tokenizer`, `model` and `label_mapping`.

    Args:
        input_text: The sentence to classify. The `.item()` call below requires
            exactly one prediction, so pass a single sentence.

    Returns:
        The key of `label_mapping` whose integer id has the highest logit.
    """
    # Inference only: put the model in evaluation mode so layers such as
    # dropout behave deterministically.
    model.eval()

    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors='pt', truncation=True, padding=True, max_length=512)

    # Move the inputs to the device the model's weights are actually on.
    # Choosing by torch.cuda.is_available() fails when the model is on the CPU
    # of a CUDA-capable machine.
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Make the prediction without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).item()  # Index of the highest-scoring class

    # Map the predicted integer id back to its label name
    reverse_label_mapping = {v: k for k, v in label_mapping.items()}
    predicted_label = reverse_label_mapping[preds]

    return predicted_label

# Test with your own input: classify one hand-written sentence with predict()
# and print the label it returns.
input_text = "It is during our darkest moments that we must focus to see the light."
predicted_label = predict(input_text)
print(f"Predicted Label: {predicted_label}")

Predicted Label: complex
