In [48]:
import json

# Path to your .jsonl file
file_path = '/Users/kowsalya/Downloads/admin.jsonl'

data_list = []

# Read the first line of the file
with open(file_path, 'r') as file:
    for line in file:  # This iterates over each line until the end of the file
        line = line.strip()
        if line:  # Check if line is not empty
            line_data = json.loads(line) # Parse the JSON content
            data_list.append(line_data)

In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag

# Function to tokenize essay into sentences with start and end char offsets
def sentence_tokenize_with_offsets(text):
    sentences = sent_tokenize(text)
    offsets = []
    start = 0
    for sentence in sentences:
        start = text.find(sentence, start)
        end = start + len(sentence)
        offsets.append((start, end))
        start = end
    return sentences, offsets

# Function to tokenize sentences into words with their global char offsets
def word_tokenize_with_global_offsets(sentences, sentence_offsets, text):
    global_word_offsets = []
    for sentence, (sent_start, sent_end) in zip(sentences, sentence_offsets):
        words = word_tokenize(sentence)
        start = sent_start
        for word in words:
            # Find the start of the word within the essay
            start = text.find(word, start)
            end = start + len(word)
            global_word_offsets.append((start, end))
            start = end
    return global_word_offsets

# Apply labels based on character offsets to tokens
def apply_labels(tokens_offsets, labels):
    tagged_tokens = []
    for start, end in tokens_offsets:
        label = "O"  # Default label
        for l_start, l_end, l_label in labels:
            if start >= l_start and end <= l_end:
                if start == l_start:
                    label = "B-" + l_label
                else:
                    label = "I-" + l_label
                break
        tagged_tokens.append(label)
    return tagged_tokens

In [2]:
def extract_features(prepared_data, index):
    """
    Extract features for a token at the given index in the prepared data.
    
    :param prepared_data: List of tuples [(token, pos, iob), ...]
    :param index: Index of the current token in the prepared data
    :return: Dictionary of features
    """
    # Current token, POS tag, and IOB tag
    token, pos_tag, _ = prepared_data[index]
    
    # Features from the current token
    features = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'pos': pos_tag,  # Current POS tag
    }
    
    # Features from the previous token
    if index > 0:
        prev_token, prev_pos_tag, _ = prepared_data[index - 1]
        features.update({
            '-1:word.lower()': prev_token.lower(),
            '-1:word.isupper()': prev_token.isupper(),
            '-1:word.istitle()': prev_token.istitle(),
            '-1:pos': prev_pos_tag,
        })
    else:
        # Indicate that it's the start of a sentence/document
        features['BOS'] = True

    # Features from the next token
    if index < len(prepared_data) - 1:
        next_token, next_pos_tag, _ = prepared_data[index + 1]
        features.update({
            '+1:word.lower()': next_token.lower(),
            '+1:word.isupper()': next_token.isupper(),
            '+1:word.istitle()': next_token.istitle(),
            '+1:pos': next_pos_tag,
        })
    else:
        # Indicate that it's the end of a sentence/document
        features['EOS'] = True

    return features

In [51]:
all_prepared_data = []  # This will hold the prepared_data for all documents

for data in data_list:
    # Process the essay
    sentences, sentence_offsets = sentence_tokenize_with_offsets(data['text'])
    tokens_offsets = word_tokenize_with_global_offsets(sentences, sentence_offsets, data['text'])
    labels = apply_labels(tokens_offsets, data['label'])

    # Assuming you now want to combine tokens, POS tags, and IOB tags
    tokens = [data['text'][start:end] for start, end in tokens_offsets]
    pos_tags = pos_tag(tokens)
    prepared_data = [(token, pos, label) for ((token, pos), label) in zip(pos_tags, labels)]
    all_prepared_data.append(prepared_data)

In [52]:
all_featuresets = []  # This will hold the featuresets for all documents

for prepared_data in all_prepared_data:
    featuresets = []
    for i in range(len(prepared_data)):
        features = extract_features(prepared_data, i)
        label = prepared_data[i][2]  # The IOB tag is the third element in the tuple
        featuresets.append((features, label))
    all_featuresets.append(featuresets)

In [53]:
import random

# Shuffle the data to ensure random distribution
random.shuffle(all_featuresets)

# Calculate split indices
total_documents = len(all_featuresets)
train_end = int(total_documents * 0.6)
validation_end = train_end + int(total_documents * 0.2)

# Split the data
train_data = all_featuresets[:train_end]
validation_data = all_featuresets[train_end:validation_end]
test_data = all_featuresets[validation_end:]

In [None]:
from nltk.classify import MaxentClassifier

# Flatten the training data if it's a list of lists
train_data_flat = [item for sublist in train_data for item in sublist]

# Train the model
classifier = MaxentClassifier.train(train_data_flat, 'IIS', trace=0, max_iter=10)


In [1]:
# Flatten the validation and test data if they're lists of lists
validation_data_flat = [item for sublist in validation_data for item in sublist]
test_data_flat = [item for sublist in test_data for item in sublist]

# Evaluate on the validation set
validation_accuracy = nltk.classify.accuracy(classifier, validation_data_flat)
print(f"Validation Accuracy: {validation_accuracy}")

# Evaluate on the test set
test_accuracy = nltk.classify.accuracy(classifier, test_data_flat)
print(f"Test Accuracy: {test_accuracy}")

Validation Accuracy: 0.4088644609303081
Test Accuracy: 0.417397962055959


In [None]:
from nltk.classify import MaxentClassifier
import nltk

# Assuming train_set and validation_set are already defined
performance_records = {}

# Range of `max_iter` values to try
max_iter_options = [10, 50, 100, 200]

for max_iter in max_iter_options:
    print(f"Training model with max_iter={max_iter}")
    classifier = MaxentClassifier.train(train_set, 'IIS', trace=0, max_iter=max_iter)
    
    # Evaluate on the validation set
    validation_accuracy = nltk.classify.accuracy(classifier, validation_set)
    print(f"Validation Accuracy for max_iter={max_iter}: {validation_accuracy}")
    
    # Record the performance
    performance_records[max_iter] = validation_accuracy

# Identify the best `max_iter` based on validation performance
best_max_iter = max(performance_records, key=performance_records.get)
best_accuracy = performance_records[best_max_iter]
print(f"Best max_iter: {best_max_iter} with Validation Accuracy: {best_accuracy}")

# Retrain the model with the best `max_iter`
best_classifier = MaxentClassifier.train(train_set, 'IIS', trace=0, max_iter=best_max_iter)
