In [None]:
import pdfplumber

def pdf_to_text(pdf_path, txt_path):
    """Dump the extractable text of a PDF into a UTF-8 text file.

    Every page that yields text is written as a ``--- Page N ---`` header
    (1-based page number) followed by the page text and a newline. Pages for
    which ``extract_text()`` returns nothing are skipped. A confirmation line
    is printed on success; any exception is caught and reported on stdout as
    ``Error: ...`` instead of being raised.

    Args:
        pdf_path: Path of the PDF to read.
        txt_path: Path of the text file to create or overwrite.
    """
    try:
        # The PDF is opened first, so a failure to open it never touches txt_path.
        with pdfplumber.open(pdf_path) as document, \
                open(txt_path, 'w', encoding='utf-8') as sink:
            for number, page in enumerate(document.pages, start=1):
                content = page.extract_text()
                if not content:
                    # Nothing extractable on this page: emit no header for it.
                    continue
                sink.writelines((f'--- Page {number} ---\n', content, '\n'))

        print(f"Successfully converted '{pdf_path}' to '{txt_path}'")
    except Exception as e:
        print(f"Error: {e}")

# Example usage: convert the source PDF into a plain-text file.
# Replace with your PDF file path.
pdf_path = '../../data/static/intl_mil_ops_in_21st_cent.pdf'
# Replace with your desired output text file path.
txt_path = 'output.txt'
pdf_to_text(pdf_path=pdf_path, txt_path=txt_path)


In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import load_dataset

# Create the tokenizer and model that the following training cell fine-tunes.
# They must exist before that cell runs; saving the fine-tuned weights is
# done by a dedicated cell after training, not here.
# NOTE(review): the base checkpoint "gpt2" is assumed from the
# "./retrained_gpt2_*" output directory names — confirm the intended checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")


In [None]:
# Reuse the end-of-sequence token as the padding token so that
# padding='max_length' below has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token

# Load the training text with the datasets 'text' loader.
dataset = load_dataset('text', data_files={'train': '../../data/static/artofwar.txt'})


def preprocess_function(examples):
    """Tokenize text and build language-modelling labels that skip padding.

    Args:
        examples: Mapping with a 'text' entry, either a list of strings
            (batched ``map``) or a single string.

    Returns:
        The tokenizer output with an added 'labels' entry: a copy of
        'input_ids' in which every padded position (attention mask 0) is
        replaced by -100 so that it does not contribute to the loss.
    """
    tokenized = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)
    input_ids = tokenized['input_ids']
    attention_mask = tokenized['attention_mask']

    def mask_padding(ids, mask):
        # Mask by attention mask, not by token id: the pad token is the EOS
        # token here, and a genuine EOS inside the text must stay a target.
        return [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]

    if input_ids and isinstance(input_ids[0], int):
        # Un-batched call: a single flat sequence.
        tokenized['labels'] = mask_padding(input_ids, attention_mask)
    else:
        tokenized['labels'] = [
            mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)
        ]
    return tokenized


tokenized_dataset = dataset.map(preprocess_function, batched=True)

# A causal LM predicts token t+1 from token t, so an example with fewer than
# two real tokens (e.g. a blank line) has no target left once padding is
# masked. Drop those examples so that no batch ends up without any target.
tokenized_dataset = tokenized_dataset.filter(
    lambda example: sum(example['attention_mask']) >= 2
)

# Set up training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=10_000,
    output_dir='./results'
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train']
)

# Start training
trainer.train()


In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset

# Initialize the tokenizer and model from the pretrained "t5-small" checkpoint.
# NOTE(review): this rebinds the notebook-wide `tokenizer` and `model` names;
# any later cell that uses them (e.g. one saving to "./retrained_gpt2_*")
# will see these T5 objects — confirm that is intended.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Load the training text from 'output.txt' (the same file name the
# PDF-extraction example writes to) as the 'train' split.
dataset = load_dataset('text', data_files={'train': 'output.txt'})

def preprocess_function(examples):
    """Tokenize text and build labels that do not train on padding.

    Args:
        examples: Mapping with a 'text' entry, either a list of strings
            (batched ``map``) or a single string.

    Returns:
        The tokenizer output with an added 'labels' entry: a copy of
        'input_ids' in which every padded position (attention mask 0) is
        replaced by -100 so that it does not contribute to the loss.
    """
    tokenized = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)
    input_ids = tokenized['input_ids']
    attention_mask = tokenized['attention_mask']

    def mask_padding(ids, mask):
        # Every sequence is padded to 512 tokens; without masking, most of
        # the loss would come from predicting pad tokens.
        return [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]

    if input_ids and isinstance(input_ids[0], int):
        # Un-batched call: a single flat sequence.
        tokenized['labels'] = mask_padding(input_ids, attention_mask)
    else:
        tokenized['labels'] = [
            mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)
        ]
    return tokenized

# Tokenize the whole dataset in batches.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Run configuration: where artefacts go, how long to train, how often to
# log and checkpoint.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    logging_steps=10,
    save_steps=10_000,
)

# Bind the model, the configuration and the training split together.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_dataset['train'],
)

# Run the fine-tuning loop.
trainer.train()

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Save the fine-tuned model and tokenizer to local directories so they can be
# reloaded later with from_pretrained().
# NOTE(review): this saves whatever `model` / `tokenizer` currently refer to.
# When the notebook is run top to bottom, the preceding cell binds them to
# T5 objects, while the target directories are named "gpt2" — confirm which
# model is meant to be saved here.
model.save_pretrained("./retrained_gpt2_model")
tokenizer.save_pretrained("./retrained_gpt2_tokenizer")


In [None]:
# Load the fine-tuned model and tokenizer from the directories written by the
# save step.
tokenizer = GPT2Tokenizer.from_pretrained("./retrained_gpt2_tokenizer")
model = GPT2LMHeadModel.from_pretrained("./retrained_gpt2_model")

# Add a PAD token if the saved tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # A token added to the vocabulary has no row in the model's embedding
    # matrix yet; resize it so the new id is valid input for the model.
    model.resize_token_embeddings(len(tokenizer))
    

In [None]:

# Mission parameters that are interpolated into the prompt below.
available_vehicles = "foot, land vehicle, helicopter, boat"
# Example coordinates for Los Angeles
starting_location = "lat: 34.0522, long: -118.2437"
# Example coordinates for Las Vegas
target_location = "lat: 36.1699, long: -115.1398"
total_personnel = 10
target_time_on_objective = "2 hours"
strategy = "stealth"
objective = "infiltrate target"
expected_resistance = "high"

# One instruction line, one bullet per parameter, a blank line, then the
# "plan:" cue that the model is expected to continue from.
prompt = "\n".join((
    "Generate a detailed mission plan based on the following details:",
    f"- Available vehicles: {available_vehicles}",
    f"- Starting location: {starting_location}",
    f"- Target location: {target_location}",
    f"- Total personnel: {total_personnel}",
    f"- Target time on objective: {target_time_on_objective}",
    f"- Strategy: {strategy}",
    f"- Objective: {objective}",
    f"- Expected resistance: {expected_resistance}",
    "",
    "plan:",
))

# Encode the prompt as PyTorch tensors; the returned attention mask is passed
# to generate() explicitly below.
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)

# Sampling configuration for the continuation.
# NOTE(review): sampling is enabled and no random seed is set in this cell,
# so the generated text will presumably differ between runs.
generation_settings = {
    "max_new_tokens": 300,       # upper bound on newly generated tokens
    "do_sample": True,           # sample instead of greedy decoding
    "temperature": 0.65,         # lower temperature for more coherent text
    "top_p": 0.85,               # nucleus sampling (slightly lower for more focus)
    "repetition_penalty": 1.2,   # penalty to discourage repetition
}

outputs = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    **generation_settings,
)

# Decode the first (only) returned sequence and show it without special tokens.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)
