# Environment Setup

In [None]:
! pip install transformers
! pip install torch


# Libraries


In [34]:
import os
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling, AutoModel


In [26]:
import zipfile
import os

def extract_zip_file(zip_file_path, extract_to_folder):
    """Unpack every member of a zip archive into a folder and report where.

    Args:
        zip_file_path: Path of the archive, opened in read mode.
        extract_to_folder: Directory that receives the extracted members.
    """
    archive = zipfile.ZipFile(zip_file_path, mode='r')
    try:
        archive.extractall(path=extract_to_folder)
    finally:
        # Always release the archive handle, even if extraction fails.
        archive.close()
    print(f"Extracted files to {extract_to_folder}")

# Example usage: unpack the archive of job-application texts.
# Both paths are hard-coded absolute locations under /content
# (presumably a Colab workspace -- adjust for other environments).
zip_file_path = '/content/job_applications.zip'  # Update this path as needed
extract_to_folder = '/content/job_applications'
extract_zip_file(zip_file_path, extract_to_folder)


Extracted files to /content/job_applications


In [27]:
def load_and_merge_text_files(folder_path, delimiter="\n---\n"):
    """Read every ``.txt`` file in a folder and join the contents into one string.

    Files are visited in sorted filename order so the merged text is identical
    on every run (``os.listdir`` returns entries in arbitrary order). Entries
    that are not regular files, and files that are empty after stripping
    whitespace, are skipped so no empty document is emitted between delimiters.

    Args:
        folder_path: Directory to scan (top level only, not recursive).
        delimiter: String placed between consecutive documents.

    Returns:
        The stripped file contents joined by ``delimiter``; an empty string
        when no usable ``.txt`` file is found.

    Raises:
        OSError: If ``folder_path`` cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    all_texts = []
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(folder_path, filename)
        # A directory may also carry a ".txt" suffix; open() would fail on it.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read().strip()
        if text:
            all_texts.append(text)
    return delimiter.join(all_texts)

# Example usage: merge the extracted .txt files into a single string.
# `merged_dataset` is a module-level name that a later cell writes to disk.
extracted_folder = '/content/job_applications'
merged_dataset = load_and_merge_text_files(extracted_folder)


In [28]:
def save_merged_dataset(merged_text, output_file_path):
    """Write the merged corpus to disk as UTF-8 and print a confirmation.

    Args:
        merged_text: Text to store; the file is opened in write mode, so any
            existing content is replaced.
        output_file_path: Destination path of the text file.
    """
    with open(output_file_path, mode='w', encoding='utf-8') as sink:
        sink.write(merged_text)

    confirmation = f"Saved merged dataset to {output_file_path}"
    print(confirmation)

# Example usage: persist the merged text to a single file.
# `output_file_path` is reused by a later cell as the training-data path.
output_file_path = '/content/merged_job_applications.txt'
save_merged_dataset(merged_dataset, output_file_path)


Saved merged dataset to /content/merged_job_applications.txt


In [29]:
from transformers import TextDataset, GPT2Tokenizer

def load_training_dataset(tokenizer, file_path, block_size=128):
    """Build a ``TextDataset`` over a text file for language-model training.

    Args:
        tokenizer: Tokenizer handed to ``TextDataset``.
        file_path: Path of the text file to load.
        block_size: Forwarded to ``TextDataset`` as ``block_size``
            (default 128).

    Returns:
        The constructed ``TextDataset``.
    """
    # NOTE(review): TextDataset is reported as deprecated in recent
    # transformers releases -- confirm against the installed version.
    dataset_options = {
        "tokenizer": tokenizer,
        "file_path": file_path,
        "block_size": block_size,
    }
    return TextDataset(**dataset_options)

# Example usage: load the tokenizer of the "gpt2" checkpoint and build the
# training dataset from the merged text file (block_size keeps its default of 128).
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
training_dataset = load_training_dataset(tokenizer, output_file_path)




In [36]:
import os
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling, GPT2Tokenizer

def fine_tune_gpt2_model(
    model_name,
    train_dataset,
    output_dir="./gpt2_finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
):
    """Fine-tune a GPT-2 checkpoint on ``train_dataset`` and save the result.

    The model and a tokenizer are both loaded from ``model_name``; after
    training, both are written to ``output_dir`` so the directory can be
    passed to ``from_pretrained`` later.

    Args:
        model_name: Checkpoint name or path given to ``from_pretrained``.
        train_dataset: Dataset handed to ``Trainer`` as ``train_dataset``.
        output_dir: Directory for checkpoints and the final model/tokenizer.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments``.
        save_total_limit: Forwarded to ``TrainingArguments``.

    Returns:
        None. Results are written to ``output_dir`` and a confirmation is
        printed.
    """
    # Disable wandb logging (side effect: sets a process-wide environment variable)
    os.environ["WANDB_DISABLED"] = "true"

    # Load the model and tokenizer
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)

    # Prepare the data collator; mlm=False selects causal (not masked) LM batches
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

    # Training arguments; hyperparameters come from the keyword arguments so
    # callers can tune them without editing this function
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        save_steps=save_steps,
        save_total_limit=save_total_limit,
        prediction_loss_only=True,
        report_to="none"  # Disable reporting to wandb or any other tracking tool
    )

    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset
    )

    # Fine-tune the model
    trainer.train()

    # Save the fine-tuned model together with its tokenizer
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Model fine-tuned and saved at {output_dir}")


In [37]:
# Fine-tune the "gpt2" checkpoint on the dataset built in an earlier cell.
# output_dir is left at the function default ("./gpt2_finetuned").
model_name = "gpt2"
fine_tune_gpt2_model(model_name, training_dataset)




Step,Training Loss


Model fine-tuned and saved at ./gpt2_finetuned


In [38]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Define the path to the fine-tuned model directory
# (same value as the default output_dir of fine_tune_gpt2_model).
model_path = "./gpt2_finetuned"

# Load the fine-tuned model and tokenizer.
# Both are module-level names that generate_email reads directly;
# `tokenizer` rebinds the name used earlier for the base "gpt2" tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Set the model to evaluation mode before generating
model.eval()

def generate_email(prompt, max_length=200, max_lines=6):
    """Generate one email that continues ``prompt`` using the fine-tuned model.

    Relies on the module-level ``tokenizer`` and ``model``.

    Args:
        prompt: Non-empty text the generated email starts with.
        max_length: Passed to ``model.generate`` as ``max_length``.
        max_lines: Maximum number of non-empty lines kept from the generated
            text (default 6).

    Returns:
        Up to ``max_lines`` stripped, non-empty lines of the decoded output,
        separated by blank lines.

    Raises:
        ValueError: If ``prompt`` is empty.
    """
    # An empty prompt encodes to zero tokens and fails obscurely inside
    # generate(); reject it up front with a clear message.
    if not prompt:
        raise ValueError("prompt must be a non-empty string")

    # Encode the input prompt and keep the tensors on the model's device so
    # the function also works after the model has been moved to a GPU.
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    attention_mask = torch.ones_like(input_ids)

    # Generate text with adjusted parameters.
    # early_stopping is intentionally not passed: it only affects beam search,
    # and this call samples with a single beam.
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            num_return_sequences=1,  # Generate only one email
            no_repeat_ngram_size=3,  # Avoid repeated phrases
            do_sample=True,
            top_k=40,    # Limit token selection to top-k options
            top_p=0.75,  # Narrow down token probability selection
            temperature=0.6,  # Lower temperature for less randomness
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode the generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Post-processing: drop blank lines, trim whitespace, cap the line count
    lines = [line.strip() for line in generated_text.split("\n") if line.strip()]
    cleaned_text = "\n\n".join(lines[:max_lines])

    return cleaned_text


# Generating New Text

In [46]:
# Generate a sample email from a subject-line prompt.
# generate_email samples (do_sample=True) and no seed is set in this notebook,
# so the output is expected to vary between runs.
prompt_text = "Subject: Application for the Machine Learning Position"
generated_email = generate_email(prompt_text, max_length=250)

print(f"Generated Email:\n\n\n{generated_email}")


Generated Email:


Subject: Application for the Machine Learning Position

Dear Hiring Manager,

I am applying for the Computer Science position at Microsoft Research. I am a passionate computer science student and have developed a strong analytical skills that have led me to a successful career in data analysis and machine learning. I believe that my analytical skills and analytical skills are valuable to my team, and I look forward to your consideration.

Sincerely,

Michael H. Williams

(555) 567-2389
