In [1]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
import torch
import yaml
from dotenv import load_dotenv
import chromadb
import pandas as pd
import os
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device}.")

Using cuda.


# splits = {'train': 'data/train-00000-of-00001-b1700331af6d3576.parquet', 'test': 'data/test-00000-of-00001-460abe60f17dbc1c.parquet'}
# df = pd.read_parquet("hf://datasets/ShashiVish/cover-letter-dataset/" + splits["train"])

In [4]:
df = pd.read_csv("./dataset/cover_letter_dataset.csv")

In [5]:
df

Unnamed: 0,id,Job Title,Preferred Qualifications,Hiring Company,Applicant Name,Past Working Experience,Current Working Experience,Skillsets,Qualifications,Cover Letter
0,0,Senior Java Developer,5+ years of experience in Java Development,Google,John Doe,Java Developer at XYZ for 3 years,Senior Java Developer at ABC for 2 years,"Java, Spring Boot, Hibernate, SQL",BSc in Computer Science,I am writing to express my interest in the Sen...
1,1,Data Scientist,5 years of experience in data modeling,XYZ Analytics Solutions,John Smith,Data Analyst at ABC Corporation,Data Scientist at XYZ Technologies,"Data modeling, data analysis, programming (Pyt...","Bachelor's degree in Computer Science, Master'...","Dear Hiring Manager,\n\nI am writing to expres..."
2,2,Data Scientist,Experience with Python and proficiency in at l...,XYZ Analytics,John Smith,Data Analyst at ABC Corporation,Data Scientist at XYZ Solutions,"Python, R, SQL, Machine Learning, Statistical ...","Bachelor's degree in Computer Science, Master'...","Dear Hiring Manager,\n\nI am writing to expres..."
3,3,Senior Data Scientist,Minimum of 3 years validated experience\nOutst...,XYZ Tech Solutions,John Smith,Data Analyst at ABC Company,Senior Data Scientist at DEF Corporation,"Python, machine learning libraries, deep learn...","Bachelor's degree in Computer Science, Master'...","Dear Hiring Manager,\n\nI am writing to expres..."
4,4,data scientist,35 years+ professional experience in data sci...,xyz tech solutions,john smith,data analyst at abc company,data scientist at xyz tech solutions,"Python, R, Machine Learning, NLP, Data Visuali...",35 years+ professional experience in data sci...,"Dear Hiring Manager,\n\nI am writing to expres..."
...,...,...,...,...,...,...,...,...,...,...
808,808,Senior Support Engineer,"5+ years of customer support, ITIL",IBM,David Davis,Support Engineer at XYZ for 2 years,Senior Support Engineer at ABC for 3 years,"Customer Support, ITIL, Troubleshooting, SQL, ...",Bachelor's in Information Technology,I am applying for the Senior Support Engineer ...
809,809,Network Engineer,Experience with VoIP technologies and network ...,Apple,Edward Hernandez,Network Engineer at Google (5 years),Senior Network Engineer at Amazon (3 years),"VoIP Technologies, Network Security, Cisco, Ju...",BSc in Network Engineering,I am applying for the Network Engineer positio...
810,810,Mobile App Developer,Master's degree in Computer Science or related...,XYZ Company,Jennifer Wilson,5 years of mobile app development experience,4 years of mobile app development experience,"iOS, Android, Swift, Java, Kotlin","Excellent communication skills, ability to wor...","Dear Hiring Manager, I am writing to express m..."
811,811,Data Scientist,"University degree in engineering, data, or re...",XYZ Corporation,John Smith,Data Analyst at ABC Company,Data Scientist at DEF Company,"Data analysis, machine learning, programming (...","Bachelor's degree in Computer Science, PMP Cer...","Dear Hiring Manager,\n\nI am writing to expres..."


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 813 entries, 0 to 812
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   id                          813 non-null    int64 
 1   Job Title                   813 non-null    object
 2   Preferred Qualifications    811 non-null    object
 3   Hiring Company              813 non-null    object
 4   Applicant Name              813 non-null    object
 5   Past Working Experience     813 non-null    object
 6   Current Working Experience  812 non-null    object
 7   Skillsets                   806 non-null    object
 8   Qualifications              805 non-null    object
 9   Cover Letter                813 non-null    object
dtypes: int64(1), object(9)
memory usage: 63.6+ KB


In [7]:
df.describe()

Unnamed: 0,id
count,813.0
mean,406.0
std,234.837178
min,0.0
25%,203.0
50%,406.0
75%,609.0
max,812.0


In [8]:
df.isna().sum()

id                            0
Job Title                     0
Preferred Qualifications      2
Hiring Company                0
Applicant Name                0
Past Working Experience       0
Current Working Experience    1
Skillsets                     7
Qualifications                8
Cover Letter                  0
dtype: int64

In [9]:
import json

dataset_json = df.to_dict(orient="records")

In [10]:
try:
    if not os.path.exists("./dataset"):
        os.makedirs("./dataset")
    with open("dataset/cover_letter_dataset.json", "w") as f:
        json.dump(dataset_json, f, indent=4)
    print("Dataset saved as JSON.")
    
except FileNotFoundError as e:
    print(f"Error: {e}")

Dataset saved as JSON.


# Store to ChromaDB

In [11]:
import chromadb
import json

In [12]:
with open("./dataset/cover_letter_dataset.json", "r") as f:
    dataset = json.load(f)

In [13]:
chroma_client = chromadb.PersistentClient(path="./cover_letter_db")

In [14]:
collection = chroma_client.get_or_create_collection(name="cover_letters")

In [15]:
collection

Collection(name=cover_letters)

In [16]:
for i, entry in enumerate(dataset):
    collection.add(
        documents=[entry["Cover Letter"]],
        metadatas=[{
            "job_title" : entry["Job Title"],
            "preferred_qualifications" : entry["Preferred Qualifications"],
            "hiring_company" : entry["Hiring Company"],
            "past_working_exp" : entry["Past Working Experience"],
            "current_working_exp" : entry["Current Working Experience"],
            "skillsets" : entry["Skillsets"],
            "qualifications" : entry["Qualifications"]
        }],
        ids=[f"cover_letter_{i}"]
    )

print("Stored in ChromaDB.")

Add of existing embedding ID: cover_letter_122
Add of existing embedding ID: cover_letter_123
Add of existing embedding ID: cover_letter_124
Add of existing embedding ID: cover_letter_125
Add of existing embedding ID: cover_letter_126
Add of existing embedding ID: cover_letter_127
Add of existing embedding ID: cover_letter_128
Add of existing embedding ID: cover_letter_129
Add of existing embedding ID: cover_letter_130
Add of existing embedding ID: cover_letter_131
Add of existing embedding ID: cover_letter_132
Add of existing embedding ID: cover_letter_133
Add of existing embedding ID: cover_letter_134
Add of existing embedding ID: cover_letter_135
Add of existing embedding ID: cover_letter_136
Add of existing embedding ID: cover_letter_137
Add of existing embedding ID: cover_letter_138
Add of existing embedding ID: cover_letter_139
Add of existing embedding ID: cover_letter_140
Add of existing embedding ID: cover_letter_141
Add of existing embedding ID: cover_letter_142
Add of existi

Stored in ChromaDB.


# Relevant Cover Letters

In [17]:
def retrieve_cover_letter(job_title, company):
    results = collection.query(
        query_texts = [f"{job_title} at {company}"],
        n_results=1
    )

    if results["documents"]:
        return results["documents"][0][0]

    else:
        return None

In [18]:
job_title = "Data Scientist"
company = "Google"
retrieved_letter = retrieve_cover_letter(job_title, company)

print(f"Retrieved Cover Letters:\n{retrieved_letter}")

Retrieved Cover Letters:
I am extremely interested in the Data Scientist position at Google. My experience in data analysis and machine learning at Amazon and Facebook has given me a robust understanding of the field. I am proficient in Python, R, and SQL, and have a strong background in data visualization. I believe my qualifications make me an excellent fit for this position.


In [19]:
from datasets import Dataset
from transformers import AutoTokenizer

In [20]:
model_google = "google/flan-t5-large"

In [21]:
tokenizer = AutoTokenizer.from_pretrained(model_google)
model = AutoModelForSeq2SeqLM.from_pretrained(model_google)
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=1024, out_features=2816, bias=False)
              (wi_1): Linear(in_features=1024, out_features=2816, bias=False)
       

In [22]:
train_data = [
    {
        "input_text": f"Generate a cover letter for a {entry['Job Title']} position at {entry['Hiring Company']} based on my {entry['Skillsets']} with {entry['Past Working Experience']} and {entry['Current Working Experience']}",
        "target_text": entry["Cover Letter"]
    }
    for entry in dataset
]

In [23]:
dataset = Dataset.from_dict({
    "input_text" : [entry["input_text"] for entry in train_data],
    "target_text" : [entry["target_text"] for entry in train_data]
})

In [24]:
train_dataset, eval_dataset = train_test_split(dataset, train_size=0.9, random_state=42)

TypeError: '<' not supported between instances of 'int' and 'ellipsis'

In [None]:
train_dataset = Dataset.from_list(train_dataset)
eval_dataset = Dataset.from_list(eval_dataset)

In [None]:
train_dataset = train_dataset.map(tokenize_data, batched=True)

In [None]:
train_dataset

In [None]:
def tokenize_data(example):
    return tokenizer(example["input_text"], padding="max_length", truncation=True, max_length=512)

# Fine Tuning T5 with LoRA and FP16

In [None]:
from transformers import AutoModelForSeq2SeqLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model

In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_google, torch_dtype=torch.float16, device_map="auto")

In [None]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05
)

In [None]:
model = get_peft_model(model, lora_config)

In [None]:
training_args = TrainingArguments(
    output_dir="./flan-t5-tuned",
    eval_strategy='epoch',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    save_total_limit=2,
    num_train_epochs=3,
    logging_dir="./logs"
)

In [None]:
trainer = Trainer(
    model = model,
    train_dataset=train_dataset,
    args=training_args
)

pipeline = pipeline('text2text-generation', model=model_google)

prompt = '''
Write a professional cover letter for a Data Scientist position at Microsoft.
The applicant has 5 years of experience in Python, AI, and Data Science.
Make it formal, well-structured, and persuasive.
'''

inputs = tokenizer(prompt, return_tensors="pt").to(device)
outputs = model.generate(**inputs, min_length = 400, max_length=1024, temperature=0.7, top_p = 0.95, do_sample = True)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)