In [2]:
import os
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from pinecone import Pinecone, ServerlessSpec
import pdfplumber

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Gather the lecture-notes PDF line by line; each extracted line becomes a
# single-item record so the list can be handed straight to pandas.
data = []

with pdfplumber.open("271_AI Lect Notes.pdf") as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        if not text:
            # Nothing usable was extracted from this page (None or empty) - skip it.
            continue
        data.extend([line] for line in text.split('\n'))

# One row per extracted line, in page order.
df = pd.DataFrame(data, columns=["text"])

In [6]:
# Load the tokenizer and the encoder from the same pretrained checkpoint.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)



In [7]:
# Tokenize every extracted line separately. The tokenizer is called directly
# because `encode_plus` is deprecated in transformers; for a single string the
# direct call accepts the same arguments.
tokenized_data = []
for text in df["text"]:
    inputs = tokenizer(
        text,
        add_special_tokens=True,
        max_length=512,  # together with truncation=True, caps each line at 512 tokens
        return_attention_mask=True,
        return_tensors="pt",  # return PyTorch tensors
        truncation=True,
    )
    tokenized_data.append(inputs)


In [8]:
# Embed each tokenized line with the encoder and keep the hidden state at
# sequence position 0 as that line's vector (stored as a NumPy array).
embeddings = []
# Inference only: with autograd disabled no graph is built for each forward
# pass, so the outputs can be converted to NumPy without detaching.
with torch.no_grad():
    for inputs in tokenized_data:
        outputs = model(**inputs)  # unpack the tokenizer output as keyword arguments
        embeddings.append(outputs.last_hidden_state[:, 0, :].numpy())


In [9]:
# Read the Pinecone API key from the environment instead of committing it to
# the notebook. NOTE(review): a key that has ever been hardcoded in a shared
# notebook should be treated as exposed and rotated.
api_key = os.environ.get("PINECONE_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the PINECONE_API_KEY environment variable before running this cell."
    )
pc = Pinecone(api_key=api_key)

In [13]:
# Make sure the target index exists, then open a handle to it.
index_name = "genai"
existing_names = pc.list_indexes().names()
if index_name not in existing_names:
    # Only reached when the index is missing: create it as a serverless index.
    serverless_spec = ServerlessSpec(cloud='aws', region='us-west-2')
    pc.create_index(
        name=index_name,
        dimension=768,  # vector length the index is created with
        metric='euclidean',
        spec=serverless_spec,
    )
index = pc.Index(index_name)


In [16]:
# Upload settings
batch_size = 100  # number of vectors sent per upsert request
timeout = 60  # forwarded to upsert as _request_timeout

# Push the embeddings to the Pinecone index one batch at a time. Each vector's
# id is the string form of its position in `embeddings`.
for start in range(0, len(embeddings), batch_size):
    chunk = embeddings[start:start + batch_size]
    batch_no = start // batch_size + 1
    try:
        vectors = [
            {"id": str(start + offset), "values": emb.flatten().tolist()}
            for offset, emb in enumerate(chunk)
        ]
        index.upsert(vectors, _request_timeout=timeout)
        print(f"Successfully uploaded batch {batch_no}")
    except Exception as e:
        # A failed batch is reported and the loop carries on with the next one.
        print(f"Error uploading batch {batch_no}: {e}")


Successfully uploaded batch 1
Successfully uploaded batch 2
Successfully uploaded batch 3
Successfully uploaded batch 4
Successfully uploaded batch 5
Successfully uploaded batch 6
Successfully uploaded batch 7
Successfully uploaded batch 8
Successfully uploaded batch 9
Successfully uploaded batch 10
Successfully uploaded batch 11
Successfully uploaded batch 12
Successfully uploaded batch 13
Successfully uploaded batch 14
Successfully uploaded batch 15
Successfully uploaded batch 16
Successfully uploaded batch 17
Successfully uploaded batch 18
Successfully uploaded batch 19
Successfully uploaded batch 20
Successfully uploaded batch 21
Successfully uploaded batch 22
Successfully uploaded batch 23
Successfully uploaded batch 24
Successfully uploaded batch 25
Successfully uploaded batch 26
Successfully uploaded batch 27
Successfully uploaded batch 28
Successfully uploaded batch 29
Successfully uploaded batch 30
Successfully uploaded batch 31
Successfully uploaded batch 32
Successfully uplo

In [15]:
# Save the encoder weights and the tokenizer files. The weights file defaults
# to a path relative to the working directory (like the tokenizer directory)
# rather than a machine-specific absolute path; set the MODEL_SAVE_PATH
# environment variable to write it somewhere else.
model_save_path = os.environ.get("MODEL_SAVE_PATH", "model.pth")
save_dir = os.path.dirname(model_save_path)
if save_dir:
    # Create the parent directory up front so the save cannot fail on a missing folder.
    os.makedirs(save_dir, exist_ok=True)
torch.save(model.state_dict(), model_save_path)
tokenizer.save_pretrained("tokenizer")
print("Model and tokenizer saved successfully.")

Model and tokenizer saved successfully.
