In [1]:
import numpy as np
import pandas as pd
import tiktoken
import openai
import PyPDF2

import nltk

In [2]:
import os

In [3]:
# Runtime configuration comes from the environment so no secret is stored in
# the notebook. OPENAI_API_KEY is None when the variable is unset.
# NOTE(review): neither constant is read by any later cell visible in this
# notebook — confirm whether they are still needed.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-3.5-turbo")


In [4]:
# Fetch the tokenizer data used by nltk.sent_tokenize in the cells below.
# NOTE(review): "punkt_tab" is requested in addition to "punkt", presumably
# for newer NLTK releases — confirm against the installed version.
nltk.download("punkt")
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [5]:
def trim(text, n_start, n_end):
    """Join a slice of the sentences found in ``text`` into one string.

    ``text`` is split with ``nltk.sent_tokenize``; the sentences at positions
    ``n_start`` (inclusive) up to ``n_end`` (exclusive) are joined with a
    single space. Normal slice rules apply: an ``n_end`` beyond the last
    sentence stops at the end, and an empty slice yields ``""``.
    """
    selected = nltk.sent_tokenize(text)[n_start:n_end]
    return " ".join(selected)

In [6]:
# Directory holding the source PDFs. The default is the original author's
# location; set PETRORAG_DATA_DIR to run the notebook on another machine.
DATA_DIR = os.getenv(
    "PETRORAG_DATA_DIR",
    r"C:\Users\acer\OneDrive - Dicelytics Pvt. Ltd\Desktop\EDV\genai\petrorag\data",
)

# Reports to index: [label used as the Article_ID prefix, path to the PDF].
pdf_files = [
    ["Completion Report", os.path.join(DATA_DIR, "COMPLETION_REPORT.PDF")],
    ["Drilling Report", os.path.join(DATA_DIR, "DRILLING_REPORT.PDF")],
]

# Number of sentences per chunk; each chunk becomes one row of `df`.
interval = 10

records = []

for report_name, pdf_path in pdf_files:
    # The context manager releases the file handle even if parsing fails.
    with open(pdf_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Guard against pages that yield no text so the join cannot fail.
        text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

    training_data = text.replace("\n", " ")

    # Tokenize once per document and slice the result; calling trim() per
    # chunk would re-tokenize the whole document on every iteration.
    sentences = nltk.sent_tokenize(training_data)

    for start in range(0, len(sentences), interval):
        chunk_text = " ".join(sentences[start:start + interval])
        # Article_ID is "<label>_<index of the chunk's first sentence>".
        records.append({"Article_ID": f"{report_name}_{start}", "Text": chunk_text})

# Build the frame once from plain records instead of concatenating one
# single-row DataFrame per chunk.
df = pd.DataFrame(records, columns=["Article_ID", "Text"])


In [7]:
# Display the chunked corpus (columns: Article_ID, Text).
df

Unnamed: 0,Article_ID,Text
0,Completion Report_0,FINAL WELL REPORT Drilling Licence no...
1,Completion Report_10,An investigation was initiated by Mærsk. To b...
2,Completion Report_20,The area port of the gangway where the beam la...
3,Completion Report_30,Mærsk synergi # 333477 1059208 4 Synergi 105...
4,Completion Report_40,"When running in hole with 9 5/8"" casing it wa..."
...,...,...
156,Drilling Report_650,no Volve F-15 & F-15A Page 52 of 66 B.1.2 ...
157,Drilling Report_660,No. 0 Well: 15/9-F-15 & F-15A Date 16.03.2...
158,Drilling Report_670,0 Well: 15/9-F-15 & F-15A Date 16.03.2009 ...
159,Drilling Report_680,no Volve F-15 & F-15A Page 62 of 66 C.1.3 ...


In [8]:
# Default model name passed to the OpenAI embeddings endpoint by get_embedding.
# Query and document embeddings must come from the same model to be comparable.
EMBEDDING_MODEL = "text-embedding-ada-002"

In [9]:
# Shared client, created on first use so that embedding many texts does not
# construct a new OpenAI client for every single request.
_OPENAI_EMBEDDING_CLIENT = None


def _default_embedding_client():
    """Return the shared ``openai.OpenAI`` client, creating it on first use."""
    global _OPENAI_EMBEDDING_CLIENT
    if _OPENAI_EMBEDDING_CLIENT is None:
        _OPENAI_EMBEDDING_CLIENT = openai.OpenAI()
    return _OPENAI_EMBEDDING_CLIENT


def get_embedding(text: str, model: str = EMBEDDING_MODEL, client=None) -> list[float]:
    """Return the embedding vector for ``text``.

    Parameters
    ----------
    text:
        The string to embed; sent as the ``input`` of the embeddings request.
    model:
        Embedding model name; defaults to ``EMBEDDING_MODEL``.
    client:
        Optional object exposing ``embeddings.create(model=..., input=...)``.
        When omitted, one shared ``openai.OpenAI()`` client is reused across
        calls.

    Returns
    -------
    The ``embedding`` field of the first item in the response's ``data``.
    """
    if client is None:
        client = _default_embedding_client()
    response = client.embeddings.create(
        model=model,
        input=text
    )

    return response.data[0].embedding

In [10]:
def compute_doc_embeddings(df: pd.DataFrame) -> dict[int, list[float]]:
    """Embed the ``Text`` column of ``df``, one ``get_embedding`` call per row.

    Parameters
    ----------
    df:
        Frame with a ``Text`` column holding the strings to embed.

    Returns
    -------
    A dict mapping each row's index label to its embedding. The keys are
    whatever ``df.index`` contains (integers for a default RangeIndex); if
    labels repeat, the last row with a given label wins.
    """
    # Nothing to embed; also covers a completely empty frame with no columns.
    if len(df) == 0:
        return {}

    # Only the Text column is needed, so iterate it directly rather than
    # materialising a full row Series per record with iterrows().
    return {idx: get_embedding(text) for idx, text in df["Text"].items()}

In [11]:
def load_embeddings(df):
    """Rebuild an embeddings dict from a wide, one-column-per-dimension frame.

    Expects ``df`` to have ``title`` and ``heading`` columns, and every other
    column named with the string form of an integer dimension ("0", "1", ...).
    Returns ``{(title, heading): [value of column "0", "1", ..., max]}``.

    NOTE(review): this schema differs from the ``Article_ID``/``Text`` frame
    built earlier, and no visible cell calls this function — confirm it is
    still needed.
    """
    meta_columns = ("title", "heading")
    dimensions = [int(col) for col in df.columns if col not in meta_columns]
    vector_columns = [str(i) for i in range(max(dimensions) + 1)]

    embeddings = {}
    for _, row in df.iterrows():
        embeddings[(row.title, row.heading)] = [row[col] for col in vector_columns]
    return embeddings

In [12]:
# Show the corpus again right before it is embedded.
df

Unnamed: 0,Article_ID,Text
0,Completion Report_0,FINAL WELL REPORT Drilling Licence no...
1,Completion Report_10,An investigation was initiated by Mærsk. To b...
2,Completion Report_20,The area port of the gangway where the beam la...
3,Completion Report_30,Mærsk synergi # 333477 1059208 4 Synergi 105...
4,Completion Report_40,"When running in hole with 9 5/8"" casing it wa..."
...,...,...
156,Drilling Report_650,no Volve F-15 & F-15A Page 52 of 66 B.1.2 ...
157,Drilling Report_660,No. 0 Well: 15/9-F-15 & F-15A Date 16.03.2...
158,Drilling Report_670,0 Well: 15/9-F-15 & F-15A Date 16.03.2009 ...
159,Drilling Report_680,no Volve F-15 & F-15A Page 62 of 66 C.1.3 ...


In [13]:
# Embed every chunk. Despite the name, `embed_df` is a dict keyed by df's
# index labels, not a DataFrame. This issues one embeddings request per row.
embed_df = compute_doc_embeddings(df)

In [18]:
# Number of embedded chunks (one dict entry per distinct index label of df).
len(embed_df)

161

In [15]:
# Inspect the embedding stored under index label 0 (prints the full vector).
embed_df[0]

[-0.028663260862231255,
 -0.007484768517315388,
 -0.008087235502898693,
 -0.01817324385046959,
 -0.009951340034604073,
 0.024523956701159477,
 -0.02622503973543644,
 -0.004458256997168064,
 -0.007938390597701073,
 -0.024424726143479347,
 -0.00983793381601572,
 0.018853677436709404,
 -0.022624412551522255,
 0.02652272954583168,
 -0.008831460028886795,
 -0.008788932114839554,
 0.009235466830432415,
 -0.009277993813157082,
 0.021731344982981682,
 -0.009015743620693684,
 -0.0032267430797219276,
 0.009150412864983082,
 -0.008796020410954952,
 -0.005911265965551138,
 -0.002925509586930275,
 0.0027217338792979717,
 -0.004376746714115143,
 -0.019250597804784775,
 -0.025955703109502792,
 0.003109793644398451,
 -0.003193075768649578,
 -0.010433313436806202,
 -0.018598515540361404,
 0.012736864387989044,
 -0.020469706505537033,
 -0.016982486471533775,
 -0.0015699585201218724,
 -0.0161035917699337,
 0.010404962114989758,
 0.005018197000026703,
 0.008158113807439804,
 -0.002900702180340886,
 -0.006

In [16]:
def vector_similarity(x:list[float],y:list[float]) -> float:
    """Return the dot product of ``x`` and ``y``.

    NOTE(review): this equals cosine similarity only when both vectors have
    unit length — confirm that holds for the embeddings in use.
    """
    left = np.asarray(x)
    right = np.asarray(y)
    return np.dot(left, right)

def order_document_sections_by_query_similarity(query: str, contexts: dict[int, list[float]]) -> list[tuple[float, int]]:
    """Rank every stored document embedding by similarity to ``query``.

    Parameters
    ----------
    query:
        Question text; embedded with ``get_embedding``.
    contexts:
        Mapping of document key (the DataFrame index label) to its embedding.

    Returns
    -------
    A list of ``(similarity, document key)`` tuples sorted in descending
    order, so the best match comes first. Ties on similarity fall back to
    comparing the keys, also in descending order.
    """
    # Skip the embedding request entirely when there is nothing to rank.
    if not contexts:
        return []

    query_embedding = get_embedding(query)

    document_similarities = [
        (vector_similarity(query_embedding, doc_embedding), doc_index)
        for doc_index, doc_embedding in contexts.items()
    ]
    document_similarities.sort(reverse=True)

    return document_similarities


In [17]:
# Rank all chunks against a sample question; the result is a list of
# (similarity, df index label) pairs, best match first.
order_document_sections_by_query_similarity("What is setting depth for 13 3/8 casing shoe",embed_df)

[(0.8291483437069274, 142),
 (0.8197869653998489, 22),
 (0.8185175656523691, 120),
 (0.8175166395654534, 35),
 (0.8121866910667719, 135),
 (0.8116204907729239, 124),
 (0.8114649901965629, 143),
 (0.8085545101528949, 24),
 (0.8030519123499748, 49),
 (0.8004644855632721, 50),
 (0.7998647190738831, 65),
 (0.7990892116116526, 119),
 (0.7981845055687543, 109),
 (0.7979468234321324, 60),
 (0.7977435259221749, 150),
 (0.7976949598605586, 123),
 (0.7950865389047075, 133),
 (0.7911586728050959, 18),
 (0.7903146305386712, 125),
 (0.7899388450593239, 15),
 (0.7894684475405414, 58),
 (0.7877345398796527, 46),
 (0.7876205956365288, 146),
 (0.7872709913469601, 20),
 (0.7868188573735575, 34),
 (0.786704807816245, 108),
 (0.7862093792832024, 71),
 (0.7854925037304006, 17),
 (0.785138683403251, 66),
 (0.7832867893542754, 39),
 (0.7828784051205167, 134),
 (0.7828294993463026, 33),
 (0.7827674900065159, 145),
 (0.7822303673373114, 26),
 (0.7812733902738911, 31),
 (0.7812663662612278, 14),
 (0.78033536487

In [19]:
# Read the text of the chunk at index label 22 with one .loc[row, column] lookup.
df.loc[22, "Text"]

'13 3/8" casing was run, set and cemented at 2562 m MD. The casing was pressure tested on bump to 345 bar with 1.43 sg  Enviromul OBM. BIT RUNS   17 ½" Reed-Hycalog, Bit Type: RSR616M. 8 X 15 TFA: 1,381  Bit Grading:    3 2 BT A X IN LT TD    Drilled cement for ~7 hrs from 1048 m to 1083 m and additional 17 hrs circulating while repairing PRS during trip in hole. DRILLING FLUID   The section was drilled with 1.43 sg Enviromul OBM. Baracarb and Steelseal were added to prevent losses in the Grid Fm. An LCM pill was spotted from TD to abov e Grid formation before POOH. The reci py was: Approx 40 kg/ m3 Baracarb 50, 60  kg/m3 Baracarb 150, 50 kg/m3 Steelseal 400, 13 kg/m3 Steelseal 1000 and 5 kg/m3 Baro fibre C. The pill had minor effect as  this seepage loss has been observed on all the oil producers in the 17 ½” sections. CASING   The 13 3/8" casing was run down to the 20" csg shoe with BX elevator and bails and from then on using the Tesco tool. Several tight spots were encountered on t

In [20]:
def contruct_prompt(question:str, context_embeddings:dict, df: pd.DataFrame, max_sections: int = 5) -> str:
    """Build the question-answering prompt from the most relevant chunks.

    The function name keeps its original spelling so existing callers work.

    Parameters
    ----------
    question:
        The user's question; used for ranking and appended after ``Q:``.
    context_embeddings:
        Mapping of ``df`` index label to embedding, as consumed by
        ``order_document_sections_by_query_similarity``.
    df:
        Frame whose ``Text`` column holds the chunk text for each label.
    max_sections:
        How many of the top-ranked chunks to include (default 5, the value
        that was previously hard-coded).

    Returns
    -------
    The instruction header, one ``"\\n* <text>"`` bullet per chosen chunk
    (newlines inside a chunk replaced by spaces), then the question and an
    open ``A:`` for the model to complete.
    """
    ranked_sections = order_document_sections_by_query_similarity(question,context_embeddings)

    # Only the index label from each (similarity, label) pair is needed here.
    chosen_sections = [
        "\n* " + df.loc[section_index, "Text"].replace("\n", " ")
        for _, section_index in ranked_sections[:max_sections]
    ]

    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't Know."\n\nContext:\n"""

    return header +"".join(chosen_sections) + "\n\n Q: " + question + "\n A:"



In [73]:
# Build the prompt for the casing-shoe question so it can be inspected below.
s = contruct_prompt("What is setting depth for 13 3/8 casing shoe",embed_df,df)

In [74]:
# print() is used so the prompt's embedded newlines are rendered.
print(s)

Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't Know."

Context:

* • Circulate bottoms up at the 13 3/8” casing shoe. :   Design Criteria   Block Weight (T)  69 Tonnes  10 ¾"  1654m 60.7# 13Cr80  9 ⅝" 1166m 53.5# 13  Cr80  Optimum/Maximum MU torque 10 ¾”  23 150 ftLbs  Optimum/Maximum MU torque 9 5/8” VT HC NA  23 150 ftLbs  Optimum/Maximum MU torque 9 5/8” VT  23 150 ftLbs  Running string  5 ½"  57.4# HWDP  Mud  1.30sg OBM  Friction Factor in Casing  0,15  Friction Factor in Open Hole  0,20    Calculated P/U & S/O (Wellplan)  Shoe depth  500 1000 1500 2000 2500 2966  PU weight (T)  103 136 172 210 246 280  SO Weight (T)  102 134 168 203 235 262  5.12.12 Centralisers  Ref. Well schematic. Appendix C.1.3     Centralisers will be fitted onshore. 5.12.13 Pip Tags & Pup joints  None  5.12.14 Casing Cementing  In order to ensure a good quality cement job, it is  recommended to displace base  oi

In [21]:
# Build the prompt for a second question; this overwrites the previous `s`.
s = contruct_prompt("what is the maximum depth reached",embed_df,df)

In [22]:
# Show which chunks were selected as context for the max-depth question.
print(s)

Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't Know."

Context:

* Worked string past obstructions with flow of 3000 lpm and rotation of 15-50 rpm. Worked area from 1385 m  MD to 1357 m MD. Observed peaks of 20 kNm at 1370 m MD and 1381 m MD. Stopped rotation and circulation  and pulled from 1385 m MD to 1357 m MD with no overpull. Racked back BHA in derrick due to high winds. Jetted BOP and wellhead. Retrieved nom inal seat protector. Rigged up casing running equipment. Ran 13 3/8"  casing to 40 m MD. Checked float.
* The well was topped up with .140 sg  mud prior to pull out of hole. CASING   Ran 20" casing to setting depth of 1357 m MD fil ling it with seawater as it was run in hole. During running of casing several thight spots were encountered . At 1052 m MD 5 mT weight wa s taken and the FAC tool was  engaged. Washed down to 1055 m MD and con tinued running in hole to 1218 m MD. Whil

In [23]:
from openai import OpenAI


In [24]:
def answer_query_with_context(
    query: str,
    df: pd.DataFrame,
    document_embeddings: dict[int, list[float]],
    model: str = "gpt-4o",
) -> str:
    """Answer ``query`` using only context retrieved from ``df``.

    Parameters
    ----------
    query:
        The user's question.
    df:
        Frame holding the chunk text; passed through to ``contruct_prompt``.
    document_embeddings:
        Mapping of ``df`` index label to embedding vector.
    model:
        Chat model name sent to the API. Defaults to ``"gpt-4o"``, the value
        that was previously hard-coded; pass another name to override it.

    Returns
    -------
    The text content of the first choice in the chat completion.
    """
    prompt = contruct_prompt(query, document_embeddings, df)

    # The key is read from the environment at call time, never hard-coded.
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )

    return completion.choices[0].message.content

In [25]:
# End-to-end check: retrieve context for the question and ask the chat model.
answer_query_with_context("What is setting depth for 13 3/8 casing shoe",df,embed_df)

'The setting depth for the 13 3/8" casing shoe is 2562 meters MD.'

In [28]:
# Second end-to-end query; the prompt tells the model to reply "I don't Know."
# when the retrieved context does not contain the answer.
answer_query_with_context("what is the final maximum depth of the well",df,embed_df)

"I don't know."