In [59]:
import os
import PyPDF2
import re
import random
import openai
import pandas as pd
from transformers import GPT2TokenizerFast
#from typing import Set
#import numpy as np
#from nltk.tokenize import sent_tokenize
from matplotlib import pyplot as plt

# CSV inputs and generated datasets are stored under ./final_data
# (created on first run if it does not exist yet).
root_dir = os.getcwd()
data_dir = os.path.join(root_dir, 'final_data')
os.makedirs(data_dir, exist_ok=True)

# GPT-2 tokenizer used by count_tokens() to measure text length in tokens.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
def count_tokens(content):
    """Return how many tokens the module-level GPT-2 tokenizer produces for `content`."""
    token_ids = tokenizer.encode(content)
    return len(token_ids)

def read_pdf(filename):
    """Extract the manual text from a PDF and split it into token-bounded chunks.

    The page layout is read from ``manual_pages.csv`` inside ``data_dir``.
    The code reads the heading from the second column, the page index from
    the third column, and also accesses the named columns ``title`` and
    ``pages`` (presumably the third column is ``pages`` -- confirm against
    the CSV). Each row marks where a heading starts; the number of pages of a
    heading is the difference to the next row's page index. Rows whose
    heading starts with ``"END"`` are skipped, and the last row only serves
    as an end marker.

    Page indices are used directly as 0-based indices into the PDF's pages.

    Every heading is cut into chunks of whole pages so that a chunk stays at
    or below 3500 GPT-2 tokens; a single page that is larger than the limit
    becomes a chunk on its own.

    Parameters
    ----------
    filename : str
        Path of the PDF file to read.

    Returns
    -------
    pandas.DataFrame
        One row per chunk with the columns ``title``, ``heading``,
        ``content`` and ``tokens``.

    Side effects
    ------------
    Shows a histogram of the token counts with matplotlib.
    """

    # Define titles and headings
    titles = ['GENERAL', 'OPERATION', 'CYCLE MACHINING TYPES', 'MULTI-PATH LATHE FUNCTIONS (FOR Series 16i/18i/21i ONLY)',
            'TOOL MANAGEMENT FUNCTION (FOR Series 16i/18i/21i ONLY)', 'EXAMPLE OF PROGRAMMING OPERATION', 'APPENDIX', 'SUPPLEMENTAL INSTRUCTIONS']
    headings = [
        ['OVERVIEW OF THIS MANUAL', 'READ AT FIRST', 'ALL-IN-ONE SCREEN', 'SYMBOLS USED', 'NOTES ON CREATING PROGRAMS',
        'MANUAL GUIDE i SIMULATOR FOR THE PERSONAL COMPUTER'], # Heading in 'GENERAL'
        ['OVERVIEW OF THE PROCEDURE', 'MACHINING PROGRAM FORMAT', 'EDITING MACHINING PROGRAMS', 'EDITING CYCLE MACHINING OPERATIONS',
        'DETAILED DESCRIPTIONS ABOUT ENTERING ARBITRARY FIGURES', 'OPERATIONS IN THE MEM MODE', 'OPERATIONS IN THE MDI MODE',
        'OPERATIONS IN THE MANUAL MODE (HANDLE AND JOG)', 'MACHINING SIMULATION AND DRAWING DURING MACHINING', 'SETTING DATA',
        'BACKGROUND EDITING', 'NC PROGRAM CONVERSION FUNCTION', 'TOOL DATA BASE FUNCTION', 'EDITING OF FREE FIGURE AND FIXED FORM FIGURE OF SUBPROGRAM FORM',
        'SHORTCUT KEY OPERATIONS', 'HELP SCREEN', 'MEMORY CARD INPUT/OUTPUT FUNCTION', 'HANDLING LARGE PROGRAMS', 'CALCULATOR FUNCTION',
        'AUTOMATIC SETTING OF INITIAL VALUE DATA', 'SUPPORT FOR FOLDER MANAGEMENT (FOR Series 30i ONLY)', 'SCREEN HARD COPY',
        'DISPLAYING MACHINING TIME (FOR Series 16i/18i/21i ONLY)', 'PROGRAM COORDINATE SYSTEM CHANING FUNCTION AND TOOL OFFSET MEMORY CHANGING FUNCTION',], # Heading in 'OPERATION'
        ['MILLING', 'TURNING', 'SLANT FACE MACHINING (COORDINATE CONVERSION)'],# Heading in 'CYCLE MACHINING TYPES '
        ['MULTI-PATH LATHE APPLICATION', 'SIMULTANEOUS ALL PATH DISPLAY / EDITING FUNCTION', 'PROCESS LIST EDITING FUNCTION'], # 'Heading in MULTI-PATH LATHE FUNCTIONS (FOR Series 16i/18i/21i ONLY)'
        ['ASSOCIATING TOOL NUMBERS WITH OFFSET NUMBERS', 'VIEWING AND SETTING TOOL OFFSET VALUES', 'VIEWING AND SETTING TOOL MANAGEMENT DATA',
        'VIEWING AND SETTING LIFE MANAGEMENT DATA', 'TOOL LIFE DATA LIST SCREEN', 'MODAL DISPLAY OF OFFSET TYPES',
        'DISPLAY TOOL MANAGEMENT DATA OF CNC STANDARD SCREEN', 'OTHERS'], # Heading in 'TOOL MANAGEMENT FUNCTION (FOR Series 16i/18i/21i ONLY)'
        ['EXPLANATORY NOTES', 'LATHE', 'MACHINING CENTER'], # Heading in 'EXAMPLE OF PROGRAMMING OPERATION'
        ['PARAMETERS', 'ALARMS', 'MANUAL GUIDE i SETUP METHOD'], # Heading in 'APPENDIX'
        ['OUTLINE', 'DISPLAY ATTRIBUTE OF BINARY FILE (ONLY FS30i)', 'ALTER THE CURSOR POSITION AFTER THE COPY',
        'MAXIMUM CHARACTERS IN A BLOCK', 'SPECIFY SIDE FINISH FEEDRATE OF FREE FIGURE AT MILLING', 'SPECIFY SEMI FINISH FEED RATE OF FREE FIGURE FOR TURNING (ZX PLANE)',
        'SPECIFY A GRIDING AMOUNT OF FREE FIGURE FOR TURNING (ZX PLANE)', 'COMBINING TOOL DATABASE AND TOOL MANAGEMENT FUNCTION',
        'SIMULTANEOUS DELETE OF MULTIPLE PROGRAMS', 'INVALIDATE OF SHORTCUT FUNCTION', 'SHORTCUT FUNCTION TO TOOL DATABASE SCREEN',
        'DISPLAY TOOL ICON FUNCTION', 'MACHINING TIME DISPLAY FUNCTION FOR Series 30i', 'IMPROVEMENT OF INPUT ITEM FOR MILLING',
        'IMPROVEMENT OF DISPLAYING POSITION OF INPUT ITEMS [SIDE FINISH AMOUNT] AND [BOTTOM FINISH AMOUNT]', 'IMPROVEMENT OF THE INPUT ITEM FOR DRILLING',
        'IMPROVEMENT OF INPUT ITEM FOR TAPPING', 'POLYGON FIXED FIGURE', 'REUSE OF BLANK FORM DATA AT MACHINING FIGURE ENTRY',
        'CHANGING INPUT SCREEN OF THICKNESS ITEM', 'TURNING GROOIVNG BY VERSATILE TOOL', 'LINEAR GROOVE FOR XY PLANE',
        'FINE BORING FOR TURNING', 'MILLING MACHINING OF WORKPIECE ROTATING ROUND Y-AXIS', 'COUNTER TAPPING CYCLE',
        'A WARNING MESSAGE WHEN A RESIDUAL CUTTING PART REMAINS', 'SIMULTANEOUS FACING AT TURNING CYCLE', 'TOOL MANAGEMENT FUNCTION FOR Series 30i',
        'PARAMETERS'] # Heading in 'SUPPLEMENTAL INSTRUCTIONS'
        ]

    # Upper bound (in GPT-2 tokens) for the content of one chunk.
    max_tokens = 3500

    # Build the title/heading removal pattern once instead of once per page.
    # It matches a bare title/heading as well as one prefixed by "<digits> "
    # or "<digits>.".
    headings_1d = [heading for group in headings for heading in group]
    names = '|'.join(re.escape(name) for name in titles + headings_1d)
    heading_pattern = re.compile(r'\b(?:{0})\b|\d+\s(?:{0})|\d+\.(?:{0})'.format(names))

    # read one pdf page
    def read_pdf_page(reader, page_num):
        """Return the cleaned text of page `page_num` of the open `reader`."""
        text = reader.pages[page_num].extract_text()

        text = text.replace("\n", " ")
        text = text.replace("B-63874EN/05", "") # Remove 'B-63874EN/05'

        # TABLE OF CONTENTS
        text = re.sub(r"\.{2,}\s*\d*", " ", text) # Remove continuous '.', and page numbers
        text = re.sub(r"c-\d+", " ", text) # Remove page numbers ('c-number')

        # GENERAL
        text = re.sub(r"\-\s\d+\s\-", "", text) # Remove page numbers ('- number -')
        text = heading_pattern.sub(" ", text) # Remove titles and headings
        text = re.sub(r"’", "\'", text) # Replace ’ to '

        text = re.sub(r"\s+", " ", text) # Remove continuous spaces
        text = re.sub(r"^\s+", "", text) # Remove space (start)
        text = re.sub(r"\s+$", "", text) # Remove space (last)

        text = re.sub(r'•\s+\w+', r'\n\g<0>', text) # Add '\n' before bullets (•)
        text = re.sub(r'-\s+\w+', r'\n\g<0>', text) # Add '\n' before bullets (-)

        return text

    page_filename = 'manual_pages.csv'
    page_filepath = os.path.join(data_dir, page_filename)
    df = pd.read_csv(page_filepath)

    records = []

    # Open and parse the PDF a single time; pages are pulled from this reader
    # while the file is still open.
    with open(filename, 'rb') as file:
        reader = PyPDF2.PdfReader(file)

        # Walk the CSV rows directly. Keying page counts by heading name would
        # merge headings that occur twice (e.g. 'PARAMETERS') and shift every
        # following title/start page by one row.
        for row_index in range(len(df) - 1):
            heading = df.iloc[row_index, 1]
            if heading[:3] == "END":
                continue

            page_count = df.iloc[row_index + 1, 2] - df.iloc[row_index, 2]
            title = df.loc[row_index, 'title']
            start_page_number = int(df.loc[row_index, 'pages'])

            count = 0  # pages of this heading that are already consumed
            while count < page_count:
                chapter_text = ""
                while count_tokens(chapter_text) < max_tokens:
                    text = read_pdf_page(reader, start_page_number + count)
                    last_chapter_text = chapter_text
                    chapter_text += (" " + text)

                    if count_tokens(chapter_text) > max_tokens:
                        if last_chapter_text == "":
                            # A single page exceeds the limit: keep it as its
                            # own chunk so the loop still advances.
                            count += 1
                        else:
                            # Drop the page that overflowed; it starts the
                            # next chunk.
                            chapter_text = last_chapter_text
                        break

                    count += 1
                    if count >= page_count:
                        break

                chapter_text = re.sub(r"^\s+", "", chapter_text) # Remove space (start)
                records.append({'title': title, 'heading': heading, 'content': chapter_text})

    pdf = pd.DataFrame(records, columns=['title', 'heading', 'content'])
    pdf['tokens'] = pdf.content.apply(count_tokens)

    pdf[['tokens']].hist()
    plt.xlabel('Number of tokens')
    plt.ylabel('Number of data')
    plt.title('Distribution of number of tokens')
    plt.show()

    return pdf


def get_questions(context):
    """Ask gpt-3.5-turbo to write questions about a piece of the manual.

    The prompt ends with "1." so the reply continues a numbered list.

    Parameters
    ----------
    context : str
        Manual excerpt that is inserted into the prompt.

    Returns
    -------
    str
        The content of the first returned choice, or "" if the request (or
        reading the response) raised; the exception is printed in that case.
    """
    prompt = (
        "You are going to make some questions that may arise using a machine equipped with FANUC MANUAL GUIDE i. "
        "A part of the manual book will be provided. "
        "Please write questions based on the context below.\n\n"
        f"Context: {context}\n\nQuestions:\n1."
    )
    system_message = {"role": "system", "content": "You are a helpful assistant."}
    user_message = {"role": "user", "content": prompt}

    try:
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[system_message, user_message],
        )
        return completion['choices'][0]['message']['content']
    except Exception as error:
        print(error)
        return ""

    
def get_answers(row):
    """Ask gpt-3.5-turbo to answer previously generated questions.

    The prompt ends with "1." so the reply continues a numbered list.

    Parameters
    ----------
    row : object
        Record exposing the attributes ``context`` (manual excerpt) and
        ``questions`` (numbered question list); both are inserted into the
        prompt.

    Returns
    -------
    str
        The content of the first returned choice, or "" if the request (or
        reading the response) raised; the exception is printed in that case.
    """
    prompt = (
        "Here are some questions that may arise using a machine equipped with FANUC MANUAL GUIDE i. "
        "A part of the manual book will be provided. "
        "Please answer to the questions based on the context below.\n\n"
        f"Context: {row.context}\n\nQuestions:\n{row.questions}\n\nAnswers:\n1."
    )
    system_message = {"role": "system", "content": "You are a helpful assistant."}
    user_message = {"role": "user", "content": prompt}

    try:
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[system_message, user_message],
        )
        return completion['choices'][0]['message']['content']
    except Exception as error:
        print(error)
        return ""

def get_warning_num():
    """Return the list of warning numbers used by the operation datasets.

    The first half consists of 3003-3016, 3025, 3030-3066, 3070-3072,
    3075-3095 and 3098 (77 numbers); the second half repeats the same
    numbers shifted by +500, in the same order.
    """
    base_numbers = [
        *range(3003, 3017),
        3025,
        *range(3030, 3067),
        *range(3070, 3073),
        *range(3075, 3096),
        3098,
    ]
    shifted_numbers = [number + 500 for number in base_numbers]
    return base_numbers + shifted_numbers

def get_cause_action_df(pdf_filepath, start_page, end_page, warning_num):
    """Parse the "Cause"/"Action" entries of the alarm pages into a DataFrame.

    Every page from `start_page` to `end_page` (both inclusive, used as
    0-based indices into the PDF's pages) is cleaned and split at the words
    "Cause" and "Action". The text following an "Action" marker is paired
    with the most recent "Cause" text; that cause is kept across page
    boundaries.

    Parameters
    ----------
    pdf_filepath : str
        Path of the PDF file.
    start_page, end_page : int
        First and last page index to parse.
    warning_num : list
        Alarm numbers. The first half is stored in column ``alarm_1`` and the
        second half in ``alarm_2``, row by row, so it must contain exactly two
        numbers per parsed cause/action pair.

    Returns
    -------
    pandas.DataFrame
        Columns ``cause``, ``action``, ``alarm_1`` and ``alarm_2``.

    Raises
    ------
    ValueError
        If an "Action" is found before any "Cause", or if `warning_num` does
        not hold two numbers for every parsed pair.
    """
    rows = []
    cause = None  # most recent "Cause" text; deliberately survives page breaks

    with open(pdf_filepath, 'rb') as file:
        reader = PyPDF2.PdfReader(file)

        for page_num in range(start_page, end_page + 1):
            text = reader.pages[page_num].extract_text()

            text = text.replace("\n", " ")
            text = text.replace("B-63874EN/05", "") # Remove 'B-63874EN/05'

            text = re.sub(r"\.{2,}\s*\d*", " ", text) # Remove continuous '.', and page numbers
            text = re.sub(r"c-\d+", " ", text) # Remove page numbers ('c-number')
            text = re.sub(r"\-\s\d+\s\-", "", text) # Remove page numbers ('- number -')
            # Remove standalone four-digit numbers (presumably the alarm
            # numbers printed next to each entry -- confirm against the PDF).
            text = re.sub(r"\b\d{4}\b", "", text)

            text = re.sub(r"\s+", " ", text) # Remove continuous spaces
            text = text.strip()

            # Put every "Cause"/"Action" keyword on its own line so that the
            # element following a keyword is the text that belongs to it.
            text = re.sub(r'\bCause\b', r'\nCause\n', text)
            text = re.sub(r'\bAction\b', r'\nAction\n', text)
            segments = text.split("\n")

            for position, segment in enumerate(segments):
                if segment == "Cause":
                    cause = segments[position + 1].strip()
                elif segment == "Action":
                    if cause is None:
                        raise ValueError(
                            "Found an 'Action' entry before any 'Cause' entry "
                            "on page {}".format(page_num)
                        )
                    rows.append((cause, segments[position + 1].strip()))

    # Build the frame once instead of concatenating page by page.
    ca = pd.DataFrame(rows, columns=["cause", "action"])

    pair_count = len(ca)
    if len(warning_num) != 2 * pair_count:
        raise ValueError(
            "Expected {} warning numbers (two per cause/action pair) but got {}".format(
                2 * pair_count, len(warning_num)
            )
        )

    ca['alarm_1'] = warning_num[:pair_count]
    ca['alarm_2'] = warning_num[pair_count:]

    return ca

def get_semantic_qa(df):
    """Split numbered question/answer blocks into one row per question.

    Each row of `df` holds a newline-separated, numbered list in the columns
    ``questions`` and ``answers`` ("1. ...", "2. ..."). When both lists have a
    different number of lines, answer lines that do not start with the number
    expected at their position are treated as continuation lines and appended
    (with a newline) to the preceding answer. Questions and answers are then
    paired by position up to the shorter list, and the leading "N. " prefix is
    cut off (3 characters for the first nine entries, 4 afterwards).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the string columns ``questions`` and ``answers``.

    Returns
    -------
    pandas.DataFrame
        Columns ``question`` and ``answer`` with a fresh 0..n-1 index.
    """
    questions = []
    answers = []

    for i in range(len(df)):
        question_list = df['questions'].iloc[i].split('\n')
        answer_list = df['answers'].iloc[i].split('\n')

        if len(question_list) != len(answer_list):
            a_i = 0
            while a_i < len(answer_list):
                pattern = str(a_i + 1) + '. '
                # The first line has no predecessor to merge into, so it is
                # always kept (a_i - 1 would otherwise wrap to the last line).
                if a_i > 0 and not answer_list[a_i].startswith(pattern):
                    answer_list[a_i - 1] += '\n' + answer_list[a_i]
                    # Delete by position: list.remove() would delete the first
                    # line with equal text, which may be an earlier answer.
                    del answer_list[a_i]
                else:
                    a_i += 1

        qa_len = min(len(question_list), len(answer_list))

        for qa_index in range(qa_len):
            # "1. ".."9. " are three characters long, "10. " and up four.
            prefix_len = 3 if qa_index < 9 else 4
            questions.append(question_list[qa_index][prefix_len:])
            answers.append(answer_list[qa_index][prefix_len:])

    # Build the result once instead of concatenating one frame per input row.
    return pd.DataFrame({'question': questions, 'answer': answers}, columns=['question', 'answer'])

def get_cause_action_qa(ca):
    """Build question/answer pairs that quote the alarm text.

    For every row of `ca`, each of the four opening phrases is combined with
    each of the five closing phrases around the row's ``cause`` text; the
    answer is the row's ``action``. The result is also written to
    ``cause_action.csv`` in ``data_dir``.

    Parameters
    ----------
    ca : pandas.DataFrame
        Frame with the string columns ``cause`` and ``action``.

    Returns
    -------
    pandas.DataFrame
        Columns ``question`` and ``answer`` (20 rows per input row).
    """
    q_str_1 = ["I received an alarm says, \"", "I got an alarm says, \"",  "I received a warning says, \"", "I got a warning says, \""]
    q_str_2 = ["\" What should I do about this?","\" How can I turn it off?",  "\" How can I resolve this issue?", "\" What actions can I take to fix it?", "\" What should I check when this warning occurs?"]

    # Collect plain records and build the frame once; concatenating a one-row
    # frame per question is quadratic in the number of rows.
    records = [
        {'question': s1 + row['cause'] + s2, 'answer': row['action']}
        for _, row in ca.iterrows()
        for s1 in q_str_1
        for s2 in q_str_2
    ]
    ca_qa = pd.DataFrame(records, columns=["question", "answer"])

    filepath = os.path.join(data_dir, 'cause_action.csv')
    ca_qa.to_csv(filepath, index=False)
    return ca_qa


def get_question_cause_action_qa(ca, warning_num):
    """Build question/answer pairs that refer to an alarm by its number.

    Every question template contains the placeholder number ``3062``, which
    is replaced by each number of `warning_num`. The answer is the ``cause``
    and ``action`` of the first row of `ca` whose ``alarm_1`` or ``alarm_2``
    equals that number, joined by a space. Numbers without a matching row are
    skipped, so no question is paired with another alarm's answer.

    Parameters
    ----------
    ca : pandas.DataFrame
        Frame with the columns ``cause``, ``action``, ``alarm_1``, ``alarm_2``.
    warning_num : iterable
        Alarm numbers to generate questions for.

    Returns
    -------
    tuple
        ``(qca_qa, questions)``: the DataFrame with the columns ``question``
        and ``answer``, and the list of the 53 question templates (still
        containing the placeholder ``3062``).
    """
    questions = [
        "What should I do if warning number 3062 is raised?",
        "What action should I take when warning number 3062 occurs?",
        "How can I resolve warning number 3062?",
        "What steps should I follow when warning number 3062 is displayed?",
        "How can I fix warning number 3062?",
        "What should I check if I receive warning number 3062?",
        "What might be the cause of warning number 3062 and how can it be resolved?",
        "What should I do when warning number 3062 appears?",
        "What should I do if warning number 3062 appears?",
        "What could be the cause of warning number 3062 and how can I fix it?",
        "How can I modify the machining program to resolve warning number 3062?",
        "What should I modify in the machining program to address warning number 3062?",
        "What should I do when I receive warning number 3062?",
        "If I encounter warning number 3062, what steps should I take?",
        "What are the recommended actions to handle warning number 3062?",
        "Can you provide guidance on dealing with warning number 3062?",
        "In the event of warning number 3062 popping up, what should I do?",
        "How should I respond to the occurrence of warning number 3062?",
        "Are there specific troubleshooting steps for warning number 3062?",
        "What measures can I take to troubleshoot and resolve warning number 3062?",
        "Is there anything I can do when warning number 3062 occurs?",
        "Are there any common causes for warning number 3062, and how can they be rectified?",
        "What modifications should I consider making to the machining program to address warning number 3062?",
        "What's the recommended course of action when warning number 3062 appears?",
        "When warning number 3062 appears, which parts of the machining program should I adjust?",
        "Could you provide instructions on how to handle warning number 3062 when it appears?",
        "I received an alarm number 3062. What should I do about this?",
        "I got an alarm number 3062. What should I do about this?",
        "I received an alarm number 3062. How can I turn it off?",
        "I got an alarm number 3062. How can I turn it off?",
        "I received a warning number 3062. What should I do about this?",
        "I got a warning number 3062. What should I do about this?",
        "I received a warning number 3062. How can I turn it off?",
        "I got a warning number 3062. How can I turn it off?",
        "What should I do when warning number 3062 is raised?",
        "What should I do when alarm number 3062 is raised?",
        "What should I do if alarm number 3062 is raised?",
        "What action should I take when alarm number 3062 occurs?",
        "What action should I take if warning number 3062 occurs?",
        "What action should I take if alarm number 3062 occurs?",
        "How can I resolve alarm number 3062?",
        "What steps should I follow when alarm number 3062 is displayed?",
        "What steps should I follow if warning number 3062 is displayed?",
        "What steps should I follow if alarm number 3062 is displayed?",
        "How can I fix alarm number 3062?",
        "What should I check if I receive alarm number 3062?",
        "What should I check when I receive warning number 3062?",
        "What should I check when I receive alarm number 3062?",
        "What might be the cause of alarm number 3062 and how can it be resolved?",
        "What should I do when alarm number 3062 appears?",
        "What should I do if alarm number 3062 appears?",
        "What could be the cause of alarm number 3062 and how can I fix it?",
        "What should I do when I receive alarm number 3062?",
    ]

    records = []

    for num in warning_num:
        # The lookup does not depend on the question, so do it once per number.
        matching_row = ca[(ca['alarm_1'] == num) | (ca['alarm_2'] == num)]
        if matching_row.empty:
            # No cause/action known for this number: emit nothing rather than
            # reusing the answer of the previously processed alarm.
            continue

        cause = matching_row['cause'].iloc[0]
        action = matching_row['action'].iloc[0]
        answer = cause + ' ' + action

        for question in questions:
            records.append({'question': question.replace('3062', str(num)), 'answer': answer})

    qca_qa = pd.DataFrame(records, columns=['question', 'answer'])

    return qca_qa, questions

def get_semantic_val(df):
    """Sample a third of the question/answer pairs of every input row.

    Each row of `df` holds a newline-separated, numbered list in the columns
    ``questions`` and ``answers``. When both lists have a different number of
    lines, answer lines that do not start with the number expected at their
    position are appended (with a newline) to the preceding answer. The pairs
    are matched by position up to the shorter list, the "N. " prefix is cut
    off, and ``int(pair_count / 3)`` pairs are drawn with ``random.sample``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the string columns ``questions`` and ``answers``.

    Returns
    -------
    pandas.DataFrame
        Columns ``question`` and ``answer``. The index restarts at 0 for the
        pairs of every input row (it is not renumbered).
    """
    frames = []

    for i in range(len(df)):
        question_list = df['questions'].iloc[i].split('\n')
        answer_list = df['answers'].iloc[i].split('\n')

        if len(question_list) != len(answer_list):
            a_i = 0
            while a_i < len(answer_list):
                pattern = str(a_i + 1) + '. '
                # The first line has no predecessor to merge into, so it is
                # always kept (a_i - 1 would otherwise wrap to the last line).
                if a_i > 0 and not answer_list[a_i].startswith(pattern):
                    answer_list[a_i - 1] += '\n' + answer_list[a_i]
                    # Delete by position: list.remove() would delete the first
                    # line with equal text, which may be an earlier answer.
                    del answer_list[a_i]
                else:
                    a_i += 1

        qa_len = min(len(question_list), len(answer_list))

        new_size = int(qa_len/3)
        random_numbers = random.sample(range(0, qa_len), new_size)

        questions = []
        answers = []
        for qa_index in range(qa_len):
            # "1. ".."9. " are three characters long, "10. " and up four.
            prefix_len = 3 if qa_index < 9 else 4
            questions.append(question_list[qa_index][prefix_len:])
            answers.append(answer_list[qa_index][prefix_len:])

        data = {
            'question': [questions[k] for k in random_numbers],
            'answer': [answers[k] for k in random_numbers],
        }
        frames.append(pd.DataFrame(data, columns=['question', 'answer']))

    if not frames:
        return pd.DataFrame(columns=['question', 'answer'])

    # One concatenation at the end; per-row indices are kept as before.
    return pd.concat(frames, axis=0)

def get_cause_action_val(ca):
    """Build a random subset of the alarm-text question/answer pairs.

    For every row of `ca`, two distinct numbers are drawn from 0..6 with
    ``random.sample``; each number selects an opening phrase (number modulo 4)
    and a closing phrase (number modulo 5) that are wrapped around the row's
    ``cause`` text. The answer is the row's ``action``. The result is also
    written to ``cause_action_val.csv`` in ``data_dir``.

    Parameters
    ----------
    ca : pandas.DataFrame
        Frame with the string columns ``cause`` and ``action``.

    Returns
    -------
    pandas.DataFrame
        Columns ``question`` and ``answer`` (2 rows per input row).
    """
    q_str_1 = ["I received an alarm says, \"", "I got an alarm says, \"",  "I received a warning says, \"", "I got a warning says, \""]
    q_str_2 = ["\" What should I do about this?","\" How can I turn it off?",  "\" How can I resolve this issue?", "\" What actions can I take to fix it?", "\" What should I check when this warning occurs?"]

    # Collect plain records and build the frame once instead of concatenating
    # a one-row frame per question.
    records = []
    for _, row in ca.iterrows():
        for random_number in random.sample(range(0, 7), 2):
            s1 = q_str_1[random_number % len(q_str_1)]
            s2 = q_str_2[random_number % len(q_str_2)]
            records.append({'question': s1 + row['cause'] + s2, 'answer': row['action']})

    ca_qa = pd.DataFrame(records, columns=["question", "answer"])

    filepath = os.path.join(data_dir, 'cause_action_val.csv')
    ca_qa.to_csv(filepath, index=False)
    return ca_qa


def get_question_cause_action_val(ca, warning_num, questions):
    """Build a random subset of the alarm-number question/answer pairs.

    For every number of `warning_num`, a third of the question templates
    (``len(questions) // 3``) is drawn with ``random.sample`` and the
    placeholder ``3062`` is replaced by the number. The answer is the
    ``cause`` and ``action`` of the first row of `ca` whose ``alarm_1`` or
    ``alarm_2`` equals the number, joined by a space. Numbers without a
    matching row are skipped, so no question is paired with another alarm's
    answer. The result is also written to ``question_cause_action_val.csv``
    in ``data_dir``.

    Parameters
    ----------
    ca : pandas.DataFrame
        Frame with the columns ``cause``, ``action``, ``alarm_1``, ``alarm_2``.
    warning_num : iterable
        Alarm numbers to generate questions for.
    questions : list
        Question templates containing the placeholder ``3062``.

    Returns
    -------
    pandas.DataFrame
        Columns ``question`` and ``answer``.
    """
    records = []

    for num in warning_num:
        # Sample first so the random stream is consumed once per number,
        # whether or not the number has a matching row.
        random_numbers = random.sample(range(len(questions)), len(questions)//3)
        selected_questions = [questions[i] for i in random_numbers]

        # The lookup does not depend on the question, so do it once per number.
        matching_row = ca[(ca['alarm_1'] == num) | (ca['alarm_2'] == num)]
        if matching_row.empty:
            # Previously the answer of the last matched alarm was reused here
            # (or a NameError was raised if nothing had matched yet).
            continue

        cause = matching_row['cause'].iloc[0]
        action = matching_row['action'].iloc[0]
        answer = cause + ' ' + action

        for question in selected_questions:
            records.append({'question': question.replace('3062', str(num)), 'answer': answer})

    qca_qa = pd.DataFrame(records, columns=['question', 'answer'])

    filepath = os.path.join(data_dir, 'question_cause_action_val.csv')
    qca_qa.to_csv(filepath, index=False)
    return qca_qa


def create_semantic_train(pdf):
    """Generate the semantic training set in five question/answer rounds.

    In every round the columns ``context``, ``questions`` and ``answers`` are
    (re)written on `pdf` using get_questions() and get_answers(), rows with
    missing values are dropped, and the numbered lists are split with
    get_semantic_qa(). Each round is saved as ``semantic_train<N>.csv`` and
    the concatenation of all rounds as ``semantic_train_total.csv`` in
    ``data_dir``.

    Parameters
    ----------
    pdf : pandas.DataFrame
        Frame with the string columns ``title``, ``heading`` and ``content``.
        Note that the first round adds the generated columns to this frame.

    Returns
    -------
    pandas.DataFrame
        All generated pairs with the columns ``question`` and ``answer``.
    """
    print("Generating semantic data...\n")
    semantic_train = pd.DataFrame()

    for round_number in range(1, 6):
        print(f"Generating semantic qa data #{round_number}...")

        # Prompt context: title and heading, a blank line, then the chunk text.
        pdf['context'] = pdf['title'] + "\n" + pdf['heading'] + "\n\n" + pdf['content']

        # The prompts end with "1.", so the prefix is restored on the replies.
        pdf['questions'] = pdf['context'].apply(get_questions)
        pdf['questions'] = "1. " + pdf['questions']
        pdf['answers'] = pdf.apply(get_answers, axis=1)
        pdf['answers'] = "1. " + pdf['answers']

        pdf = pdf.dropna().reset_index().drop('index', axis=1)
        semantic = get_semantic_qa(pdf)

        filename = f"semantic_train{round_number}.csv"
        semantic.to_csv(os.path.join(data_dir, filename), index=False)
        semantic_train = pd.concat([semantic_train, semantic], ignore_index=True)

        print(semantic.head())
        print(f"Successfully generated {filename}!")
        print()

    total_path = os.path.join(data_dir, 'semantic_train_total.csv')
    semantic_train.to_csv(total_path, index=False)
    print(semantic_train.tail())
    print("Successfully generated semantic data!\n")
    return semantic_train

def create_semantic_val(pdf):
    """Generate the semantic validation set in two question/answer rounds.

    In every round the columns ``context``, ``questions`` and ``answers`` are
    (re)written on `pdf` using get_questions() and get_answers(), rows with
    missing values are dropped, and the numbered lists are split with
    get_semantic_qa(). Each round is saved as ``semantic_val<N>.csv`` and the
    concatenation of all rounds as ``semantic_val_total.csv`` in ``data_dir``.

    Parameters
    ----------
    pdf : pandas.DataFrame
        Frame with the string columns ``title``, ``heading`` and ``content``.
        Note that the first round adds the generated columns to this frame.

    Returns
    -------
    pandas.DataFrame
        All generated pairs with the columns ``question`` and ``answer``.
    """
    print("Generating semantic data...\n")
    semantic_val = pd.DataFrame()

    for round_number in range(1, 3):
        print(f"Generating semantic qa data #{round_number}...")

        # Prompt context: title and heading, a blank line, then the chunk text.
        pdf['context'] = pdf['title'] + "\n" + pdf['heading'] + "\n\n" + pdf['content']

        # The prompts end with "1.", so the prefix is restored on the replies.
        pdf['questions'] = pdf['context'].apply(get_questions)
        pdf['questions'] = "1. " + pdf['questions']
        pdf['answers'] = pdf.apply(get_answers, axis=1)
        pdf['answers'] = "1. " + pdf['answers']

        pdf = pdf.dropna().reset_index().drop('index', axis=1)
        semantic = get_semantic_qa(pdf)

        filename = f"semantic_val{round_number}.csv"
        print(f"Successfully generated {filename}")
        semantic.to_csv(os.path.join(data_dir, filename), index=False)
        semantic_val = pd.concat([semantic_val, semantic], ignore_index=True)

        print(semantic.head())
        print()

    total_path = os.path.join(data_dir, 'semantic_val_total.csv')
    semantic_val.to_csv(total_path, index=False)
    print(semantic_val.tail())
    print("Successfully generated semantic data!\n")
    return semantic_val

def create_operation_train(cause_action_df, warning_num):
    """Build and save the operation (alarm handling) training set.

    Combines the pairs from get_cause_action_qa() and
    get_question_cause_action_qa(), drops rows with missing values and writes
    the result to ``operation_train.csv`` in ``data_dir``.

    Parameters
    ----------
    cause_action_df : pandas.DataFrame
        Cause/action table passed on to both generator functions.
    warning_num : list
        Alarm numbers passed on to get_question_cause_action_qa().

    Returns
    -------
    tuple
        ``(operation_train, questions)``: the combined DataFrame and the
        question templates returned by get_question_cause_action_qa().
    """
    print("Generating operation data...\n")

    by_alarm_text = get_cause_action_qa(cause_action_df)
    print("Successfully generated cause_action data!\n")

    by_alarm_number, questions = get_question_cause_action_qa(cause_action_df, warning_num)
    print("Successfully generated question_cause_action data!\n")

    operation_train = pd.concat([by_alarm_text, by_alarm_number], ignore_index=True).dropna()
    output_path = os.path.join(data_dir, 'operation_train.csv')
    operation_train.to_csv(output_path, index=False)
    print("Successfully generated operation data!\n")

    return operation_train, questions

def create_operation_val(cause_action_df, warning_num, qca_questions):
    """Build and save the operation (alarm handling) validation set.

    Combines the pairs from get_cause_action_val() and
    get_question_cause_action_val(), drops rows with missing values and
    writes the result to ``operation_val.csv`` in ``data_dir``.

    Parameters
    ----------
    cause_action_df : pandas.DataFrame
        Cause/action table passed on to both generator functions.
    warning_num : list
        Alarm numbers passed on to get_question_cause_action_val().
    qca_questions : list
        Question templates (as returned by create_operation_train()) from
        which the validation questions are sampled.

    Returns
    -------
    pandas.DataFrame
        The combined validation pairs.
    """
    print("Generating operation data...\n" )
    cause_action = get_cause_action_val(cause_action_df)
    print("Successfully generated cause_action data!\n")
    # Use the templates that were passed in; the previous code ignored the
    # parameter and read a notebook-level variable named `questions` instead.
    question_cause_action = get_question_cause_action_val(cause_action_df, warning_num, qca_questions)

    print("Successfully generated question_cause_action data!\n" )

    operation_val = pd.concat([cause_action, question_cause_action], ignore_index=True)
    operation_val = operation_val.dropna()
    filename = 'operation_val.csv'
    filepath = os.path.join(data_dir, filename)
    operation_val.to_csv(filepath, index=False)

    print("Successfully generated operation data!\n")
    return operation_val

In [11]:
pdf_filename = 'Manual-Guide-Milling-and-Turning-Manual.pdf'
pdf_filepath = os.path.join(root_dir, pdf_filename)

# Read the key from the environment so that no secret is stored in the
# notebook. It stays None when the variable is unset; the OpenAI-backed
# generation functions then print the API error and return "".
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
openai.api_key = OPENAI_API_KEY

warning_num = get_warning_num()
# Page indices 842-846 are parsed for the cause/action entries.
cause_action_df = get_cause_action_df(pdf_filepath, 842, 846, warning_num)

# The chunked manual is loaded from the cached CSV; uncomment the two lines
# below to rebuild it from the PDF.
#pdf = read_pdf(pdf_filepath)
filepath = os.path.join(data_dir, 'manual_pdf.csv')
#pdf.to_csv(filepath, index=False)
pdf = pd.read_csv(filepath)
pdf.head()

Unnamed: 0,title,heading,content,tokens
0,GENERAL,OVERVIEW OF THIS MANUAL,"This manual describes the functions of ""MANUAL...",395
1,GENERAL,READ AT FIRST,"In this chapter, you will find the explanation...",1843
2,GENERAL,ALL-IN-ONE SCREEN,"In MANUAL GUIDE i, basically, only one screen ...",565
3,GENERAL,SYMBOLS USED,"In this manual, the following conventions are ...",155
4,GENERAL,NOTES ON CREATING PROGRAMS,The notes that should be observed when creatin...,1262


In [None]:
operation, questions = create_operation_train(cause_action_df, warning_num)
# The semantic pairs are loaded from the cached CSV; uncomment to regenerate.
#semantic = create_semantic_train(pdf)
filename = 'semantic_train_total_total.csv'
filepath = os.path.join(data_dir, filename)
semantic = pd.read_csv(filepath)
# The cached file is written together with its index, which comes back as an
# 'Unnamed: 0' column. That column would be NaN for every operation row after
# the concat, so dropna() would discard all operation data. Drop it first.
semantic = semantic.drop(columns=['Unnamed: 0'], errors='ignore')
train_qa = pd.concat([semantic, operation], ignore_index=True)
train_qa = train_qa.dropna()
filepath = os.path.join(data_dir, 'manual_train.csv')
train_qa.to_csv(filepath, index=False)
print("Successfully created training data!\n\n" )

In [72]:
operation = create_operation_val(cause_action_df, warning_num, questions)
# The semantic pairs are loaded from the cached CSV; uncomment to regenerate.
#semantic = create_semantic_val(pdf)
filename = 'semantic_val_total_total.csv'
filepath = os.path.join(data_dir, filename)
semantic = pd.read_csv(filepath)
# Remove the saved index column if the file has one; errors='ignore' keeps the
# cell working when the CSV was written without an index.
semantic = semantic.drop(columns=['Unnamed: 0'], errors='ignore')
val_qa = pd.concat([semantic, operation], ignore_index=True)
val_qa = val_qa.dropna()
filepath = os.path.join(data_dir, 'manual_val.csv')
val_qa.to_csv(filepath, index=False)
print("Successfully created validation data!\n\n" )

Generating operation data...

Successfully generated cause_action data!

Successfully generated question_cause_action data!

Successfully generated operation data!

Successfully created validation data!




In [73]:
semantic

Unnamed: 0,question,answer
0,What does this manual describe?,"This manual describes the functions of ""MANUAL..."
1,"What machine models is ""MANUAL GUIDE i"" compat...","The ""MANUAL GUIDE i"" is compatible with the Se..."
2,How may the specifications and usage of MANUAL...,The specifications and usage of MANUAL GUIDE i...
3,What factors determine the functions of the CN...,The functions of the CNC machine tool system a...
4,Is it possible to cover all possible combinati...,It is impossible to cover all possible combina...
...,...,...
4676,How many pages does the FANUC MANUAL GUIDE i (...,The number of pages in the FANUC MANUAL GUIDE ...
4677,Can you provide a brief description of the too...,The tool management function for Series 30i al...
4678,Is the tool management function available on a...,"No, the tool management function may not be av..."
4679,What is the edition date of the FANUC MANUAL G...,The edition date of the FANUC MANUAL GUIDE i (...


In [74]:
print(len(semantic))

4681


In [75]:
print(len(operation))

2772


In [76]:
print(val_qa)

                                               question  \
0                       What does this manual describe?   
1     What machine models is "MANUAL GUIDE i" compat...   
2     How may the specifications and usage of MANUAL...   
3     What factors determine the functions of the CN...   
4     Is it possible to cover all possible combinati...   
...                                                 ...   
7448  What action should I take if alarm number 3598...   
7449  What should I do when I receive alarm number 3...   
7451  What should I modify in the machining program ...   

                                                 answer  
0     This manual describes the functions of "MANUAL...  
1     The "MANUAL GUIDE i" is compatible with the Se...  
2     The specifications and usage of MANUAL GUIDE i...  
3     The functions of the CNC machine tool system a...  
4     It is impossible to cover all possible combina...  
...                                                 ...  
744

In [57]:
operation = create_operation_val(cause_action_df, warning_num, questions)
operation.head()

Generating operation data...

Successfully generated cause_action data!

Successfully generated question_cause_action data!

Successfully generated operation data!



Unnamed: 0,question,answer
0,"I got a warning says, ""There is no area that c...",Modify the machining program to use a smaller ...
1,"I received a warning says, ""There is no area t...",Modify the machining program to use a smaller ...
2,"I got a warning says, ""There is no area that c...",Modify the machining program to use a smaller ...
3,"I received an alarm says, ""There is no area th...",Modify the machining program to use a smaller ...
4,"I got a warning says, ""There is no area that c...",Modify the machining program to use a smaller ...


In [14]:
print("")




In [43]:
# Merge the three cached semantic training files (each cleaned in place by
# drop_na) into one frame.
# NOTE(review): drop_na is defined in a cell that appears further down in the
# notebook; that cell has to be run before this one.
semantic_train_1 = drop_na('semantic_train_total.csv')
semantic_train_2 = drop_na('semantic_train_total_1.csv')
semantic_train_3 = drop_na('semantic_train_total_2.csv')
semantic_train = pd.concat([semantic_train_1, semantic_train_2, semantic_train_3], ignore_index=True)
print(len(semantic_train))
# Written without index=False, so the file also contains the index as an
# unnamed first column.
semantic_train.to_csv(os.path.join(data_dir, "semantic_train_total_total.csv"))

operation, questions = create_operation_train(cause_action_df, warning_num)

# Final training set: semantic pairs followed by operation pairs.
train_qa = pd.concat([semantic_train, operation], ignore_index=True)
train_qa = train_qa.dropna()
filepath = os.path.join(data_dir, 'manual_train.csv')
train_qa.to_csv(filepath, index=False)
print(len(train_qa))

Before:
1111
After:
1111
Before:
5866
After:
5866
Before:
3485
After:
3485
10462
Generating operation data...

Successfully generated cause_action data!

Successfully generated question_cause_action data!

Successfully generated operation data!

20164


In [44]:
# Merge the two cached semantic validation files (each cleaned in place by
# drop_na) into one frame.
# NOTE(review): drop_na is defined in a cell that appears further down in the
# notebook; that cell has to be run before this one.
semantic_val_1 = drop_na('semantic_val_total.csv')
semantic_val_2 = drop_na('semantic_val_total_1.csv')
semantic_val = pd.concat([semantic_val_1, semantic_val_2], ignore_index=True)
print(len(semantic_val))
# Written without index=False, so the file also contains the index as an
# unnamed first column.
semantic_val.to_csv(os.path.join(data_dir, "semantic_val_total_total.csv"))

# `questions` comes from the create_operation_train() call of the training cell.
operation = create_operation_val(cause_action_df, warning_num, questions)

# Final validation set: semantic pairs followed by operation pairs.
val_qa = pd.concat([semantic_val, operation], ignore_index=True)
val_qa = val_qa.dropna()
filepath = os.path.join(data_dir, 'manual_val.csv')
val_qa.to_csv(filepath, index=False)
print(len(val_qa))

Before:
1146
After:
1146
Before:
3535
After:
3535
4681
Generating operation data...

Successfully generated cause_action data!

Successfully generated question_cause_action data!

Successfully generated operation data!

7453


In [42]:
def drop_na(filename):
    """Strip incomplete rows from a CSV in ``data_dir`` and rewrite it in place.

    The row count is printed before and after the filter, each time as a
    label line followed by the number on its own line.

    Args:
        filename: Name of the CSV file, relative to ``data_dir``.

    Returns:
        The DataFrame left after every row containing a missing value has
        been removed. The same content is written back over the input file.
    """
    csv_path = os.path.join(data_dir, filename)
    loaded = pd.read_csv(csv_path)
    print("Before:", len(loaded), sep='\n')

    kept = loaded.dropna()
    print("After:", len(kept), sep='\n')

    # Overwrite the source file so later reads only see complete rows.
    kept.to_csv(csv_path, index=False)
    return kept

# Clean the first validation batch on disk and display what is left of it.
filename = 'semantic_val_total.csv'
d1 = drop_na(filename=filename)
d1

Before:
1146
After:
1146


Unnamed: 0,question,answer
0,What does this manual describe?,"This manual describes the functions of ""MANUAL..."
1,"What machine models is ""MANUAL GUIDE i"" compat...","The ""MANUAL GUIDE i"" is compatible with the Se..."
2,How may the specifications and usage of MANUAL...,The specifications and usage of MANUAL GUIDE i...
3,What factors determine the functions of the CN...,The functions of the CNC machine tool system a...
4,Is it possible to cover all possible combinati...,It is impossible to cover all possible combina...
...,...,...
1141,Can you explain how the Tool Management Functi...,The Tool Management Function works by allowing...
1142,Are there any training resources or tutorials ...,FANUC may provide training resources and tutor...
1143,What are the benefits of using the Tool Manage...,The benefits of using the Tool Management Func...
1144,Can I customize the Tool Management Function t...,Depending on the capabilities and customizatio...


In [46]:
# Per-column count of missing values.
# NOTE(review): no visible cell assigns `df_cleaned` at notebook level (drop_na
# only uses that name locally) — confirm it exists in a fresh kernel.
df_cleaned.isnull().sum()

question    0
answer      0
dtype: int64

In [None]:
# Spacer cell: emits a single empty line.
print()

In [45]:
def add_prefix(csv_filename, prefix="I would like to ask a question about MANUAL GUIDE i of FANUC. "):
    """Write a copy of a QA CSV whose questions all start with a fixed prefix.

    Reads ``<csv_filename>.csv`` from ``data_dir``, prepends ``prefix`` to
    every value of the ``question`` column and saves the result next to it as
    ``<csv_filename>_prefix.csv``. One prefixed question is printed as a
    preview.

    Args:
        csv_filename: File name without the ``.csv`` extension.
        prefix: Text placed in front of each question. Defaults to the
            MANUAL GUIDE i sentence used for the training/validation sets.

    Returns:
        None.
    """
    source_path = os.path.join(data_dir, csv_filename + '.csv')
    df = pd.read_csv(source_path)
    df['question'] = prefix + df['question']

    output_filename = csv_filename + '_prefix.csv'
    df.to_csv(os.path.join(data_dir, output_filename), index=False)

    # Preview the second row when there is one; fall back to the only row for
    # a single-row file and print nothing for an empty one, instead of
    # raising KeyError after the output has already been written.
    if len(df) > 1:
        print(df.loc[1, 'question'])
    elif len(df) == 1:
        print(df.loc[0, 'question'])

# Produce the prefixed copies of the training split, then the validation split.
for split_name in ('manual_train', 'manual_val'):
    add_prefix(split_name)

I would like to ask a question about MANUAL GUIDE i of FANUC. Which CNC machines is the "MANUAL GUIDE i" applicable to?
I would like to ask a question about MANUAL GUIDE i of FANUC. What machine models is "MANUAL GUIDE i" compatible with?


In [29]:
def create_semantic_train_2(pdf):
    """Run three more rounds of semantic QA generation for the training set.

    Each round rebuilds the ``context`` column from ``title``, ``heading`` and
    ``content``, fills ``questions`` via ``get_questions`` and ``answers`` via
    ``get_answers`` (both prefixed with ``"1. "``), drops rows with missing
    values and passes the frame to ``get_semantic_qa``. Rounds are saved as
    ``semantic_train6.csv`` .. ``semantic_train8.csv`` and their concatenation
    as ``semantic_train_total_2.csv``, all inside ``data_dir``.

    Args:
        pdf: DataFrame with ``title``, ``heading`` and ``content`` columns.
            The frame passed in gains ``context``, ``questions`` and
            ``answers`` columns during the first round. From then on the
            function works on its own filtered copy, so rows dropped in one
            round are not used in later rounds.

    Returns:
        DataFrame holding the QA pairs of all three rounds, re-indexed from 0.
    """
    print("Generating semantic data...\n")
    collected = pd.DataFrame()

    for batch_no in range(6, 9):
        print(f"Generating semantic qa data #{batch_no}...")

        pdf['context'] = pdf['title'] + "\n" + pdf['heading'] + "\n\n" + pdf['content']
        pdf['questions'] = "1. " + pdf['context'].apply(get_questions)
        # get_answers receives whole rows, including the prefixed questions.
        pdf['answers'] = "1. " + pdf.apply(get_answers, axis=1)

        complete_rows = pdf.dropna()
        pdf = complete_rows.reset_index().drop('index', axis=1)

        batch = get_semantic_qa(pdf)
        batch_name = f"semantic_train{batch_no}.csv"
        batch.to_csv(os.path.join(data_dir, batch_name), index=False)

        collected = pd.concat([collected, batch], ignore_index=True)
        print(len(collected))
        print(f"Successfully generated {batch_name}!")
        print()

    total_path = os.path.join(data_dir, 'semantic_train_total_2.csv')
    collected.to_csv(total_path, index=False)
    print(collected.tail())
    print("Successfully generated semantic data!\n")
    return collected

In [30]:
# Run the extra training rounds and display how many QA pairs came back.
semantic = create_semantic_train_2(pdf)
semantic.shape[0]

Generating semantic data...

Generating semantic qa data #6...
1185
Successfully generated semantic_train6.csv!

Generating semantic qa data #7...
Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)
2397
Successfully generated semantic_train7.csv!

Generating semantic qa data #8...
3554
Successfully generated semantic_train8.csv!

                                               question  \
3549  oes the Tool Management Function provide any s...   
3550                                                      
3551  re there any prerequisites or requirements for...   
3552                                                      
3553  Can I customize or configure the Tool Manageme...   

                                                 answer  
3549  he Tool Management Function provides specific ...  
3550                                                     
3551  here may be prerequisites or requirements for ...  
3552                    

3554

In [26]:
print("Merging semantic data...\n")
semantic_train = pd.DataFrame()

# Append semantic_train6.csv .. semantic_train8.csv in order, printing the
# running row count (followed by an empty line) after each file.
for a in range(3):
    filename = f"semantic_train{a + 6}.csv"
    filepath = os.path.join(data_dir, filename)
    semantic = pd.read_csv(filepath)
    semantic_train = pd.concat([semantic_train, semantic], ignore_index=True)
    print(len(semantic_train), end="\n\n")

# Save the merged frame and show its last rows as a sanity check.
filename = 'semantic_train_total_2.csv'
filepath = os.path.join(data_dir, filename)
semantic_train.to_csv(filepath, index=False)
print(semantic_train.tail())
print("Successfully generated semantic data!\n")
semantic_train.shape[0]

Generating semantic data...

1192

2362

3511

4698

5866

                                               question  \
5861  What is the date of the edition of the OPERATO...   
5862  What are the compatible FANUC machine series f...   
5863  Can you provide the name of the function in th...   
5864  Is there a drawing or diagram available for th...   
5865  What is the purpose of the Supplemental Instru...   

                                                 answer  
5861  The date of the edition of the OPERATOR'S MANU...  
5862  The compatible FANUC machine series for the To...  
5863  The name of the function in the FANUC MANUAL G...  
5864  It is not mentioned if there is a drawing or d...  
5865  The purpose of the Supplemental Instructions f...  
Successfully generated semantic data!



5866

In [27]:
# Boolean mask: True for each question that repeats an earlier one.
duplicate_check = semantic_train['question'].duplicated()

# Report whether any duplicated question exists.
print("중복된 값이 존재합니다." if duplicate_check.any() else "중복된 값이 존재하지 않습니다.")

중복된 값이 존재합니다.


In [28]:
# keep=False flags every member of a duplicated group, not only the repeats.
is_repeated_question = semantic_train.duplicated(subset='question', keep=False)
duplicates = semantic_train[is_repeated_question]

if duplicates.empty:
    print("중복된 값이 존재하지 않습니다.")
else:
    print("중복된 값이 존재합니다:")
    print(duplicates)

중복된 값이 존재합니다:
                                               question  \
2     Are there any variations in the specifications...   
4     What factors determine the functions of the CN...   
10      What is MANUAL GUIDE i and what is its purpose?   
12    How is MANUAL GUIDE i installed on a CNC machine?   
13    How can a part program be created using MANUAL...   
...                                                 ...   
5787  What is the purpose of adding a linear groove ...   
5810  What are the parameters related to Spindle sto...   
5816  Is machining simulation available for workpiec...   
5852  How is the face removal amount calculated when...   
5856  What is the purpose of the Tool Management Fun...   

                                                 answer  
2     Yes, the specifications and usage of MANUAL GU...  
4     The functions of the CNC machine tool system a...  
10    MANUAL GUIDE i is an operation guidance system...  
12    MANUAL GUIDE i is typically installed i

In [31]:
def create_semantic_val_2(pdf):
    """Run three more rounds of semantic QA generation for the validation set.

    Each round rebuilds the ``context`` column from ``title``, ``heading`` and
    ``content``, fills ``questions`` via ``get_questions`` and ``answers`` via
    ``get_answers`` (both prefixed with ``"1. "``), drops rows with missing
    values and passes the frame to ``get_semantic_qa``. Rounds are saved as
    ``semantic_val6.csv`` .. ``semantic_val8.csv`` and their concatenation as
    ``semantic_val_total_1.csv``, all inside ``data_dir``.

    Args:
        pdf: DataFrame with ``title``, ``heading`` and ``content`` columns.
            The frame passed in gains ``context``, ``questions`` and
            ``answers`` columns during the first round. From then on the
            function works on its own filtered copy, so rows dropped in one
            round are not used in later rounds.

    Returns:
        DataFrame holding the QA pairs of all three rounds, re-indexed from 0.
    """
    print("Generating semantic data...\n")
    semantic_val = pd.DataFrame()

    for a in range(3):
        # One number drives both the progress label and the file name, so the
        # log line always names the file that is actually written.
        round_no = a + 6
        print("Generating semantic qa data #" + str(round_no) + "...")
        pdf['context'] = pdf.title + "\n" + pdf.heading + "\n\n" + pdf.content
        pdf['questions'] = pdf.context.apply(get_questions)
        pdf['questions'] = "1. " + pdf.questions
        # get_answers receives whole rows, including the prefixed questions.
        pdf['answers'] = pdf.apply(get_answers, axis=1)
        pdf['answers'] = "1. " + pdf.answers
        # Rebinding `pdf` here means later rounds only see the rows that
        # survived this round's dropna.
        pdf = pdf.dropna().reset_index().drop('index', axis=1)
        semantic = get_semantic_qa(pdf)
        filename = 'semantic_val' + str(round_no) + '.csv'
        filepath = os.path.join(data_dir, filename)
        semantic.to_csv(filepath, index=False)
        semantic_val = pd.concat([semantic_val, semantic], ignore_index=True)
        print(len(semantic_val))
        print("Successfully generated " + filename + "!")
        print()

    filename = 'semantic_val_total_1.csv'
    filepath = os.path.join(data_dir, filename)
    semantic_val.to_csv(filepath, index=False)
    print(semantic_val.tail())
    print("Successfully generated semantic data!\n")
    return semantic_val

In [32]:
# Run the extra validation rounds and display how many QA pairs came back.
semantic = create_semantic_val_2(pdf)
semantic.shape[0]

Generating semantic data...

Generating semantic qa data #1...
Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)
Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)
1156
Successfully generated semantic_val6.csv!

Generating semantic qa data #2...
2354
Successfully generated semantic_val7.csv!

Generating semantic qa data #3...
Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)
3559
Successfully generated semantic_val8.csv!

                                               question  \
3554  How many pages does the FANUC MANUAL GUIDE i (...   
3555  Can you provide a brief description of the too...   
3556  Is the tool management function available on a...   
3557  What is the edition date of the FANUC MANUAL G...   
3558  Is there a specific drawing or diagram availab...   

                                              

3559

In [50]:
import pandas as pd
import random

# Path of the source CSV file.
input_csv_file = os.path.join(data_dir, 'cause_action_val.csv')

# Number of rows to pick at random.
num_rows_to_select = 15

# Load the source CSV into a pandas DataFrame.
df = pd.read_csv(input_csv_file)

# Pick row positions at random, without replacement. The request is capped at
# the number of available rows because random.sample raises ValueError when
# asked for more items than the population holds.
random_selected_rows = random.sample(range(len(df)), min(num_rows_to_select, len(df)))

# Build a new DataFrame from the chosen rows.
selected_df = df.iloc[random_selected_rows]

# Path of the CSV file that receives the sample.
output_csv_file = os.path.join(data_dir, 'sample_ca.csv')

# Write the sampled rows to disk.
selected_df.to_csv(output_csv_file, index=False)