In [1]:
# utils_vlm_move.py
# Process input command, use multimodal large model for image recognition, control robotic gripper to pick and move objects

# print('Luo Research Lab')

from utils_robot import *
from utils_asr import *
from utils_vlm import *

import time

def vlm_move(PROMPT='Please place the green block on the hand', input_way='keyboard'):
    '''
    Use multimodal large model for image recognition, control the gripper to pick and move objects.

    The routine runs a fixed pipeline: reset the arm servos, capture a
    top-view image, send the image plus the command to the vision model,
    post-process the answer into start/end pixel coordinates, convert those
    to robotic arm coordinates, and finally perform the pick-and-move.

    PROMPT: natural-language command forwarded to the multimodal model.
    input_way: 'speech' for speech input, 'keyboard' for keyboard input.
        NOTE: currently unused -- the interactive input code in Step 2 is
        commented out, so PROMPT is always taken from the argument.

    Returns None.

    Raises RuntimeError if the multimodal model call fails on every attempt
    (the last underlying exception is chained as the cause).

    NOTE(review): pca, mc, GPIO, cv2 and the helper functions used here are
    not defined in this file; presumably they are provided by the star
    imports from utils_robot / utils_asr / utils_vlm -- confirm.
    '''

    print('Using multimodal large model for image recognition, controlling gripper to pick and move objects')

    print('Resetting the robotic arm')
    pwm_values = [1500, 1500, 1500, 1500, 1610, 1500]  # Specified PWM pulse widths, one per servo channel

    for i, pwm in enumerate(pwm_values):
        pca.setServoPulse(i, pwm)  # Set PWM pulse width for the corresponding channel
        print(f'Setting PWM pulse width for channel {i} to {pwm}')

    time.sleep(3)  # Pause for 3 seconds to ensure the robotic arm completes the action

    ## Step 1: Complete hand-eye calibration
    # Only announced here; no calibration code runs in this function.
    print('Step 1: Complete hand-eye calibration')

    ## Step 2: Issue the command
    # Interactive command acquisition is disabled; kept for reference.
    # PROMPT_BACKUP = 'Please place the green block on the hand' # Default command

    # if input_way == 'keyboard':
    #     PROMPT = input('Step 2: Enter command')
    #     if PROMPT == '':
    #         PROMPT = PROMPT_BACKUP
    # elif input_way == 'speech':
    #     record()  # Start recording
    #     PROMPT = speech_recognition()  # Speech recognition
    print('Step 2, the issued command is:', PROMPT)

    ## Step 3: Capture top-view image
    print('Step 3: Capture top-view image')
    top_view_shot(check=False)

    ## Step 4: Input the image to the multimodal vision model
    print('Step 4: Input the image to the multimodal vision model')
    # presumably the file written by top_view_shot -- verify against utils_robot
    img_path = 'temp/vl_now.jpg'

    max_attempts = 4  # Same number of tries as the original `n = 1; while n < 5` loop
    last_error = None
    for n in range(1, max_attempts + 1):
        try:
            print(f'    Attempting to access the multimodal model, attempt {n}')
            # Use the same img_path that post-processing reads, so the two steps cannot diverge
            result = yi_vision_api(PROMPT, img_path=img_path)
            print('    Multimodal model call successful!')
            print(result)
            break
        except Exception as e:
            print('    Error in data structure returned by the multimodal model, retrying', e)
            last_error = e  # `e` is unbound after the except block, so keep a reference
    else:
        # Every attempt failed: stop here with the real cause instead of
        # hitting an UnboundLocalError on `result` in Step 5.
        raise RuntimeError(
            f'Multimodal model call failed after {max_attempts} attempts'
        ) from last_error

    ## Step 5: Post-processing and visualization of the model output
    print('Step 5: Post-processing and visualization of the model output')
    START_X_CENTER, START_Y_CENTER, END_X_CENTER, END_Y_CENTER = post_processing_viz(result, img_path, check=True)

    ## Step 6: Convert pixel coordinates to robotic arm coordinates
    print('Step 6: Hand-eye calibration, converting pixel coordinates to robotic arm coordinates')
    # Start point in robotic arm coordinates
    START_X_MC, START_Y_MC = eye2hand(START_X_CENTER, START_Y_CENTER)
    # End point in robotic arm coordinates
    END_X_MC, END_Y_MC = eye2hand(END_X_CENTER, END_Y_CENTER)

    ## Step 7: Use the gripper pump to pick and move the object
    print('Step 7: Using the gripper to pick and move the object')
    pump_move(mc=mc, XY_START=[START_X_MC, START_Y_MC], XY_END=[END_X_MC, END_Y_MC])

    ## Step 8: Cleanup
    print('Step 8: Task completed')
    GPIO.cleanup()            # Release GPIO pin channel
    cv2.destroyAllWindows()   # Close all OpenCV windows
    # exit()


ModuleNotFoundError: No module named 'fcntl'