In [1]:
# Create a model from a vision encoder model and a text encoder model

In [1]:
!pip install datasets



In [2]:
!pip install ipyplot

Collecting ipyplot
  Downloading ipyplot-1.1.1-py3-none-any.whl (13 kB)
Collecting shortuuid (from ipyplot)
  Downloading shortuuid-1.0.11-py3-none-any.whl (10 kB)
Installing collected packages: shortuuid, ipyplot
Successfully installed ipyplot-1.1.1 shortuuid-1.0.11


### 处理数据 load 数据

In [5]:
## 将二进制数据先写入到文件系统中 

import os
from PIL import Image
from io import BytesIO
import torch
from torchvision.transforms import ToTensor
# 读取 Arrow 文件并转换为 Pandas DataFrame:
import pyarrow as pa
import pandas as pd

def preprocess(arrow_file, image_directory, max_rows=300,
               csv_image_root='/opt/ml/input/data/training'):
    """Dump the images of an Arrow file to disk and write a path/caption CSV.

    The Arrow file is expected to provide the columns ``image`` (raw image
    bytes), ``image_id`` (used as the file name) and ``caption`` (a sequence
    whose first element is kept as the caption string).

    Images are written to ``<image_directory>/<name>/<image_id>`` and the CSV
    to ``<image_directory>/<name>.csv``, where ``<name>`` is the Arrow file's
    base name up to its first dot.

    Args:
        arrow_file: Path to the Arrow IPC file to read.
        image_directory: Local directory that receives the image folder and
            the CSV file. Created if missing.
        max_rows: Number of leading rows to process. ``None`` processes the
            whole file. Defaults to 300.
        csv_image_root: Directory prefix recorded in the CSV's ``image_path``
            column. Defaults to the SageMaker "training" channel directory;
            pass ``image_directory`` to record paths that resolve locally.

    Returns:
        None. Progress is reported with a single printed line.
    """
    folder_name = os.path.basename(arrow_file).split('.')[0]
    folder = os.path.join(image_directory, folder_name)
    folder_path_name = os.path.join(csv_image_root, folder_name)

    # Creates image_directory as well when it does not exist yet.
    os.makedirs(folder, exist_ok=True)

    with pa.memory_map(arrow_file, 'r') as source:
        arrow_table = pa.ipc.RecordBatchFileReader(source).read_all()
        # Trim before converting so only the rows we keep are copied to pandas.
        if max_rows is not None:
            arrow_table = arrow_table.slice(0, max_rows)
        df = arrow_table.to_pandas()

    # Keep only the first caption so the column holds plain strings.
    df['caption'] = df['caption'].apply(lambda x: x[0])

    # Write every image to disk and remember the path to record in the CSV.
    image_paths = []
    for image_id, image_data in zip(df['image_id'], df['image']):
        with open(os.path.join(folder, image_id), 'wb') as file:
            file.write(image_data)
        image_paths.append(os.path.join(folder_path_name, image_id))
    df['image_path'] = image_paths

    # Persist only the columns the training script consumes.
    images_captions_df = df[['image_path', 'caption']].copy()
    csv_file = os.path.join(image_directory, folder_name + '.csv')
    images_captions_df.to_csv(csv_file, index=False)

    # Report the file that was actually processed (the function argument).
    print("<<< process finished for {}!".format(arrow_file))
    

In [6]:
# Convert the train, test and validation splits into the same output directory.
image_directory = '../images'

for input_file in (
    '../data/LUPerson_clip_128w_phrase_arrow/LUPerson_train.arrow',
    '../data/LUPerson_clip_128w_phrase_arrow/LUPerson_test.arrow',
    '../data/LUPerson_clip_128w_phrase_arrow/LUPerson_val.arrow',
):
    preprocess(input_file, image_directory)



<<< process finished for {}! ../data/LUPerson_clip_128w_phrase_arrow/LUPerson_train.arrow
<<< process finished for {}! ../data/LUPerson_clip_128w_phrase_arrow/LUPerson_test.arrow
<<< process finished for {}! ../data/LUPerson_clip_128w_phrase_arrow/LUPerson_val.arrow


# Upload to S3

In [7]:

# Recursively copy the local ../images directory (CSV files and per-split
# image folders) to the clip/ prefix of the S3 bucket.
!aws s3 cp ../images s3://sagemaker-us-west-2-726335585155/clip/ --recursive

upload: ../images/LUPerson_test.csv to s3://sagemaker-us-west-2-726335585155/clip/LUPerson_test.csv
upload: ../images/LUPerson_test/01_01_0372_00001055.jpg to s3://sagemaker-us-west-2-726335585155/clip/LUPerson_test/01_01_0372_00001055.jpg
upload: ../images/LUPerson_test/01_02_0176_00000090.jpg to s3://sagemaker-us-west-2-726335585155/clip/LUPerson_test/01_02_0176_00000090.jpg
upload: ../images/LUPerson_test/01_02_0228_00000345.jpg to s3://sagemaker-us-west-2-726335585155/clip/LUPerson_test/01_02_0228_00000345.jpg
upload: ../images/LUPerson_test/00_00_0351_00000024.jpg to s3://sagemaker-us-west-2-726335585155/clip/LUPerson_test/00_00_0351_00000024.jpg
upload: ../images/LUPerson_test/01_02_0265_00000634.jpg to s3://sagemaker-us-west-2-726335585155/clip/LUPerson_test/01_02_0265_00000634.jpg
upload: ../images/LUPerson_test/01_01_0410_00000200.jpg to s3://sagemaker-us-west-2-726335585155/clip/LUPerson_test/01_01_0410_00000200.jpg
upload: ../images/LUPerson_test/01_01_0578_00000889.jpg to s

In [8]:
import boto3
import sagemaker
import os
from sagemaker import get_execution_role

# Create the SageMaker session and look up the execution role; `role` is
# passed to the HuggingFace estimator in the training cell.
sess = sagemaker.Session()
role = get_execution_role()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [9]:
# Show the name of the session's default S3 bucket.
sess.default_bucket()

'sagemaker-us-west-2-726335585155'

#### VisionTextDualEncoder fine-tuning

#### Create a model from a vision encoder model and a text encoder model

In [None]:
import sagemaker
from sagemaker.huggingface import HuggingFace

# S3 prefix holding the CSV files and image folders uploaded earlier in this notebook.
training_input_path = 's3://sagemaker-us-west-2-726335585155/clip'

# Arguments handed to run_clip.py. The CSV paths point into the directory of
# the "training" input channel inside the training container.
hyperparameters = {
    # Model and data locations
    'output_dir': '/opt/ml/model',
    'model_name_or_path': 'openai/clip-vit-base-patch16',
    'train_file': '/opt/ml/input/data/training/LUPerson_train.csv',
    'validation_file': '/opt/ml/input/data/training/LUPerson_val.csv',
    'test_file': '/opt/ml/input/data/training/LUPerson_test.csv',
    'image_column': 'image_path',
    'caption_column': 'caption',
    # Trainer behaviour
    'remove_unused_columns': False,
    'save_total_limit': 3,
    'num_train_epochs': 1,
    'do_eval': True,
    'do_train': True,
    'freeze_text_model': True,
    'max_seq_length': 77,
    'dataloader_drop_last': True,
    'overwrite_output_dir': True,
    # Optimisation
    'weight_decay': 0.1,
    'warmup_steps': '0',
    'learning_rate': 5e-5,
    'per_device_train_batch_size': 64,
    'per_device_eval_batch_size': 64,
}

# Optional SageMaker data-parallel configuration (currently disabled):
# distribution = {'smdistributed': {'dataparallel': {'enabled': True}}}

# Estimator settings collected in one place: training script, instance
# type/count, framework versions, IAM role and job name prefix.
estimator_settings = dict(
    entry_point='run_clip.py',      # training script
    source_dir='./',                # directory that contains the script
    instance_type='ml.g5.2xlarge',
    instance_count=1,
    transformers_version='4.26.0',
    pytorch_version='1.13.1',
    py_version='py39',
    role=role,
    base_job_name='clip-pretrain',
    hyperparameters=hyperparameters,
    # distribution=distribution,
)
huggingface_estimator = HuggingFace(**estimator_settings)

# Start the training job. The channel is named explicitly; "training" is also
# the channel name the SDK assigns when fit() receives a bare S3 URI.
huggingface_estimator.fit({'training': training_input_path})

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: clip-pretrain-2024-01-11-02-31-32-898


2024-01-11 02:31:33 Starting - Starting the training job...
2024-01-11 02:31:49 Starting - Preparing the instances for training......
2024-01-11 02:32:55 Downloading - Downloading input data......
2024-01-11 02:33:38 Downloading - Downloading the training image.......

# Local test

In [159]:
# Load three objects from the local path model/clip-vit-base-patch16-finetuned:
# the full CLIP model, its processor, and a standalone vision encoder.
from transformers import CLIPProcessor, CLIPModel,CLIPVisionModel
model = CLIPModel.from_pretrained("model/clip-vit-base-patch16-finetuned")
processor = CLIPProcessor.from_pretrained("model/clip-vit-base-patch16-finetuned")
vision_model = CLIPVisionModel.from_pretrained("model/clip-vit-base-patch16-finetuned")

In [152]:
import ipyplot
import pandas as pd
import torch

# Read the image paths and captions of the test split.
test_image_df = pd.read_csv('images/LUPerson_test.csv')
test_image = test_image_df['image_path'].to_list()
test_caption = test_image_df['caption'].to_list()
# Example of a caption list that could be scored instead:
# text=["A middle-aged short hair male who is standing. Wearing black jacket with coloured. Has dark long pants on.",'test']

# Open the first 100 images and tokenise/preprocess them with their captions.
images = [Image.open(path) for path in test_image[0:100]]
inputs = processor(
    text=test_caption[0:100],
    images=images,
    return_tensors="pt",
    padding=True,
)

# Forward pass only; no gradients are needed for evaluation.
with torch.no_grad():
    outputs = model(**inputs)

# Caption-vs-image similarity scores, one row per caption.
logits_per_text = outputs.logits_per_text
# Softmax along dim 0, i.e. each column is normalised across the captions.
probs = logits_per_text.softmax(dim=0)

print(logits_per_text.shape, logits_per_text)


torch.Size([100, 100]) tensor([[31.2081, 23.3035, 24.1448,  ..., 15.8613,  5.2755,  9.6467],
        [21.0539, 24.8181, 23.6474,  ..., 11.7418,  7.8775, 11.9312],
        [22.0883, 23.8736, 27.7374,  ..., 14.2900, 14.1927, 13.6320],
        ...,
        [18.4473, 21.1418, 23.8520,  ..., 23.8294, 14.9038, 22.9525],
        [13.6411, 20.1180, 22.6118,  ..., 17.3493, 31.3786, 21.3941],
        [19.6209, 22.1684, 22.6392,  ..., 14.6688, 14.8618, 27.0563]])


In [145]:
# Top-k text-to-image retrieval accuracy: caption i counts as correct when
# image i is among the k highest-scoring images in row i of logits_per_text.
TOP_K = 3

# Never ask for more candidates than there are images.
k = min(TOP_K, logits_per_text.shape[1])
values, indices = logits_per_text.topk(k, dim=1)  # both are (num_captions, k)
preds = indices

# Captions and images come from the same CSV rows, so the correct image for
# caption i is image i. Sized from the logits so any sample count works.
labels = torch.arange(logits_per_text.shape[0], device=preds.device)

# True for every caption whose matching image index is among its top-k.
correct_predictions = (preds == labels.unsqueeze(1)).any(dim=1)

# Fraction of captions that retrieved their own image within the top k.
accuracy = correct_predictions.sum().item() / len(labels)

print(f'Accuracy: {accuracy}')

Accuracy: 0.78


In [81]:
# Length of the first row of the CLIP output's image_embeds.
len(outputs['image_embeds'][0])

512

In [92]:
# Length of the first row of pooler_output inside the CLIP output's vision_model_output.
len(outputs['vision_model_output']['pooler_output'][0])

768

In [84]:
# Display the complete model output object.
outputs

CLIPOutput(loss=None, logits_per_image=tensor([[11.5890,  2.8339,  2.3849,  ..., -0.6595, -2.8207,  1.1888],
        [ 3.4156,  4.7572,  2.8731,  ...,  1.1436, -1.4130,  1.1165],
        [ 3.5148,  4.3144,  6.3653,  ...,  4.3751,  4.5354,  1.6987],
        ...,
        [-2.2685, -3.7664, -4.4802,  ...,  4.4641,  0.1583, -3.9238],
        [-4.7729, -2.9080, -0.3997,  ...,  1.0822, 10.2024, -1.7678],
        [-2.8430, -3.1859, -0.9697,  ...,  4.3737,  2.7105,  7.6862]]), logits_per_text=tensor([[11.5890,  3.4156,  3.5148,  ..., -2.2685, -4.7729, -2.8430],
        [ 2.8339,  4.7572,  4.3144,  ..., -3.7664, -2.9080, -3.1859],
        [ 2.3849,  2.8731,  6.3653,  ..., -4.4802, -0.3997, -0.9697],
        ...,
        [-0.6595,  1.1436,  4.3751,  ...,  4.4641,  1.0822,  4.3737],
        [-2.8207, -1.4130,  4.5354,  ...,  0.1583, 10.2024,  2.7105],
        [ 1.1888,  1.1165,  1.6987,  ..., -3.9238, -1.7678,  7.6862]]), text_embeds=tensor([[ 4.3958e-02,  8.4228e-03,  3.1122e-02,  ..., -4.8751e-

In [93]:
# top_images, top_scores = [], []

# for score, index in zip(values, indices):
#     top_images.append(images[int(index.numpy())])
#     score = score.numpy().tolist()
#     top_scores.append(round(score, 3))
    
# print (f"Scores: {top_scores}")
# ipyplot.plot_images(top_images, img_width=300)

In [161]:
# Extract pooled vision-encoder features for the first two test images.
# Dedicated names are used so the CLIP `inputs`/`outputs` produced by the
# evaluation cell are not overwritten and cells inspecting them can be re-run.
vision_inputs = processor(images=images[0:2], return_tensors="pt")

# Inference only: skip building the autograd graph.
with torch.no_grad():
    vision_outputs = vision_model(**vision_inputs)

feat = vision_outputs.pooler_output

In [164]:
# Length of the first pooled feature vector.
len(feat[0])

768

In [167]:
# Check which Python type the opened images have.
type(images[0])

PIL.JpegImagePlugin.JpegImageFile

In [None]:
len(images[0:2]