In [1]:
import pandas as pd
import sqlite3

In [2]:
df = pd.read_csv('./dataset_500_to_2300_map_final.csv')

In [3]:
df.shape

(2381, 5)

In [4]:
conn = sqlite3.connect("tip_gai_2500.db")
conn2 = sqlite3.connect("tip_gai_2500_2.db")

dbdf = pd.read_sql_query("SELECT * FROM caption", conn)
dbdf2 = pd.read_sql_query("SELECT * FROM caption", conn2)

In [5]:
dbdf.shape, dbdf2.shape

((2500, 6), (2500, 6))

In [6]:
df.head()

Unnamed: 0,Thumbnail,File Name,NA,Is in 500 collection?,Map File Name
0,,P00001.jpg,,1,
1,,P00002.jpg,,0,P00001.jpg
2,,P00004.jpg,,1,
3,,P00005.jpg,,1,
4,,P00006.jpg,,1,


In [7]:
df2 = df[(df['NA'] != 1) & (df['Is in 500 collection?'] == 0) & (~df['Map File Name'].isna())]
df2.shape

(1862, 5)

In [8]:
df2.drop(columns=['Thumbnail', 'NA', 'Is in 500 collection?'], inplace=True)
df2.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.drop(columns=['Thumbnail', 'NA', 'Is in 500 collection?'], inplace=True)


Unnamed: 0,File Name,Map File Name
1,P00002.jpg,P00001.jpg
7,P00009.jpg,P00008.jpg
10,P00012.jpg,P00011.jpg
13,P00015.jpg,P00011.jpg
18,P00022.jpg,P00021.jpg


In [9]:
dbdf.head()

Unnamed: 0,id,file_id,caption,user,is_error,is_occluded
0,1,P00001.jpg,"Two knives, one placed on top of other, in a b...",anshul,0,1
1,2,P00001.jpg,A bag with knives.\nTwo shrap knives in a back...,soumen,0,1
2,3,P00001.jpg,Security discovered a concealed knife in the p...,soumen,0,1
3,5,P00004.jpg,Two knives are hidden inside a backpack.,soumen,0,0
4,6,P00005.jpg,Two knives are hidden inside a backpack overla...,soumen,0,0


# SQLite Update

## Insert map-data to database

In [10]:
user = 'mapped'
sql = "INSERT INTO 'caption'  ('file_id', 'caption', 'user', 'is_error', 'is_occluded') VALUES (?, ?, ?, ?, ?);"
dbdf2['user'] = user
cur = conn2.cursor()
cur.execute("BEGIN TRANSACTION;")
for src_file, map_file in df2.apply(lambda x:x.values, axis=1):
    print("\r",src_file, map_file, " ", end="\b")
    fdf = dbdf2[dbdf2['file_id'] == map_file].copy()
    fdf.loc[:, ['file_id']] = src_file
    data = fdf[['file_id', 'caption', 'user', 'is_error', 'is_occluded']].values
    cur.executemany(sql, data)
cur.execute("COMMIT;")
conn2.commit()
dbdf2 = pd.read_sql_query("SELECT * FROM caption", conn2)
dbdf2.shape

 P08923.jpg P02468.jpg 

(11805, 6)

In [11]:
conn2.close()
# conn2.close()

# File Creation with `Ray`

In [9]:
import ray
from collections import defaultdict
import pathlib
import threading

In [66]:
# create locks
file_locks = defaultdict(lambda: threading.Lock())
@ray.remote
def file_writer(filename, captions):
    with file_locks[filename]:
        with open(filename, 'a+') as fp:
            fp.seek(0)
            lines = fp.readlines()
            for caption in captions:
                if caption + "\n" in lines:
                    pass
                elif len(caption.split(" ")) < 2:
                    pass
                else:
                    fp.write(caption.replace("\n", " ").strip() + "\n")

In [67]:
ray.init()
task_counter  = 0

caption_root = pathlib.Path('./sixray_500/train/captions')

for src_file, map_file in df2.apply(lambda x:x.values, axis=1):
    print("\r",src_file, map_file, " ", end="\b")
    # print(dbdf[dbdf['file_id'] == map_file]['caption'].values)
    caption_filename = src_file.replace('.jpg', '.txt')
    print(caption_root / caption_filename)
    file_writer.remote(caption_root / caption_filename, dbdf[dbdf['file_id'] == map_file]['caption'].values)
    task_counter += 1

print("Waiting for Ray...", end="")
ray.wait([], num_returns=task_counter)
print("Shutting-down Ray")
ray.shutdown()
print("Done")

2023-06-07 19:14:53,533	INFO worker.py:1625 -- Started a local Ray instance.


 P00002.jpg P00001.jpg sixray_500\train\captions\P00002.txt
 P00009.jpg P00008.jpg sixray_500\train\captions\P00009.txt
 P00012.jpg P00011.jpg sixray_500\train\captions\P00012.txt
 P00015.jpg P00011.jpg sixray_500\train\captions\P00015.txt
 P00022.jpg P00021.jpg sixray_500\train\captions\P00022.txt
 P00038.jpg P00033.jpg sixray_500\train\captions\P00038.txt
 P00044.jpg P00033.jpg sixray_500\train\captions\P00044.txt
 P00045.jpg P00042.jpg sixray_500\train\captions\P00045.txt
 P00046.jpg P00033.jpg sixray_500\train\captions\P00046.txt
 P00048.jpg P00033.jpg sixray_500\train\captions\P00048.txt
 P00049.jpg P00042.jpg sixray_500\train\captions\P00049.txt
 P00051.jpg P00033.jpg sixray_500\train\captions\P00051.txt
 P00052.jpg P00033.jpg sixray_500\train\captions\P00052.txt
 P00053.jpg P00042.jpg sixray_500\train\captions\P00053.txt
 P00054.jpg P00033.jpg sixray_500\train\captions\P00054.txt
 P00055.jpg P00042.jpg sixray_500\train\captions\P00055.txt
 P00065.jpg P00042.jpg sixray_500\train\