In [3]:
import sqlite3
import pandas as pd

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [4]:
!ls -al

total 11264
drwxrwxrwx 1 soumen soumen    4096 May 23 12:40 .
drwxrwxrwx 1 soumen soumen    4096 May 23 11:44 ..
-rwxrwxrwx 1 soumen soumen      55 May  9 13:41 .gitignore
drwxrwxrwx 1 soumen soumen    4096 May 23 11:56 .ipynb_checkpoints
-rwxrwxrwx 1 soumen soumen     675 May 15 22:37 README.md
-rwxrwxrwx 1 soumen soumen   49172 May 22 17:28 bulk_upload_soumen_2.csv
-rwxrwxrwx 1 soumen soumen  114691 May 22 17:53 caption.csv
-rwxrwxrwx 1 soumen soumen  108253 May 23 11:43 caption_analysis.ipynb
-rwxrwxrwx 1 soumen soumen    7709 May 18 12:50 caption_soumen.csv
-rwxrwxrwx 1 soumen soumen   12010 May 17 23:03 caption_soumen.xlsx
-rwxrwxrwx 1 soumen soumen    7021 May 18 13:54 caption_soumen_clean.csv
-rwxrwxrwx 1 soumen soumen    7111 May 22 17:57 copy_files.py
-rwxrwxrwx 1 soumen soumen   20808 May 18 13:31 data_images_500.csv
-rwxrwxrwx 1 soumen soumen   98942 May 17 16:18 data_images_all.csv
-rwxrwxrwx 1 soumen soumen   41772 May 18 15:20 dataset_easy_lookup-backup.csv

In [5]:
# load the caption form sqlite
dbpath = 'tip_gai_20230522_2136.db'
conn = sqlite3.connect(dbpath)
df = pd.read_sql_query("SELECT caption FROM caption", conn)

In [6]:
# pre-cleaning the captions
df['caption'] = df['caption'].apply(lambda x: x.strip().strip(".").strip())

In [7]:
import re
def clean(text):
    # remove punctuation, space, linefeed etc.
    text = re.sub(r'[^\w\s\']',' ', text)
    text = re.sub(r'[ \n]+', ' ', text)
    return text.strip().lower()

def create_fasttext_data(text):
    with open("./temp_text_data.txt", "w", encoding="utf-8") as fp:
        fp.write(clean(text))
    return "./temp_text_data.txt"

In [13]:
# create a blob of unique-cleaned captions
caption_data = ".".join(df.caption.unique())
caption_data[:500]

"Two knives, one placed on top of other, in a bag pack hidden behind some wires and cluttered with other objects.A bag with knives.\nTwo shrap knives in a backpack.Security discovered a concealed knife in the passenger's bag.Two knives are hidden inside a backpack.Two knives are hidden inside a backpack overlapping each other.A bag contains a sharp knife at the bottom with a few electronics items.Two knives are hidden inside a luggage scattered around.Two knives are hidden inside a luggage.A troll"

In [15]:
data_path = create_fasttext_data(caption_data)
data_path

'./temp_text_data.txt'

In [22]:
# train custom vector model
import fasttext
from easydict import EasyDict as edict

cfg = edict()
cfg.dim=1024
cfg.algorithm = "skipgram" # "cbow"
cfg.epoch = 50
cfg.lr = 0.01
model = fasttext.train_unsupervised(data_path, cfg.algorithm, dim=cfg.dim, thread=4, epoch=cfg.epoch, lr=cfg.lr)
print("Done")

Done


In [24]:
model.save("fasttext_{}_{}.bin".format(cfg.algorithm, cfg.dim))

AttributeError: '_FastText' object has no attribute 'save'

In [23]:
print(cfg)
model.get_nearest_neighbors('knife')

{'dim': 1024, 'algorithm': 'skipgram', 'epoch': 50, 'lr': 0.01}


[(0.9999958872795105, 'knifes'),
 (0.9999924302101135, 'several'),
 (0.9999924302101135, 'laptop'),
 (0.9999923706054688, 'containing'),
 (0.9999921917915344, 'highly'),
 (0.9999920129776001, 'umbrella'),
 (0.9999918341636658, 'backpack'),
 (0.9999918341636658, 'toolbox'),
 (0.9999916553497314, 'consist'),
 (0.9999916553497314, 'consists')]

```
{'dim': 300, 'algorithm': 'skipgram', 'epoch': 20, 'lr': 0.05}
[(0.9996108412742615, 'kitchen'),
 (0.9994402527809143, 'two'),
 (0.9993472695350647, 'containing'),
 (0.9990079402923584, 'hidden'),
 (0.998786211013794, 'umbrella'),
 (0.9986904263496399, 'backpack'),
 (0.9983470439910889, 'including'),
 (0.9983364939689636, 'hand'),
 (0.9982928037643433, 'toolbox'),
 (0.9982638955116272, 'hiding')]
 ```

In [33]:
# pretrained model
fasttext_model = "/mnt/c/Users/dndlssardar/Downloads/Fasttext/cc.en.300.bin"
pre_model = fasttext.load_model(fasttext_model)
pre_model.get_nearest_neighbors('knife')



[(0.8358478546142578, 'knive'),
 (0.812025249004364, 'knives'),
 (0.8031113147735596, 'knife.The'),
 (0.7948669791221619, 'knife.I'),
 (0.7896084189414978, 'knifes'),
 (0.7808818221092224, 'knife.'),
 (0.7503802180290222, 'knife-'),
 (0.738475501537323, 'kinfe'),
 (0.7219622135162354, 'pocketknife'),
 (0.7085967659950256, 'penknife')]