# Setup

In [None]:
import pyarrow
import json
import random

import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
import numpy as np

from pathlib import Path
from sklearn import svm
from sklearn.model_selection import train_test_split

In [None]:
METADATA_PATH = "../data/mock" 
METADATA_FILENAME = "metadata.parquet"
METADATA_FIELDS = ["id", "path_to_image", "label"]
EMBEDDINGS_PATH = "../data/mock" #same only for mock
EMBEDDINGS_FILENAME = "embeddings.parquet"
EMBDDINGS_FIELDS = ["id", "embedings"]
TRAIN_PATH = "../data/mock/train/"
TEST_PATH = "../data/mock/test/"

In [3]:
SYMBOLS_PATH = "symbols.json"

In [40]:
pd.set_option("display.max_columns", None)

# Mock data creating

In [42]:
french_words = [
    "bonjour", "fromage", "Château", "soleil", "Voiture", "papillon",
    "Liberté", "montagne", "Forêt", "rivière", "Étoile", "histoire",
    "Musique", "amour", "Ciel", "fleur", "Espoir", "mer", "Parfum", "Neige",
    "Lumière", "printemps", "Ombre", "temps", "Vent", "orage", "Pluie", "nuage",
    "Oiseau", "chemin", "Voyage", "rêve", "Vie", "mort", "Joie", "tristesse",
    "Silence", "voix", "Regard", "sourire", "Colline", "vallée", "Prairie",
    "champ", "Jardin", "forgeron", "Ville", "village", "Maison", "porte",
    "Fenêtre", "chaise", "Table", "livre", "Page", "stylo", "Crayon", "papier",
    "Montre", "horloge", "Seconde", "minute", "Heure", "jour", "Nuit", "semaine",
    "Mois", "année", "Enfant", "adulte", "Femme", "homme", "Ami", "amie", "Voisin",
    "étranger", "Soldat", "roi", "Reine", "prince", "Princesse", "Peuple", "loi",
    "Justice", "paix", "Guerre", "armée", "Lutte", "travail", "Repos", "pensée",
    "Idée", "vérité", "Mensonge", "Route", "pont", "Port", "bateau", "Avion",
    "train", "Cheval", "chien", "Chat", "Poisson", "abeille", "Serpent", "loup"
]


In [None]:
def create_mock_data(number_of_samples = 20, embeddings_size = 128):
    n = number_of_samples
    df = pd.DataFrame({
        "id": [i for i in range(n)],
        "path_to_image": [f"word/image_{i:03d}.png" for i in range(n)],
        "label": [random.choice(french_words) for _ in range(n)],
        "embedding": [np.random.rand(embeddings_size).tolist() for _ in range(n)]
    })

    return df

In [44]:
def save_mock_to_parquet(df, output_dir):
    Path(output_dir).mkdir(exist_ok=True)

    meta_df = df[["id", "path_to_image", "label"]]
    emb_df = df[["id", "embedding"]]

    meta_path = Path(output_dir) / "metadata.parquet"
    emb_path = Path(output_dir) / "embeddings.parquet"

    meta_df.to_parquet(meta_path, engine="pyarrow", index=False)
    emb_df.to_parquet(emb_path, engine="pyarrow", index=False)

    print(f"Saved: {meta_path} - {emb_path}")


In [45]:
mock_df = create_mock_data()
save_mock_to_parquet(mock_df, METADATA_PATH)

Saved: ..\data\mock\metadata.parquet - ..\data\mock\embeddings.parquet


# Load embeddings and labels

In [46]:
def full_data_loading_from_parquet(path_to_data, fields_to_extract):
    df = pq.read_table(path_to_data, columns=fields_to_extract).to_pandas()
    
    return df

In [47]:
def batch_data_loading_from_parquet(path_to_data, fields_to_exctract):
    pass

In [48]:
def load_json(json_path):
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    return data

In [49]:
labels_df = full_data_loading_from_parquet(METADATA_PATH + "/metadata.parquet", METADATA_FIELDS)

In [63]:
embeddings_df = full_data_loading_from_parquet(EMBEDDINGS_PATH + "/embeddings.parquet", EMBDDINGS_FIELDS)

ArrowInvalid: No match for FieldRef.Name(embedings) in id: string
embedding: list<element: double>
__fragment_index: int32
__batch_index: int32
__last_in_fragment: bool
__filename: string

In [50]:
labels_df

Unnamed: 0,id,path_to_image,label
0,img_000,word/image_000.png,sourire
1,img_001,word/image_001.png,fromage
2,img_002,word/image_002.png,Enfant
3,img_003,word/image_003.png,Ami
4,img_004,word/image_004.png,Justice
5,img_005,word/image_005.png,Étoile
6,img_006,word/image_006.png,papillon
7,img_007,word/image_007.png,semaine
8,img_008,word/image_008.png,Guerre
9,img_009,word/image_009.png,Jardin


In [51]:
symbols_list = load_json(SYMBOLS_PATH)

In [52]:
len(symbols_list)

94

# Data labeling

In [53]:
def mark_symbols(text, symbols):
    return {s: int(s in text) for s in symbols}

In [54]:
marked_df = labels_df["label"].apply(lambda x: mark_symbols(x, symbols_list))
marked_df = pd.DataFrame(list(marked_df))
final_df = pd.concat([labels_df, marked_df], axis=1)

In [55]:
final_df

Unnamed: 0,id,path_to_image,label,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z,À,Â,Æ,Ç,É,È,Ê,Ë,Î,Ï,Ô,Œ,Ù,Û,Ü,Ÿ,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,à,â,æ,ç,é,è,ê,ë,î,ï,ô,œ,ù,û,ü,ÿ,0,1,2,3,4,5,6,7,8,9
0,img_000,word/image_000.png,sourire,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,img_001,word/image_001.png,fromage,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,img_002,word/image_002.png,Enfant,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,img_003,word/image_003.png,Ami,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,img_004,word/image_004.png,Justice,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,img_005,word/image_005.png,Étoile,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,img_006,word/image_006.png,papillon,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,img_007,word/image_007.png,semaine,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,img_008,word/image_008.png,Guerre,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,img_009,word/image_009.png,Jardin,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# Feature extraction

In [65]:
def train_test_split_for_symbol(df, symbol, useful_columns, test_size = 0.2, random_state = 42):
    useful_columns_df = df[useful_columns]
    target_column = df[symbol]

    symbol_associated_df = pd.concat([useful_columns_df, target_column], keys=["id"])
    
    unique_values = target_column.dropna().unique()

    if len(unique_values) >= 2:
        train_df, test_df = train_test_split(
            symbol_associated_df, test_size=test_size, random_state=random_state, shuffle=True, stratify=target_column
        )
    else:
        print("Only one type of label")

    return train_df.reset_index(drop=True), test_df.reset_index(drop=True)


In [None]:
# TODO: embeddings add to loop of spliting, connected with metadata and select features for training :((

In [None]:
for symbol in symbols_list:
    train_df, test_df = train_test_split_for_symbol(final_df, symbol, useful_columns=METADATA_FIELDS)
    train_df.to_csv(TRAIN_PATH + f"train_for_{symbol}")
    test_df.to_csv(TEST_PATH + f"test_for_{symbol}")

In [66]:
train_for_a_df, test_for_a_df = train_test_split_for_symbol(final_df, "a", METADATA_FIELDS) 

  symbol_associated_df = pd.concat([useful_columns_df, target_column], keys=["id"])


In [68]:
train_for_a_df

Unnamed: 0,id,path_to_image,label
0,img_019,word/image_019.png,Mois
1,img_002,word/image_002.png,Enfant
2,img_001,word/image_001.png,fromage
3,img_014,word/image_014.png,Serpent
4,img_010,word/image_010.png,bateau
5,img_003,word/image_003.png,Ami
6,img_000,word/image_000.png,sourire
7,img_007,word/image_007.png,semaine
8,img_008,word/image_008.png,Guerre
9,img_004,word/image_004.png,Justice


# Modelling

In [None]:
# need embeddings, fuck 

In [None]:
svm_model = svm.SVC()
svm_model.fit()

NameError: name 'svm' is not defined