In [1]:
import os

os.chdir("../..")

In [2]:
!poetry add pytesseract opencv-python torch torchvision spacy gensim annoy


The following packages are already present in the pyproject.toml and will be skipped:

  • [36mpytesseract[39m
  • [36mopencv-python[39m
  • [36mtorch[39m
  • [36mtorchvision[39m
  • [36mspacy[39m
  • [36mgensim[39m

If you want to update it to the latest compatible version, you can use `poetry update package`.
If you prefer to upgrade it to the latest available version, you can use `poetry add package@latest`.

Nothing to add.


In [3]:
!python3 -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m[36m0:00:01[0m:01[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [16]:
import torchvision.transforms as transforms
import torchvision.models as models
import torch
from torch.utils.data import Dataset
import cv2
import pytesseract
from src.shared.image import crop_image, display_image
import spacy
from src.shared.storage import Database, ImageStorage, DataFrameStorage
import os
import gensim.downloader as gensim_downloader
import numpy as np
from PIL import Image

In [6]:
image_storage = ImageStorage("cgc/classification", db=Database.SHARED_CGC)
df_storage = DataFrameStorage("cgc/classification", db=Database.SHARED_CGC)

df = df_storage.get("cards")
df = df[df['image_path'].apply(os.path.exists)]
df.head()

Unnamed: 0.1,Unnamed: 0,cert_#,label,image_path
0,0,4091732001,1999_English_Base Set - Unlimited_6/102_Gyarad...,./db/shared-cgc/jpg/cgc/classification/0_40917...
2,2,4077007001,2016_English_Evolutions_80/108_Misty's Determi...,./db/shared-cgc/jpg/cgc/classification/0_40770...
3,3,4084351001,2000_English_Base Set 2_10/130_Mewtwo_Holo_nan,./db/shared-cgc/jpg/cgc/classification/0_40843...
4,4,4079776001,2020_English_Darkness Ablaze_020/189_Charizard...,./db/shared-cgc/jpg/cgc/classification/0_40797...
5,5,4100449001,2021_English_Fusion Strike_255/264_Genesect V_...,./db/shared-cgc/jpg/cgc/classification/0_41004...


In [7]:
def visual_embed(image):
    model = models.resnet50(pretrained=True)
    model.eval()

    transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    
    input_image = transform(image)
    input_batch = input_image.unsqueeze(0)

    with torch.no_grad():
        features = model(input_batch)
        
    return features

In [23]:
model = gensim_downloader.load('word2vec-google-news-300')

def embed_text(document):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(document)
    embeddings = [token.vector for token in doc]
    return embeddings

def text_to_vector(text):
    words = text.split()
    word_vectors = [model[w] for w in words if w in model]
    if len(word_vectors) == 0:
        return np.zeros(model.vector_size)  # Return zero vector of the correct size
    vector = np.mean(word_vectors, axis=0)
    return vector


In [140]:
class OCRDataset(Dataset):
    def __init__(self, image_paths, labels):
        self.image_paths = image_paths
        self.labels = labels

    def __len__(self):
        return len(self.image_paths)
    
    def _preprocess_crop(self, image):
        height, width = image.shape[:2]
        top_crop_height = int(height * 0.25)
        cropped_img = image[top_crop_height:, :]
        return cropped_img

    def __getitem__(self, index):
        # Get label
        label = self.labels[index]

        # Get image
        image_path = self.image_paths[index]
        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = self._preprocess_crop(image)
        image = torch.Tensor(visual_embed(image)).T

        # Process text embedding
        image_pil = Image.fromarray((image.numpy() * 255).astype(np.uint8))
        text = pytesseract.image_to_string(image_pil)
        text = torch.Tensor(text_to_vector(text)).unsqueeze(1)
        
        # Create aggregate embedding
        embedding = torch.cat([image, text], dim=0)
        return embedding, label
    
    def __iter__(self):
        for i in range(len(self)):
            yield self[i]

    
df_1999 = df[df['label'].str.contains('1999')]
label_counts = df_1999.groupby('label').size().reset_index(name='count')
filtered_labels = label_counts[label_counts['count'] > 1]
filtered_df_1999 = df_1999[df_1999['label'].isin(filtered_labels['label'])]
image_paths = list(filtered_df_1999['image_path'])
labels = list(filtered_df_1999['label'])
ocr_dataset = OCRDataset(image_paths, labels)
ocr_dataset[0]

  image_pil = Image.fromarray((image.numpy() * 255).astype(np.uint8))


(tensor([[-2.8911],
         [ 0.4824],
         [-1.8819],
         ...,
         [ 0.0000],
         [ 0.0000],
         [ 0.0000]]),
 '1999_English_Base Set - Unlimited_6/102_Gyarados_Holo_nan')

In [141]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm

X, y = [], []

for feature, label in tqdm(ocr_dataset, desc="Processing dataset"):
    try:
        X.append(feature)
        # disable extremely rare variants
        label = "_".join(label.split("_")[:-2])
        y.append(label)
    except:
        pass

  image_pil = Image.fromarray((image.numpy() * 255).astype(np.uint8))
  image_pil = Image.fromarray((image.numpy() * 255).astype(np.uint8))
  image_pil = Image.fromarray((image.numpy() * 255).astype(np.uint8))
  image_pil = Image.fromarray((image.numpy() * 255).astype(np.uint8))
  image_pil = Image.fromarray((image.numpy() * 255).astype(np.uint8))
  image_pil = Image.fromarray((image.numpy() * 255).astype(np.uint8))
  image_pil = Image.fromarray((image.numpy() * 255).astype(np.uint8))
  image_pil = Image.fromarray((image.numpy() * 255).astype(np.uint8))
  image_pil = Image.fromarray((image.numpy() * 255).astype(np.uint8))
  image_pil = Image.fromarray((image.numpy() * 255).astype(np.uint8))
  image_pil = Image.fromarray((image.numpy() * 255).astype(np.uint8))
  image_pil = Image.fromarray((image.numpy() * 255).astype(np.uint8))
  image_pil = Image.fromarray((image.numpy() * 255).astype(np.uint8))
  image_pil = Image.fromarray((image.numpy() * 255).astype(np.uint8))
  image_pil = Image.

In [167]:
unique_elements, counts = np.unique(list(map(lambda x: x[1], data)), return_counts=True)
sorted_indices = np.argsort(counts)
sorted_unique_elements = unique_elements[sorted_indices]
sorted_counts = counts[sorted_indices]

print(len([count == 1 for count in sorted_counts]))

39


In [166]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
encoded_labels = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    np.stack([d.numpy().flatten() for d in X]), 
    np.array(encoded_labels), 
    test_size=0.1,
)

In [158]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6791208791208792


In [183]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
from collections import defaultdict

le = LabelEncoder()
encoded_labels = le.fit_transform(y)

reshaped_data = np.stack([d.numpy().flatten() for d in X])

indices_dict = defaultdict(list)
for index, label in enumerate(encoded_labels):
    indices_dict[label].append(index)

train_indices = []
test_indices = []

for indices in indices_dict.values():
    # Don't do cards with less than 5 images
    if len(indices) > 5:
        train, test = train_test_split(indices, test_size=0.2)
        train_indices.extend(train)
        test_indices.extend(test)
    else:
        train_indices.extend(indices)

X_train, y_train = reshaped_data[train_indices], encoded_labels[train_indices]
X_test, y_test = reshaped_data[test_indices], encoded_labels[test_indices]


In [184]:
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7660332541567696
