## Extract

In [13]:
import os
from dotenv import load_dotenv
from pokemontcgsdk import RestClient, Card, Set
import pandas as pd
import requests
import json
import re
import ast


# Load variables from a local .env file (if one exists) into the process
# environment so the API key never has to be written into the notebook.
load_dotenv()

# Read the Pokémon TCG API key; os.getenv returns None when it is not set.
api_key = os.getenv('POKEMONTCG_API_KEY')
if not api_key:
    # Surface the misconfiguration here instead of part-way through the long
    # downloads below. The key is still passed on unchanged, so behaviour for
    # callers is the same as before apart from this message.
    print(
        "Warning: POKEMONTCG_API_KEY is not set; "
        "requests will be sent without an API key."
    )

# Hand the key (possibly None) to the SDK's REST client.
RestClient.configure(api_key)

In [None]:
# Column names for the export come from the SDK's Set class; the rows are the
# objects returned by Set.all(). Note that `sets_df` is the raw SDK result
# here, and `sets_df_df` is the DataFrame built from it.
fields = Set.__match_args__
sets_df = Set.all()

# Persist the untouched download so the transform step can reload it from
# disk without calling the API again.
sets_df_df = pd.DataFrame(data=sets_df, columns=fields)
sets_df_df.to_csv('setsRaw.csv', index=False)

# Report the row count and preview the first rows.
print(f"Number of sets_df: {len(sets_df_df.index)}")
print(sets_df_df.head())

Number of sets_df: 165
      id                                             images  \
0  base1  {'symbol': 'https://images.pokemontcg.io/base1...   
1  base2  {'symbol': 'https://images.pokemontcg.io/base2...   
2  basep  {'symbol': 'https://images.pokemontcg.io/basep...   
3  base3  {'symbol': 'https://images.pokemontcg.io/base3...   
4  base4  {'symbol': 'https://images.pokemontcg.io/base4...   

                                          legalities  \
0  {'unlimited': 'Legal', 'expanded': None, 'stan...   
1  {'unlimited': 'Legal', 'expanded': None, 'stan...   
2  {'unlimited': 'Legal', 'expanded': None, 'stan...   
3  {'unlimited': 'Legal', 'expanded': None, 'stan...   
4  {'unlimited': 'Legal', 'expanded': None, 'stan...   

                        name  printedTotal ptcgoCode releaseDate series  \
0                       Base           102        BS  1999/01/09   Base   
1                     Jungle            64        JU  1999/06/16   Base   
2  Wizards Black Star Promos        

In [15]:
# Fetch every card through the SDK. The call is slow (the note on the line
# below records roughly 11 minutes), which is why it sits alone in its own
# cell: the export cell that follows can be re-run without repeating it.
cards = Card.all() # 11 minute request time

In [None]:
# Column names for the raw export are taken from the SDK's Card class.
fields = Card.__match_args__
# A shortlist of columns; nothing in this cell reads it.
selected_fields = ['id', 'name', 'hp', 'types', 'attacks']

# Dump the full download (the `cards` variable from the previous cell) to
# disk without the index column.
cards_df = pd.DataFrame(data=cards, columns=fields)
cards_df.to_csv('cardsRaw.csv', index=False)

# Report the row count and preview the first rows.
print(f"Number of cards: {len(cards_df.index)}")
print(cards_df.head())

## Transform

In [None]:
# Set cleaning: flatten the nested `images` and `legalities` columns of the
# raw set export into plain scalar columns, then save the result.
sets_df = pd.read_csv('setsRaw.csv')
print(sets_df.dtypes)

# Both columns were written to CSV as the text of a Python dict, so they have
# to be parsed back into dicts before individual keys can be read.
images_parsed = sets_df['images'].apply(ast.literal_eval)
legalities_parsed = sets_df['legalities'].apply(ast.literal_eval)

# Append one column per extracted key (in this order), then drop the two
# nested source columns.
sets_df = sets_df.assign(
    symbol_url=images_parsed.apply(lambda entry: entry.get('symbol')),
    logo_url=images_parsed.apply(lambda entry: entry.get('logo')),
    standard=legalities_parsed.apply(lambda entry: entry.get('standard')),
    expanded=legalities_parsed.apply(lambda entry: entry.get('expanded')),
    unlimited=legalities_parsed.apply(lambda entry: entry.get('unlimited')),
).drop(columns=['images', 'legalities'])

sets_df.to_csv('setsCleaned.csv', index=False)

id              object
images          object
legalities      object
name            object
printedTotal     int64
ptcgoCode       object
releaseDate     object
series          object
total            int64
updatedAt       object
dtype: object


In [None]:
# Data exploration to understand how to clean the cards data:
# reload the raw card export (cardsRaw.csv) from disk.
cards_df = pd.read_csv('cardsRaw.csv')

Index(['abilities', 'artist', 'ancientTrait', 'attacks', 'cardmarket',
       'convertedRetreatCost', 'evolvesFrom', 'flavorText', 'hp', 'id',
       'images', 'legalities', 'regulationMark', 'name',
       'nationalPokedexNumbers', 'number', 'rarity', 'resistances',
       'retreatCost', 'rules', 'set', 'subtypes', 'supertype', 'tcgplayer',
       'types', 'weaknesses'],
      dtype='object')


AttributeError: 'Series' object has no attribute 'columns'

In [None]:
# Reload the raw card export so this cell starts from the data on disk.
cards_df = pd.read_csv('cardsRaw.csv')

# Empty lists, one per nested card field. Presumably the row loop that follows
# fills them -- its body is not visible here, so confirm before relying on it.
attacks_data = []
cardmarket_data = []
small_images_data = []
large_images_data = []
resistances_data = []
# No list for legalities: legalities_data is already covered in set data.
for idx, row in cards_df.iterrows():

Unnamed: 0,abilities,artist,ancientTrait,attacks,cardmarket,convertedRetreatCost,evolvesFrom,flavorText,hp,id,...,rarity,resistances,retreatCost,rules,set,subtypes,supertype,tcgplayer,types,weaknesses
0,,Kagemaru Himeno,,"[{'name': 'Second Strike', 'cost': ['Metal', '...",{'url': 'https://prices.pokemontcg.io/cardmark...,4.0,Lairon,You can tell its age by the length of its iron...,140.0,hgss4-1,...,Rare Holo,"[{'type': 'Psychic', 'value': '-20'}]","['Colorless', 'Colorless', 'Colorless', 'Color...",,"{'id': 'hgss4', 'images': {'symbol': 'https://...",['Stage 2'],Pokémon,{'url': 'https://prices.pokemontcg.io/tcgplaye...,['Metal'],"[{'type': 'Fire', 'value': '×2'}]"


In [None]:
# Card image download


def sanitize_filename(name):
    """Return ``name`` with filename-unsafe characters removed.

    The characters stripped are backslash, forward slash, asterisk, question
    mark, colon, double quote, less-than, greater-than and pipe. They are
    deleted, not replaced; every other character is kept as-is.
    """
    unsafe_chars = re.compile(r'[\\/*?:"<>|]')
    return unsafe_chars.sub("", name)

# Not read anywhere in this cell; kept so other cells that may use it still work.
sets_download = []

# Seconds to wait on the image server before giving up on a single request.
DOWNLOAD_TIMEOUT_SECONDS = 30

# Make sure the target folder exists; without it every open() below fails.
os.makedirs('images', exist_ok=True)


def _card_set_id(raw_set):
    """Return the set id for one value of the ``set`` column, or None.

    cardsRaw.csv stores ``set`` as the text of a dict
    (``"{'id': 'hgss4', 'images': ...}"``), so the id has to be parsed out
    before it can be compared. A real dict and a bare id string are accepted
    too; anything else (NaN, unparseable text) yields None.
    """
    if isinstance(raw_set, dict):
        set_id = raw_set.get('id')
    elif isinstance(raw_set, str):
        text = raw_set.strip()
        if text.startswith('{'):
            try:
                parsed = ast.literal_eval(text)
            except (ValueError, SyntaxError):
                return None
            set_id = parsed.get('id') if isinstance(parsed, dict) else None
        else:
            set_id = text
    else:
        set_id = None
    return set_id if isinstance(set_id, str) else None


def _parse_images(raw_images):
    """Return the images dict for one row, or None when it is missing.

    Missing values arrive as NaN after a CSV round trip (or as the text
    'None'); both, and any text that does not parse to a dict, give None.
    """
    if isinstance(raw_images, dict):
        return raw_images
    if not isinstance(raw_images, str) or raw_images == 'None':
        return None
    try:
        parsed = ast.literal_eval(raw_images)
    except (ValueError, SyntaxError):
        return None
    return parsed if isinstance(parsed, dict) else None


# filter for cards where the set id starts with 'sv'
# (a plain .str.startswith('sv') on the raw column can never match the dict
# text, and NaN entries would make the boolean mask invalid)
is_sv_card = pd.Series(
    [(_card_set_id(value) or '').startswith('sv') for value in cards_df['set']],
    index=cards_df.index,
    dtype=bool,
)
cards_df = cards_df[is_sv_card]

for idx, row in cards_df.iterrows():
    images = _parse_images(row['images'])
    if images is None:
        continue
    image_url = images.get('small')
    if image_url is None:
        continue

    image_name = sanitize_filename(row['name'])
    image_name = f"{image_name}_{row['id']}.jpg"
    image_path = os.path.join('images', image_name)
    if os.path.exists(image_path):
        print(f"{image_name} already exists, skipping download.")
        continue

    # Stream into a temporary file and rename it only once the download has
    # finished, so an interrupted run never leaves a truncated .jpg that a
    # later run would skip as "already exists".
    partial_path = image_path + '.part'
    try:
        with requests.get(
            image_url, stream=True, timeout=DOWNLOAD_TIMEOUT_SECONDS
        ) as response:
            if response.status_code != 200:
                print(f"Failed to download {image_name}: {response.status_code}")
                continue
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)
        os.replace(partial_path, image_path)
        print(f"Downloaded {image_name}")
    except Exception as e:
        # Best effort: report the failure and move on to the next card.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        print(f"Error downloading {image_name}: {e}")

In [29]:
import os
import imagehash
from PIL import Image
import pickle

# Function to hash an image
def hash_image(image_path):
    """Open the image at ``image_path`` and return its ``imagehash.phash`` value.

    The image is opened in a ``with`` block, so it is closed again before the
    function returns.
    """
    with Image.open(image_path) as picture:
        digest = imagehash.phash(picture)
    return digest

# Folder that is scanned for images, and the mapping that will hold
# filename -> hash for every image found in it.
folder_path = "images"
image_hashes = {}

# Walk the folder once; anything whose name does not end in a known image
# extension (compared case-insensitively) is skipped.
for filename in os.listdir(folder_path):
    if not filename.lower().endswith(('.jpg', '.jpeg', '.png')):
        continue
    image_hashes[filename] = hash_image(os.path.join(folder_path, filename))

# Serialise the whole mapping to a pickle file.
with open('image_hashes.pkl', mode='wb') as pickle_file:
    pickle.dump(image_hashes, pickle_file)

print("Hashes have been saved to Pickle format.")


Hashes have been saved to Pickle format.
