In [1]:
from bs4 import BeautifulSoup
from fnmatch import fnmatch
from PIL import Image
from PIL import ImageFile
from pathlib import Path
from shutil import copyfile

import imagehash
import mysql.connector
import os
import re

In [2]:
# GENERAL CONFIGS

# Pillow flag: tolerate truncated image files when loading instead of failing.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# == Database config ==
# Connection settings may be overridden through environment variables so that
# credentials do not have to be stored in the notebook. The defaults are the
# values this notebook has always used, so an unconfigured run is unchanged.
conn = mysql.connector.connect(
  host=os.environ.get("BLP_DB_HOST", "localhost"),
  user=os.environ.get("BLP_DB_USER", "root"),
  password=os.environ.get("BLP_DB_PASSWORD", "root")
)

# Single shared cursor used by every cell below.
cursor = conn.cursor()

DB_NAME = "blp_dataset"
TABLE_NAME = "products_tmp_2"


# == Files config ==
# Root folder whose sub-directories are scanned for a "shop" folder of saved
# product pages. Override with BLP_MAIN_FOLDER to run on another machine.
MAIN_FOLDER = os.environ.get("BLP_MAIN_FOLDER", r"D:\dnmarchives\novo\armory")

In [3]:
# Creating environment
cursor.execute("CREATE DATABASE IF NOT EXISTS {} DEFAULT CHARACTER SET 'utf8'".format(DB_NAME))
cursor.execute("USE {}".format(DB_NAME))

cursor.execute("DROP TABLE IF EXISTS products_tmp_2")

In [4]:
# DDL for the products table. The table name is taken from TABLE_NAME instead
# of being repeated literally, so the config constant is the single source of
# truth. `has_image` defaults to 'has_image' and `illegal` defaults to 't'.
table_schema = (
    "CREATE TABLE IF NOT EXISTS {table} ("
    "  id INT(11) AUTO_INCREMENT PRIMARY KEY NOT NULL,"
    "  name TEXT NOT NULL,"
    "  description TEXT,"
    "  price VARCHAR(50),"
    "  market_name VARCHAR(255),"
    "  seller_name TEXT,"
    "  has_image VARCHAR(25) NOT NULL DEFAULT 'has_image',"
    "  illegal ENUM('t', 'f') NOT NULL DEFAULT 't'"
    ") ENGINE=InnoDB"
).format(table=TABLE_NAME)

cursor.execute(table_schema)

In [5]:
def update_image(product_id, value):
    """Set the ``has_image`` column of one product row and commit.

    Args:
        product_id: Value matched against the ``id`` column.
        value: New content for ``has_image`` (callers in this notebook pass
            ``"no_image"``).

    Uses the notebook-level ``cursor`` and ``conn``; the change is committed
    immediately.
    """
    # The table name comes from the config constant; the row values are bound
    # as parameters by the driver.
    query = "UPDATE {} SET has_image = %s WHERE id = %s".format(TABLE_NAME)
    cursor.execute(query, (value, product_id))
    conn.commit()

In [6]:
def get_content(content):
    """Return the text of ``content`` flattened onto a single line.

    Args:
        content: Any object exposing a ``.text`` string attribute, or ``None``.

    Returns:
        ``""`` when ``content`` is ``None``. Otherwise the text with every
        tab, carriage return and newline turned into a space, runs of spaces
        collapsed to one, and leading whitespace removed. Trailing whitespace
        is left in place.
    """
    if content is None:
        return ""

    # Line breaks and tabs become plain spaces first, so the collapse step
    # below can merge them with any neighbouring spaces.
    single_line = re.sub(r'[\t\r\n]', ' ', content.text)
    collapsed = re.sub(' +',' ', single_line)

    return collapsed.lstrip()

In [7]:
# Walk every sub-directory of MAIN_FOLDER, parse each saved product page found
# in its "shop" folder, insert the product into the database and, when the
# product image is present on disk, copy it to IMAGES_DIR as "<product id>.jpg".
# Products without a usable image are flagged through update_image().

# Destination folder for the copied product images.
IMAGES_DIR = Path(r"D:\images\p")

folders = [x for x in Path(MAIN_FOLDER).iterdir() if x.is_dir()]

for fo in folders:
    # Pages live in "<snapshot>/shop"; image paths are resolved against the
    # snapshot folder itself.
    folder = fo / "shop"
    folder_image = fo

    if not folder.exists():
        continue

    for file in os.listdir(folder):
        page_path = folder / file

        # Only regular files can be parsed; skip nested directories.
        if not page_path.is_file():
            continue

        # Read through a context manager so the handle is closed right away
        # instead of being left for the garbage collector.
        # NOTE(review): no parser is named, so bs4 picks whichever one is
        # installed; consider pinning one for reproducible parsing.
        with open(page_path, encoding='utf8') as page_file:
            page_content = BeautifulSoup(page_file.read())

        ## GET DESCRIPTION
        # get_content() returns "" for a missing element.
        raw_desc = page_content.find("div", attrs={"id": "tab-description"})
        description = get_content(raw_desc)

        ## GET TITLE
        raw_title = page_content.find("title")
        title = get_content(raw_title)

        ## GET PRICE
        price = ""
        raw_price = page_content.find("div", attrs={"class": "price"})

        if raw_price is not None:
            # Prefer the "price-new" span when present, otherwise use the
            # whole price block.
            price_new = raw_price.find("span", attrs={"class": "price-new"})

            if price_new is not None:
                price = get_content(price_new)
            else:
                price = get_content(raw_price)

            price = price.replace('Price:', '')

        # A page with neither a title nor a description is not stored.
        if description == "" and title == "":
            continue

        # Insert product into database
        query = (
            "INSERT INTO {} (name, description, market_name, price, illegal) "
            "VALUES (%s, %s, %s, %s, %s)"
        ).format(TABLE_NAME)
        values = (title, description, "Armory", price, "t")

        cursor.execute(query, values)
        conn.commit()

        product_id = cursor.lastrowid

        ## GET IMAGE
        img_content = page_content.find("img", attrs={"id": "image"})

        # .get() instead of ["src"]: an <img> without a src attribute is
        # treated like a missing image rather than raising KeyError.
        img = img_content.get("src") if img_content is not None else None

        if img is None:
            print(f"Product {product_id} has no image")
            update_image(product_id, "no_image")
            continue

        # Turn the absolute image URL into a path relative to the snapshot.
        # NOTE(review): the third replace can never match because the second
        # one has already removed its host prefix, so "shop/" stays in the
        # path for that host. Order kept as-is; confirm the on-disk layout
        # before reordering.
        img = img.replace("http://armoryx7kvdq3jds.onion/shop/", "")
        img = img.replace("http://armory34omvkkmgr.onion/", "")
        img = img.replace("http://armory34omvkkmgr.onion/shop/", "")

        # Plain string concatenation on purpose: the "/" path operator would
        # discard folder_image whenever img is itself a rooted path.
        img_path = Path(f"{folder_image}/{img}")

        if not img_path.exists():
            print(f"Product {product_id} path image not found")
            update_image(product_id, "no_image")
            continue

        try:
            # copy images to new directory, renaming with product id
            print(f"Image {product_id} exists... creating")

            dst_dir = IMAGES_DIR / f"{product_id}.jpg"
            copyfile(img_path, dst_dir)
        except OSError as exc:
            # Copy failures are reported and flagged, not fatal; unrelated
            # exceptions (e.g. KeyboardInterrupt) are no longer swallowed.
            print(f"Image {product_id} has error. Except: {exc}")
            update_image(product_id, "no_image")

Image 1 exists... creating
Image 2 exists... creating
Image 3 exists... creating
Image 4 exists... creating
Image 5 exists... creating
Image 6 exists... creating
Image 7 exists... creating
Image 8 exists... creating
Image 9 exists... creating
Image 10 exists... creating
Image 11 exists... creating
Image 12 exists... creating
Image 13 exists... creating
Image 14 exists... creating
Image 15 exists... creating
Image 16 exists... creating
Image 17 exists... creating
Image 18 exists... creating
Image 19 exists... creating
Image 20 exists... creating
Image 21 exists... creating
Image 22 exists... creating
Image 23 exists... creating
Image 24 exists... creating
Image 25 exists... creating
Image 26 exists... creating
Image 27 exists... creating
Image 28 exists... creating
Image 29 exists... creating
Image 30 exists... creating
Image 31 exists... creating
Image 32 exists... creating
Image 33 exists... creating
Image 34 exists... creating
Image 35 exists... creating
Image 36 exists... creating
I

Image 289 exists... creating
Image 290 exists... creating
Product 291 has no image
Image 292 exists... creating
Image 293 exists... creating
Image 294 exists... creating
Image 295 exists... creating
Product 296 path image not found
Product 297 path image not found
Product 298 path image not found
Product 299 path image not found
Product 300 path image not found
Product 301 path image not found
Product 302 path image not found
Product 303 path image not found
Image 304 exists... creating
Product 305 has no image
Image 306 exists... creating
Image 307 exists... creating
Image 308 exists... creating
Image 309 exists... creating
Image 310 exists... creating
Image 311 exists... creating
Image 312 exists... creating
Image 313 exists... creating
Image 314 exists... creating
Image 315 exists... creating
Image 316 exists... creating
Image 317 exists... creating
Image 318 exists... creating
Image 319 exists... creating
Image 320 exists... creating
Image 321 exists... creating
Image 322 exists...