In [1]:
from bs4 import BeautifulSoup
from fnmatch import fnmatch
from PIL import Image
from PIL import ImageFile
from pathlib import Path
from shutil import copyfile

import base64
import imagehash
import mysql.connector
import os
import re

In [2]:
# GENERAL CONFIGS

# Pillow switch that allows truncated image files to be loaded.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# == Database config ==
# Connection settings are read from the environment so credentials do not have
# to live in the notebook. The fallbacks are the values that were previously
# hard-coded, so an unconfigured environment behaves exactly as before.
conn = mysql.connector.connect(
  host=os.environ.get("MYSQL_HOST", "localhost"),
  user=os.environ.get("MYSQL_USER", "root"),
  password=os.environ.get("MYSQL_PASSWORD", "root")
)

# Single shared cursor used by every cell below.
cursor = conn.cursor()

DB_NAME = "blp_dataset"
TABLE_NAME = "products_tmp_bluesky"


# == Files config ==
# Root of the archive; each direct sub-folder is scanned for product pages.
# Override with BLUESKY_ARCHIVE_DIR to run on another machine.
MAIN_FOLDER = os.environ.get("BLUESKY_ARCHIVE_DIR", r"D:\dnmarchives\dnmarchives\bluesky")

In [3]:
# Creating environment
# Identifiers cannot be bound as query parameters, so DB_NAME / TABLE_NAME are
# interpolated; both are notebook constants, not user input.
cursor.execute("CREATE DATABASE IF NOT EXISTS {} DEFAULT CHARACTER SET 'utf8'".format(DB_NAME))
cursor.execute("USE {}".format(DB_NAME))

# Start from an empty table on every run: this cell DESTROYS previously imported rows.
cursor.execute("DROP TABLE IF EXISTS {}".format(TABLE_NAME))

In [4]:
# DDL for the temporary products table. The table name comes from TABLE_NAME so
# it stays in sync with the config cell; the generated SQL is otherwise unchanged.
# has_image defaults to 'has_image' and illegal defaults to 't'.
table_schema = (
    "CREATE TABLE IF NOT EXISTS {} ("
    "  id INT(11) AUTO_INCREMENT PRIMARY KEY NOT NULL,"
    "  name TEXT NOT NULL,"
    "  description TEXT,"
    "  price VARCHAR(50),"
    "  market_name VARCHAR(255),"
    "  seller_name TEXT,"
    "  has_image VARCHAR(25) NOT NULL DEFAULT 'has_image',"
    "  illegal ENUM('t', 'f') NOT NULL DEFAULT 't'"
    ") ENGINE=InnoDB").format(TABLE_NAME)

cursor.execute(table_schema)

In [5]:
def update_image(product_id, value):
    """Overwrite the ``has_image`` column of one product row and commit.

    Uses the notebook-level ``cursor`` and ``conn`` objects.

    Args:
        product_id: Value matched against the ``id`` column.
        value: New content for the ``has_image`` column.
    """
    params = (value, product_id)
    cursor.execute(
        "UPDATE products_tmp_bluesky SET has_image = %s WHERE id = %s",
        params,
    )
    conn.commit()

In [6]:
def get_content(content):
    """Return the text of ``content`` flattened onto a single line.

    Tabs, carriage returns and newlines become spaces, runs of spaces are
    collapsed to one, and leading whitespace is removed. Trailing whitespace
    is left in place.

    Args:
        content: Object exposing a ``.text`` attribute, or ``None``.

    Returns:
        The cleaned string, or ``""`` when ``content`` is ``None``.
    """
    if content is None:
        return ""

    single_line = re.sub(r'[\t\r\n]', ' ', content.text)
    return re.sub(' +', ' ', single_line).lstrip()

In [7]:
# Import every product page found under MAIN_FOLDER/<sub-folder>/product/view:
# one row is inserted per page that has a title or a description, and the
# page's embedded image (if any) is decoded to <image_folder>/<product_id>.jpeg.
folders = [x for x in Path(MAIN_FOLDER).iterdir() if x.is_dir()]

# Destination of the decoded images; raw string so the backslashes are literal.
image_folder = Path(r"D:\images\_bluesky")

for fo in folders:
    folder = fo / "product" / "view"
    folder_image = Path(fo)

    if not folder.exists():
        continue

    for file in os.listdir(folder):
        page_path = folder / file

        # Anything that is not a regular file cannot be opened as a page.
        if not page_path.is_file():
            continue

        # Context manager so the handle is closed even if parsing fails.
        # NOTE(review): no parser is named, so Beautiful Soup picks one from
        # what is installed; pin it once the intended parser is confirmed.
        with open(page_path, encoding='utf8') as page_file:
            page_content = BeautifulSoup(page_file.read())

        ## GET DESCRIPTION
        # get_content() returns "" for a missing node, so no None check is needed.
        raw_desc = page_content.find("div", attrs={"style": "margin-top:0px; "})
        description = get_content(raw_desc).replace("Description: ", "")

        ## GET TITLE
        raw_title = page_content.find("div", attrs={"class": "prod_title"})
        title = get_content(raw_title)

        # Pages yielding neither a title nor a description are not stored.
        if description == "" and title == "":
            continue

        # Insert product into database
        query = "INSERT INTO products_tmp_bluesky (name, description, market_name, illegal) VALUES (%s, %s, %s, %s)"
        values = (title, description, "Blue Sky", "t")

        cursor.execute(query, values)
        conn.commit()

        # The auto-increment id doubles as the image file name.
        product_id = cursor.lastrowid

        ## GET IMAGE
        # Only the first <img> of the page is considered.
        img_content = page_content.find("img")
        # .get() so an <img> without a src attribute is treated as "no image"
        # instead of raising KeyError and aborting the whole import.
        image = img_content.get("src") if img_content is not None else None

        if image is None:
            print(f"Product {product_id} has no image")
            update_image(product_id, "no_image")
            continue

        try:
            # Drop the JPEG data-URI prefix and decode what is left as base64.
            data = image.replace("data:image/jpeg;base64,", "")
            imgdata = base64.b64decode(data)

            img_filename = image_folder / f"{product_id}.jpeg"

            with open(img_filename, 'wb') as f:
                f.write(imgdata)
        except (ValueError, OSError):
            # ValueError covers base64 decoding failures (binascii.Error is a
            # subclass); OSError covers failures writing the file. Narrower
            # than a bare except, so Ctrl-C still interrupts the run.
            print(f"Image {product_id} has error. Except")
            print(f"{folder_image}\\{file}")
            update_image(product_id, "no_image")

Image 2 has error. Except
D:\dnmarchives\dnmarchives\bluesky\2014-01-06\00d4c99a
Image 38 has error. Except
D:\dnmarchives\dnmarchives\bluesky\2014-01-06\0d92bbd0
Image 54 has error. Except
D:\dnmarchives\dnmarchives\bluesky\2014-01-06\11d96716
Image 61 has error. Except
D:\dnmarchives\dnmarchives\bluesky\2014-01-06\14411bb9
Image 150 has error. Except
D:\dnmarchives\dnmarchives\bluesky\2014-01-06\2cb64d19
Image 198 has error. Except
D:\dnmarchives\dnmarchives\bluesky\2014-01-06\3cbbd7dd
Image 248 has error. Except
D:\dnmarchives\dnmarchives\bluesky\2014-01-06\544068f5
Image 279 has error. Except
D:\dnmarchives\dnmarchives\bluesky\2014-01-06\5ec554aa
Image 292 has error. Except
D:\dnmarchives\dnmarchives\bluesky\2014-01-06\60fc1736
Image 443 has error. Except
D:\dnmarchives\dnmarchives\bluesky\2014-01-06\99e71017
Image 536 has error. Except
D:\dnmarchives\dnmarchives\bluesky\2014-01-06\bd7e6b97
Image 547 has error. Except
D:\dnmarchives\dnmarchives\bluesky\2014-01-06\bf9761b0
Image 568