In [1]:
from bs4 import BeautifulSoup
from fnmatch import fnmatch
from PIL import Image
from PIL import ImageFile
from pathlib import Path
from shutil import copyfile

import imagehash
import mysql.connector
import os
import re

In [2]:
# GENERAL CONFIGS
# Allow Pillow to load image files whose data is truncated instead of raising.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# == Database config ==
# Credentials are taken from the environment so they do not have to live in
# the notebook; the fallbacks keep the previous local-development defaults.
conn = mysql.connector.connect(
  host=os.environ.get("MYSQL_HOST", "localhost"),
  user=os.environ.get("MYSQL_USER", "root"),
  password=os.environ.get("MYSQL_PASSWORD", "root")
)

# Single shared cursor used by every later cell of the notebook.
cursor = conn.cursor()

# Schema and table that receive the scraped products.
DB_NAME = "blp_dataset"
TABLE_NAME = "products_tmp_freebay"


# == Files config ==
# Root of the crawl archive; each sub-directory is scanned for an "item" folder.
MAIN_FOLDER = r"D:\dnmarchives\dnmarchives\freebay"

In [3]:
# Creating environment
# Make sure the target database exists, then select it for this session.
create_db_sql = f"CREATE DATABASE IF NOT EXISTS {DB_NAME} DEFAULT CHARACTER SET 'utf8'"
use_db_sql = f"USE {DB_NAME}"

for bootstrap_sql in (create_db_sql, use_db_sql):
    cursor.execute(bootstrap_sql)

In [4]:
# DDL for the staging table that receives one row per scraped product page.
# The table name comes from TABLE_NAME so the config cell stays the single
# source of truth (it previously duplicated the literal name here).
#   name / description : cleaned page title and description text
#   market_name        : marketplace label written by the import loop
#   seller_name        : seller text taken from the item table
#   illegal            : 't'/'f' flag, defaults to 't'
table_schema = (
    f"CREATE TABLE IF NOT EXISTS {TABLE_NAME} ("
    "  id INT(11) AUTO_INCREMENT PRIMARY KEY NOT NULL,"
    "  name TEXT NOT NULL,"
    "  description TEXT,"
    "  market_name VARCHAR(255),"
    "  seller_name TEXT,"
    "  illegal ENUM('t', 'f') NOT NULL DEFAULT 't'"
    ") ENGINE=InnoDB")

cursor.execute(table_schema)

In [5]:
def get_content(content):
    """Return the text of a parsed element as a single normalized line.

    Args:
        content: object exposing a ``.text`` string attribute, or ``None``.

    Returns:
        ``""`` when ``content`` is ``None``. Otherwise the element text with
        every tab, carriage return and newline turned into a space, runs of
        spaces collapsed to one, and leading whitespace removed (a single
        trailing space may remain).
    """
    if content is None:
        return ""

    # Flatten line breaks and tabs first so the space-collapse below sees them.
    single_line = re.sub(r'[\t\r\n]', ' ', content.text)
    return re.sub(' +', ' ', single_line).lstrip()

In [6]:
# Walk every sub-directory of MAIN_FOLDER, parse each saved page found in its
# "item" folder, insert the product into MySQL and copy the page's first
# image to IMAGES_DST_DIR, renamed to the new product id.
original_images = []

# Destination directory for the copied product images.
IMAGES_DST_DIR = Path(r"D:\images\_freebay")

# Built once; values are always passed as bound parameters.
INSERT_PRODUCT_SQL = (
    f"INSERT INTO {TABLE_NAME} "
    "(name, description, market_name, seller_name, illegal) "
    "VALUES (%s, %s, %s, %s, %s)"
)


def _extract_seller(page):
    """Return the seller text from the item table of a parsed page.

    The seller is read from the second cell of the first row of the
    ``table.item_table`` element, with any "(0)" marker removed. Returns
    "" when the table, its first row, or that cell is missing, instead of
    raising IndexError on unexpected markup.
    """
    item_table = page.find("table", attrs={"class": "table item_table"})
    if item_table is None:
        return ""

    rows = item_table.find_all("tr")
    if not rows:
        return ""

    cells = rows[0].find_all("td")
    if len(cells) < 2:
        return ""

    return get_content(cells[1]).replace("(0)", "")


def _copy_product_image(page, snapshot_dir, product_id):
    """Copy the first <img> of a page to IMAGES_DST_DIR as "<product_id>.jpg".

    Does nothing when the page has no <img> or the tag has no/empty ``src``.
    The ``src`` is resolved relative to ``snapshot_dir``. Filesystem errors
    are reported and skipped so one bad image does not abort the import.
    """
    img_tag = page.find("img")
    if img_tag is None:
        return

    # .get avoids a KeyError for <img> tags that carry no src attribute.
    src = img_tag.get("src", "")
    if src == "":
        return

    # Leading separators are dropped so a root-relative src still resolves
    # inside the snapshot directory, as plain string concatenation did.
    src_path = snapshot_dir / src.lstrip("/\\")

    try:
        # copy images to new directory, renaming with product id
        if src_path.exists():
            print(f"Image {product_id} exists... creating")
            copyfile(src_path, IMAGES_DST_DIR / f"{product_id}.jpg")
        else:
            print(f"Image {product_id} does not exists")
    except (OSError, ValueError) as exc:
        # Only path/IO problems are tolerated; anything else should surface.
        print(f"Image {product_id} has error. Except: {exc}")


folders = [x for x in Path(MAIN_FOLDER).iterdir() if x.is_dir()]

for fo in folders:
    folder = fo / "item"

    if not folder.is_dir():
        continue

    for file in os.listdir(folder):
        # Close the page file deterministically instead of leaking the handle.
        with open(folder / file, encoding='utf8') as page_file:
            # NOTE(review): no parser is named, so bs4 picks whichever is
            # installed; pin one explicitly once the intended parser is confirmed.
            page_content = BeautifulSoup(page_file.read())

        # get_content returns "" for a missing element, so no None checks needed.
        description = get_content(page_content.find("div", attrs={"class": "description"}))
        title = get_content(page_content.find("title"))
        seller = _extract_seller(page_content)

        # Pages with neither a title nor a description are not stored.
        if description == "" and title == "":
            continue

        # Insert product into database
        cursor.execute(INSERT_PRODUCT_SQL, (title, description, "Freebay", seller, "t"))
        conn.commit()

        # The auto-increment id of the new row names the copied image.
        product_id = cursor.lastrowid

        _copy_product_image(page_content, fo, product_id)

Image 1 exists... creating
Image 2 exists... creating
Image 3 exists... creating
Image 4 exists... creating
Image 5 exists... creating
Image 6 exists... creating
Image 7 exists... creating
Image 8 exists... creating
Image 9 exists... creating
Image 10 exists... creating
Image 11 exists... creating
Image 12 exists... creating
Image 13 exists... creating
Image 14 does not exists
Image 15 exists... creating
Image 16 exists... creating
Image 17 exists... creating
Image 18 exists... creating
Image 19 exists... creating
Image 20 exists... creating
Image 21 exists... creating
Image 22 exists... creating
Image 23 exists... creating
Image 24 exists... creating
Image 25 exists... creating
Image 26 exists... creating
Image 27 exists... creating
Image 28 exists... creating
Image 29 exists... creating
Image 30 exists... creating
Image 31 exists... creating
Image 32 exists... creating
Image 33 exists... creating
Image 34 exists... creating
Image 35 exists... creating
Image 36 exists... creating
Imag

Image 295 exists... creating
Image 296 exists... creating
Image 297 exists... creating
Image 298 does not exists
Image 299 does not exists
Image 300 does not exists
Image 301 does not exists
Image 302 exists... creating
Image 303 does not exists
Image 304 exists... creating
Image 305 exists... creating
Image 306 does not exists
Image 307 exists... creating
Image 308 exists... creating
Image 309 does not exists
Image 310 exists... creating
Image 311 exists... creating
Image 312 exists... creating
Image 313 exists... creating
Image 314 exists... creating
Image 315 exists... creating
Image 316 exists... creating
Image 317 exists... creating
Image 318 exists... creating
Image 319 exists... creating
Image 320 exists... creating
Image 321 exists... creating
Image 322 exists... creating
Image 323 does not exists
Image 324 exists... creating
Image 325 exists... creating
Image 326 exists... creating
Image 327 exists... creating
Image 328 exists... creating
Image 329 exists... creating
Image 330

Image 589 exists... creating
Image 590 exists... creating
Image 591 does not exists
Image 592 exists... creating
Image 593 exists... creating
Image 594 exists... creating
Image 595 exists... creating
Image 596 exists... creating
Image 597 exists... creating
Image 598 exists... creating
Image 599 exists... creating
Image 600 exists... creating
Image 601 exists... creating
Image 602 exists... creating
Image 603 exists... creating
Image 604 exists... creating
Image 605 exists... creating
Image 606 exists... creating
Image 607 exists... creating
Image 608 exists... creating
Image 609 exists... creating
Image 610 exists... creating
Image 611 exists... creating
Image 612 exists... creating
Image 613 exists... creating
Image 614 exists... creating
Image 615 exists... creating
Image 616 does not exists
Image 617 does not exists
Image 618 does not exists
Image 619 does not exists
Image 620 exists... creating
Image 621 does not exists
Image 622 exists... creating
Image 623 exists... creating
Ima

Image 890 exists... creating
Image 891 exists... creating
Image 892 exists... creating
Image 893 exists... creating
Image 894 exists... creating
Image 895 exists... creating
Image 896 exists... creating
Image 897 does not exists
Image 898 exists... creating
Image 899 exists... creating
Image 900 does not exists
Image 901 does not exists
Image 902 does not exists
Image 903 does not exists
Image 904 does not exists
Image 905 does not exists
Image 906 does not exists
Image 907 does not exists
Image 908 does not exists
Image 909 does not exists
Image 910 exists... creating
Image 911 exists... creating
Image 912 exists... creating
Image 913 exists... creating
Image 914 exists... creating
Image 915 exists... creating
Image 916 exists... creating
Image 917 exists... creating
Image 918 exists... creating
Image 919 exists... creating
Image 920 exists... creating
Image 921 exists... creating
Image 922 exists... creating
Image 923 exists... creating
Image 924 exists... creating
Image 925 exists..

Image 1185 exists... creating
Image 1186 exists... creating
Image 1187 exists... creating
Image 1188 exists... creating
Image 1189 exists... creating
Image 1190 exists... creating
Image 1191 exists... creating
Image 1192 exists... creating
Image 1193 does not exists
Image 1194 exists... creating
Image 1195 exists... creating
Image 1196 exists... creating
Image 1197 exists... creating
Image 1198 exists... creating
Image 1199 exists... creating
Image 1200 exists... creating
Image 1201 exists... creating
Image 1202 exists... creating
Image 1203 exists... creating
Image 1204 exists... creating
Image 1205 does not exists
Image 1206 does not exists
Image 1207 does not exists
Image 1208 exists... creating
Image 1209 exists... creating
Image 1210 does not exists
Image 1211 does not exists
Image 1212 exists... creating
Image 1213 exists... creating
Image 1214 exists... creating
Image 1215 exists... creating
Image 1216 exists... creating
Image 1217 exists... creating
Image 1218 exists... creatin

Image 1463 exists... creating
Image 1464 exists... creating
Image 1465 exists... creating
Image 1466 exists... creating
Image 1467 does not exists
Image 1468 does not exists
Image 1469 does not exists
Image 1470 does not exists
Image 1471 does not exists
Image 1472 does not exists
Image 1473 does not exists
Image 1474 does not exists
Image 1475 does not exists
Image 1476 does not exists
Image 1477 exists... creating
Image 1478 does not exists
Image 1479 does not exists
Image 1480 exists... creating
Image 1481 exists... creating
Image 1482 exists... creating
Image 1483 does not exists
Image 1484 exists... creating
Image 1485 exists... creating
Image 1486 does not exists
Image 1487 does not exists
Image 1488 exists... creating
Image 1489 exists... creating
Image 1490 exists... creating
Image 1491 does not exists
Image 1492 exists... creating
Image 1493 exists... creating
Image 1494 exists... creating
Image 1495 exists... creating
Image 1496 exists... creating
Image 1497 does not exists
I

Image 1754 exists... creating
Image 1755 exists... creating
Image 1756 exists... creating
Image 1757 exists... creating
Image 1758 exists... creating
Image 1759 exists... creating
Image 1760 exists... creating
Image 1761 exists... creating
Image 1762 exists... creating
Image 1763 does not exists
Image 1764 exists... creating
Image 1765 exists... creating
Image 1766 exists... creating
Image 1767 does not exists
Image 1768 exists... creating
Image 1769 exists... creating
Image 1770 does not exists
Image 1771 exists... creating
Image 1772 exists... creating
Image 1773 exists... creating
Image 1774 exists... creating
Image 1775 exists... creating
Image 1776 exists... creating
Image 1777 exists... creating
Image 1778 exists... creating
Image 1779 exists... creating
Image 1780 exists... creating
Image 1781 exists... creating
Image 1782 exists... creating
Image 1783 exists... creating
Image 1784 exists... creating
Image 1785 exists... creating
Image 1786 exists... creating
Image 1787 exists..

Image 2040 exists... creating
Image 2041 does not exists
Image 2042 exists... creating
Image 2043 does not exists
Image 2044 exists... creating
Image 2045 does not exists
Image 2046 exists... creating
Image 2047 exists... creating
Image 2048 exists... creating
Image 2049 exists... creating
Image 2050 exists... creating
Image 2051 exists... creating
Image 2052 exists... creating
Image 2053 exists... creating
Image 2054 exists... creating
Image 2055 exists... creating
Image 2056 exists... creating
Image 2057 exists... creating
Image 2058 exists... creating
Image 2059 exists... creating
Image 2060 exists... creating
Image 2061 exists... creating
Image 2062 does not exists
Image 2063 does not exists
Image 2064 does not exists
Image 2065 does not exists
Image 2066 exists... creating
Image 2067 exists... creating
Image 2068 exists... creating
Image 2069 exists... creating
Image 2070 does not exists
Image 2071 exists... creating
Image 2072 exists... creating
Image 2073 exists... creating
Imag

Image 2324 exists... creating
Image 2325 exists... creating
Image 2326 does not exists
Image 2327 exists... creating
Image 2328 does not exists
Image 2329 exists... creating
Image 2330 exists... creating
Image 2331 exists... creating
Image 2332 exists... creating
Image 2333 exists... creating
Image 2334 exists... creating
Image 2335 exists... creating
Image 2336 exists... creating
Image 2337 exists... creating
Image 2338 exists... creating
Image 2339 exists... creating
Image 2340 exists... creating
Image 2341 exists... creating
Image 2342 exists... creating
Image 2343 exists... creating
Image 2344 does not exists
Image 2345 does not exists
Image 2346 exists... creating
Image 2347 exists... creating
Image 2348 exists... creating
Image 2349 exists... creating
Image 2350 exists... creating
Image 2351 exists... creating
Image 2352 exists... creating
Image 2353 exists... creating
Image 2354 exists... creating
Image 2355 does not exists
Image 2356 exists... creating
Image 2357 exists... crea