## Fetch art piece specific url sites, extract all useful info

In [1]:
from urllib.request import urlopen, URLError, HTTPError
from bs4 import BeautifulSoup
from socket import timeout

base_url = "https://vernissage.ee"
def get_soup(url: str):
    page = urlopen(url, timeout=1)
    html = page.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")
    return soup

def get_soup_persistent(url: str):
    haus_year_soup = None
    while True:
        try:
            haus_year_soup = get_soup(url)
            return haus_year_soup
        except timeout as e:
            print(e)
        except HTTPError as e:
            if e.code == 404:
                return None
        except URLError as e:
            print(e)  

In [87]:
import re
def get_vernissage_all():
    art_pieces = []
    page = 1
    while True:
        page_url_end = f"/tootekategooria/oksjonid/page/{page}"
        # Connect to website
        page_url = base_url + page_url_end
        vern_auction_soup = get_soup_persistent(page_url)
        if vern_auction_soup is None:
            break
        page_items = vern_auction_soup.find("div", {"class":"archive-products"}).ul.script
        page_items = str(page_items)
        # Get all art pieces
        #print(page_items)
        img_urls_raw = re.findall(r"src=(.*?)class", page_items) # 2 different size images per 1 art piece
        info_urls_raw = re.findall(r"<a  href=(.*?)>", page_items) # Link to art piece web page
        fields_raw = re.findall(r"&ldquo;(.*?)&rdquo;", page_items)
        #rel=\"tag\">S\u00fcgisoksjon 2021<\/a>
        auction_names = re.findall(r'tag\\">(.*)<', page_items)
        print(len(auction_names))
        #print(auction_names[0].split("<", 1)[0])
        
        print(page_url)
        counter = 1
        unicode_table = {"\\u00d5":"Õ", "\\u00f5":"õ", 
                         "\\u20ac": "€", "\\u00d6":"Ö", 
                         "\\u00f6":"ö", "\\u00e4":"ä",
                        "\\u00fc":"ü", "\\u00a0":" ",
                        "\\u017e": "ž", "\\u00dc": "Ü",
                        "\\u00e0": "à", "\\u00e9":"é", 
                         "\\u0161":"š", "\\u00d7":"x"}
        for i in range(len(fields_raw)):
            art_piece_obj = dict()
            #print(counter)

            # Extract image url
            img_url = img_urls_raw[i].replace("\\", "").replace('"', "")
            art_piece_obj["img"] = img_url
            
            # Extract more info url
            link_url = info_urls_raw[i].replace("\\", "").replace('"', "")
            art_piece_obj["href"] = link_url
            
            # Extract fields
            item = fields_raw[i]
            for key in unicode_table:
                item = item.replace(key, unicode_table[key])
            item = item.replace("\\", "")
            item = re.sub(r"(^(\d)+\.)", "", item) # remove leading number
            splits = item.split(".", 4)
            print(item.count("."), item.count(","))
            art_piece_obj["full"] = item
            art_piece_obj["author"] = splits[0]
            art_piece_obj["title"] = splits[1] if len(splits) > 1 else None
            art_piece_obj["year"] = splits[2] if len(splits) > 2 else None
            art_piece_obj["tech"] = splits[3] if len(splits) > 3 else None
            if len(splits) > 4:
                art_piece_obj["size_prices"] = splits[4].split("hind", 1) 
                art_piece_obj["size"] = art_piece_obj["size_prices"][0]
                if len(art_piece_obj["size_prices"]) > 1:
                    prices = art_piece_obj["size_prices"][1].split("hind")
                    print(item)
                    start_price = prices[0]
                    end_price = prices[1] if len(prices) == 2 else None
                    art_piece_obj["start_price"] = start_price
                    art_piece_obj["end_price"] = end_price
            
            art_pieces.append(art_piece_obj)

            counter += 1
        #break for faster testing
        page += 1

    return art_pieces
data = get_vernissage_all()


1
https://vernissage.ee/tootekategooria/oksjonid/page/1
5 1
Ado Lill. Kontrastne. 1974. Õli, masoniit. 150 x 120 cm . Alghind: 12000 €  Haamrihind: 23000 €
5 1
Agur Kruusing. Tennis on parem kui seks. 2004. Õli, lõuend. 130 x 100 cm. Alghind: 2500 €  Haamrihind: 3300 €
5 1
Aili Vint. Öö merel. 1983. Õli, lõuend. 35 x 45 cm. Alghind: 9000 € Haamrihind: 18500 €
9 1
Aleksander Möldroo. Vaade Rannavärava mäelt. 1959. Monotüüpia, paber. Lm 71 x 50.4cm. Km 64.7 x 44.2 cm. Alghind: 900 € Haamrihind: 900 €
7 2
Aleksander Vardi. Itaalia komöödia. 1962. Monotüüpia, õli, paber. km 44.6 x 35.8 cm. Alghind: 5500 € Haamrihind: 6000 €
7 1
Andrei Jegorov. Kalurid  kaldal. 1946. Õli, vineer. 57.7 x 80.1 cm.  Alghind: 8500 € Haamrihind: 10500 €
2 4
4 3
Andrus Kasemaa. Põimuvad kehad. 1992. Pastell, süsi, paber. Vm 55 x 80,3cm  Alghind: 2800 € Haamrihind:  2800 €
7 1
Ants Erik Vomm. Jaapani tantsitar II. U.1970. Õli, papp. 100 x 77.5 cm. Alghind: 1200 € Haamrihind:  1200 €
9 1
Ants Murakin. Rukkihakid. 1

1
https://vernissage.ee/tootekategooria/oksjonid/page/4
6 1
Peeter Mudist. Saaremaa maastik. 1980.aastad. Õli, papp. 50 x 70 cm. Alghind: 7000 €  Haamrihind: (Müümata)
7 1
Peeter Ulas. Otepää maastik. 1972. Õli, lõuend. 60.2 x 90.2 cm. Alghind: 2000€ Haamrihind: (Müümata)
5 2
Philiph Arvo Luik. Ideest valmis pildini.1981. 60 x 143 cm. Õli, lehekuld, lõuend. Alghind: 5200 € Haamrihind: (Müümata)
5 1
Priit Pajos. Ohvrivaras. 2004. Õli, vineer. 86 x 73 cm. Alghind: 3200 €  Haamrihind: (Müümata)
5 1
Priit Pajos. Põld. 2004. Õli, papp. 46 x 55 cm. Alghind: 2000 € Haamrihind: (Müümata)
6 1
Raivo Korstnik. Akt. 1965. Õli, kartong. 35.1 x 50 cm. Alghind: 1900 € Haamrihind: 3700 €
5 1
Raivo Korstnik. Akt.1992. Õli, lõuend. 63 x 60 cm. Alghind: 3500 € Haamrihind: 3500 €
7 1
Raivo Korstnik. Kohtla-Järve motiiv.1964. Õli, papp. 49.7 x 69.7 cm. Alghind: 4000 € Haamrihind: (Müümata)
7 1
Rein Raamat (1931). Vana kaluri jutud. 1957. Õli, lõuend. 51.8 x 80.4 cm. Alghind: 3500 € Haamrihind: (Müüdud)
6 1

1
https://vernissage.ee/tootekategooria/oksjonid/page/7
6 3
 Jüri Kask. Eneseleidmine. Kavand. 1978. Guašš, tempera. Km 84,3 x 99,7. Alghind: 2100 € Haamrihind: 4200 €
5 2
 Nikolai Kormašov. Võsu rand. 1961. Õlimaal. 49,5 x 69,1. Alghind: 3900 € Haamrihind: 3900 €
5 4
 Aleksander Vardi. Arabeskne II. 1968. Guašš, akvarell,pliiats. Vm 31,5 x 47,5. Alghind: 3200 € Haamrihind: 3200 €
6 1
 Henn-Olavi Roode. Kompositsioon. 1965-68. Õli, pliiats. 33.5 x 63. Alghind: 1500 € Haamrihind: 1950 €
5 0
 Uno Roosvalt. Talveõhtu. 1974. Õlimaal. 117 x 117. Alghind: 5800€ Haamrihind: 5800€
6 0
 Mari Kurismaa. Kaksikmaal. 1990.aastad. Õlimaal. Mõlemad osad 160 x 120. Alghind: 6000 €
5 0
 Mari Roosvalt. Nurgad. 1987. Õlimaal. 119 x 90. Alghind: 5400 € Haamrihind: 6200 €
5 2
 Roman Nyman. Pojengid. 1921. Õlimaal. 45,7 x 45,3. Alghind: 3500 € Haamrihind: 14500 €
5 2
 Malle Leis. Lilled ja viljad maastikuga. 1985. Akvarell. Lm 59,7 x 59,7. Alghind: 2700 € Haamrihind: 5400 €
6 0
 Johannes Võerahansu. Natüürm

## Create df

In [95]:
import pandas as pd

df = pd.DataFrame(data)
df.head()

Unnamed: 0,author,end_price,full,href,img,size,size_prices,start_price,tech,title,year
0,Ado Lill,: 23000 €,"Ado Lill. Kontrastne. 1974. Õli, masoniit. 150...",https://vernissage.ee/toode/ado-lill-kontrastn...,https://vernissage.ee/wp-content/uploads/2021/...,150 x 120 cm . Alg,"[ 150 x 120 cm . Alg, : 12000 € Haamrihind: 2...",: 12000 € Haamri,"Õli, masoniit",Kontrastne,1974
1,Agur Kruusing,: 3300 €,Agur Kruusing. Tennis on parem kui seks. 2004....,https://vernissage.ee/toode/agur-kruusing-tenn...,https://vernissage.ee/wp-content/uploads/2021/...,130 x 100 cm. Alg,"[ 130 x 100 cm. Alg, : 2500 € Haamrihind: 330...",: 2500 € Haamri,"Õli, lõuend",Tennis on parem kui seks,2004
2,Aili Vint,: 18500 €,"Aili Vint. Öö merel. 1983. Õli, lõuend. 35 x 4...",https://vernissage.ee/toode/aili-vint-oo-merel...,https://vernissage.ee/wp-content/uploads/2021/...,35 x 45 cm. Alg,"[ 35 x 45 cm. Alg, : 9000 € Haamrihind: 18500 €]",: 9000 € Haamri,"Õli, lõuend",Öö merel,1983
3,Aleksander Möldroo,: 900 €,Aleksander Möldroo. Vaade Rannavärava mäelt. 1...,https://vernissage.ee/toode/aleksander-moldroo...,https://vernissage.ee/wp-content/uploads/2021/...,Lm 71 x 50.4cm. Km 64.7 x 44.2 cm. Alg,"[ Lm 71 x 50.4cm. Km 64.7 x 44.2 cm. Alg, : 90...",: 900 € Haamri,"Monotüüpia, paber",Vaade Rannavärava mäelt,1959
4,Aleksander Vardi,: 6000 €,Aleksander Vardi. Itaalia komöödia. 1962. Mono...,https://vernissage.ee/toode/aleksander-vardi-i...,https://vernissage.ee/wp-content/uploads/2021/...,km 44.6 x 35.8 cm. Alg,"[ km 44.6 x 35.8 cm. Alg, : 5500 € Haamrihind:...",: 5500 € Haamri,"Monotüüpia, õli, paber",Itaalia komöödia,1962


In [96]:
pd.options.display.max_colwidth = 150
print(df["full"].head())
pd.options.display.max_colwidth = 50


0                                              Ado Lill. Kontrastne. 1974. Õli, masoniit. 150 x 120 cm . Alghind: 12000 €  Haamrihind: 23000 €
1                                Agur Kruusing. Tennis on parem kui seks. 2004. Õli, lõuend. 130 x 100 cm. Alghind: 2500 €  Haamrihind: 3300 €
2                                                      Aili Vint. Öö merel. 1983. Õli, lõuend. 35 x 45 cm. Alghind: 9000 € Haamrihind: 18500 €
3    Aleksander Möldroo. Vaade Rannavärava mäelt. 1959. Monotüüpia, paber. Lm 71 x 50.4cm. Km 64.7 x 44.2 cm. Alghind: 900 € Haamrihind: 900 €
4                      Aleksander Vardi. Itaalia komöödia. 1962. Monotüüpia, õli, paber. km 44.6 x 35.8 cm. Alghind: 5500 € Haamrihind: 6000 €
Name: full, dtype: object


## Clean df, remove unwanted characters

In [97]:
df["end_price"] = df["end_price"].str.replace(":", "").str.replace("€", "")
df["start_price"] = df["start_price"].str.replace(":", "").str.replace("€", "")
size_splits = df["size"].str.split("[xX\/]", 2, expand=True)
df["height"] = size_splits[0].str.extract("(\d+(\.\d+)?)",  expand=True)[0]
df["width"] = size_splits[1].str.extract("(\d+(\.\d+)?)",  expand=True)[0]
df["year_start"] = df["year"]
df["date"] = df["img"].str.extract("((19|20)\d{2})", expand=True)[0]
df["auction_name"] = df["date"]



## Cleaning and extracting techniques

In [98]:
# extract individual techniques 
tech_prepared = df["tech"].str.lower().str.replace(".", "").str.split(",", expand=True)
for col in tech_prepared:
    tech_prepared[col] = tech_prepared[col].str.strip()
    
# prepare processing of multiple art piece techniques (one art piece can have multiple tech)
tech_df_list = []
for i in range(len(tech_prepared.columns)):
    tech_df_i = tech_prepared[i].str.get_dummies(sep=" ")
    tech_df_list.append(tech_df_i)

# process techniques (valid/invalid)
tech_uniq_cols = set()
columns_to_remove = ["x", "(", ")", "ï¿½li", "guaï¿½ï¿½", "akrüüllateks", "lï¿½uend"] + [str(x) for x in range(1900, 2022)]
for tdf in tech_df_list:
    print(tdf.shape)
    for col in tdf.columns:
        temp_uniq_size = len(tech_uniq_cols)
        if col.strip().isdigit() or len(col) < 3 or any(rem in col for rem in columns_to_remove):
            tdf.drop(col, inplace=True, axis=1)
        else:
            tech_uniq_cols.add(col)
            if len(tech_uniq_cols) == temp_uniq_size:
                tdf.drop(col, inplace=True, axis=1)
                
# combine tech columns
tech_df = pd.DataFrame(tech_df_list[0])
for tdf in range(1, len(tech_df_list)):
    print(tech_df_list[tdf].shape)
    tech_df = pd.concat([tech_df, tech_df_list[tdf]], axis=1, sort=True)
    

for col in tech_df.columns:
    print(col)

(211, 51)
(211, 27)
(211, 17)
(211, 5)
(211, 15)
(211, 5)
(211, 0)
-70
aastad
aastate
akrüül
akvarell
graafika
guašš
joonistus
kips
kriit
kuivnõel
linoollõige
monotüüpia
pastell
pliiats
pool
puit
seepia
serigraafia
süsi
teine
tempera
värviline
õli
õlimaal
email
heegeldatud
jämedatoimne
kartong
kollaaž
lõuend
masoniit
metall
paber
paks
papp
pealekleebitud
pits
segatehnika
vineer
alghind:
haamrihind:
kleebitud
papile
vineeril


## Simple test for check if data is clean

In [99]:
def test_data_clean(initial_df):
    test_df = pd.DataFrame(initial_df)
    print("###", "author", "###")
    for name in test_df["author"]:
        if pd.isnull(name) or "," not in name:
            pass # not that important
    pd.options.display.max_colwidth = 50

    # Number fields to numeric
    for col in ["start_price", "end_price", "height", "width", "year_start", "date"]:
        test_df[col] = pd.to_numeric(test_df[col], errors="coerce", downcast="integer")
        error_rows = test_df[test_df[col].isnull()]
        if col == "start_price":
            error_rows = error_rows.loc[error_rows["full"].str.count("€") > 0]
        if col == "end_price":
            error_rows = error_rows.loc[error_rows["full"].str.count("€") > 1]
        if col == "height" or col == "width":
            error_rows = error_rows.loc[error_rows["size_prices"].str.count("[xX\/]") == 1]
        if col == "year_start":
            error_rows = error_rows.loc[(error_rows["year"].notnull())]
#         if col == "date":
#             error_rows = error_rows.loc[(error_rows["auction_name"].str.count("((19|20)\d{2})") > 0)]
        # Print rows with problems
        print("###", col, "###")
        for index, val in error_rows.iterrows():
            print(val)
            print()

    return test_df
tested_df = test_data_clean(df)
tested_df.head(11)

### author ###
### start_price ###
author                                                   Ado Lill
end_price                                                  23000 
full            Ado Lill. Kontrastne. 1974. Õli, masoniit. 150...
href            https://vernissage.ee/toode/ado-lill-kontrastn...
img             https://vernissage.ee/wp-content/uploads/2021/...
size                                           150 x 120 cm . Alg
size_prices     [ 150 x 120 cm . Alg, : 12000 €  Haamrihind: 2...
start_price                                                   NaN
tech                                                Õli, masoniit
title                                                  Kontrastne
year                                                         1974
height                                                        150
width                                                         120
year_start                                                   1974
date                                     

Unnamed: 0,author,end_price,full,href,img,size,size_prices,start_price,tech,title,year,height,width,year_start,date,auction_name
0,Ado Lill,23000.0,"Ado Lill. Kontrastne. 1974. Õli, masoniit. 150...",https://vernissage.ee/toode/ado-lill-kontrastn...,https://vernissage.ee/wp-content/uploads/2021/...,150 x 120 cm . Alg,"[ 150 x 120 cm . Alg, : 12000 € Haamrihind: 2...",,"Õli, masoniit",Kontrastne,1974,150.0,120.0,1974.0,2021,2021
1,Agur Kruusing,3300.0,Agur Kruusing. Tennis on parem kui seks. 2004....,https://vernissage.ee/toode/agur-kruusing-tenn...,https://vernissage.ee/wp-content/uploads/2021/...,130 x 100 cm. Alg,"[ 130 x 100 cm. Alg, : 2500 € Haamrihind: 330...",,"Õli, lõuend",Tennis on parem kui seks,2004,130.0,100.0,2004.0,2021,2021
2,Aili Vint,18500.0,"Aili Vint. Öö merel. 1983. Õli, lõuend. 35 x 4...",https://vernissage.ee/toode/aili-vint-oo-merel...,https://vernissage.ee/wp-content/uploads/2021/...,35 x 45 cm. Alg,"[ 35 x 45 cm. Alg, : 9000 € Haamrihind: 18500 €]",,"Õli, lõuend",Öö merel,1983,35.0,45.0,1983.0,2021,2021
3,Aleksander Möldroo,900.0,Aleksander Möldroo. Vaade Rannavärava mäelt. 1...,https://vernissage.ee/toode/aleksander-moldroo...,https://vernissage.ee/wp-content/uploads/2021/...,Lm 71 x 50.4cm. Km 64.7 x 44.2 cm. Alg,"[ Lm 71 x 50.4cm. Km 64.7 x 44.2 cm. Alg, : 90...",,"Monotüüpia, paber",Vaade Rannavärava mäelt,1959,71.0,50.4,1959.0,2021,2021
4,Aleksander Vardi,6000.0,Aleksander Vardi. Itaalia komöödia. 1962. Mono...,https://vernissage.ee/toode/aleksander-vardi-i...,https://vernissage.ee/wp-content/uploads/2021/...,km 44.6 x 35.8 cm. Alg,"[ km 44.6 x 35.8 cm. Alg, : 5500 € Haamrihind:...",,"Monotüüpia, õli, paber",Itaalia komöödia,1962,44.6,35.8,1962.0,2021,2021
5,Andrei Jegorov,10500.0,"Andrei Jegorov. Kalurid kaldal. 1946. Õli, vi...",https://vernissage.ee/toode/andrei-jegorov-maa...,https://vernissage.ee/wp-content/uploads/2021/...,57.7 x 80.1 cm. Alg,"[ 57.7 x 80.1 cm. Alg, : 8500 € Haamrihind: 1...",,"Õli, vineer",Kalurid kaldal,1946,57.7,80.1,1946.0,2021,2021
6,Andres Tolts,,"Andres Tolts. Taevas I, 1996, akrüül, lõuend, ...",https://vernissage.ee/toode/andres-tolts-balan...,https://vernissage.ee/wp-content/uploads/2021/...,,,,,"Taevas I, 1996, akrüül, lõuend, 120 x 150 cm",Alghind: 5000 € Haamrihind: (Müüdud),,,,2021,2021
7,Andrus Kasemaa,2800.0,"Andrus Kasemaa. Põimuvad kehad. 1992. Pastell,...",https://vernissage.ee/toode/andrus-kasemaa-poi...,https://vernissage.ee/wp-content/uploads/2021/...,"Vm 55 x 80,3cm Alg","[ Vm 55 x 80,3cm Alg, : 2800 € Haamrihind: 2...",,"Pastell, süsi, paber",Põimuvad kehad,1992,55.0,80.0,1992.0,2021,2021
8,Ants Erik Vomm,1200.0,Ants Erik Vomm. Jaapani tantsitar II. U.1970. ...,https://vernissage.ee/toode/ants-erik-vomm-jaa...,https://vernissage.ee/wp-content/uploads/2021/...,"Õli, papp. 100 x 77.5 cm. Alg","[ Õli, papp. 100 x 77.5 cm. Alg, : 1200 € Haam...",,1970,Jaapani tantsitar II,U,100.0,77.5,,2021,2021
9,Ants Murakin,1500.0,Ants Murakin. Rukkihakid. 1940.aastate lõpp-19...,https://vernissage.ee/toode/ants-murakin-rukki...,https://vernissage.ee/wp-content/uploads/2021/...,"aastad. Õli, masoniit. 25.4 x 30.3 cm. Alg","[aastad. Õli, masoniit. 25.4 x 30.3 cm. Alg, :...",,aastate lõpp-1950,Rukkihakid,1940,25.4,30.3,1940.0,2021,2021


## Construct final DataFrame

In [101]:
clean_df = pd.DataFrame()
clean_df["url"] = tested_df["href"]
clean_df["src"] = ["vern"] * len(clean_df)
for col in ["auction_name", "date", "title", "author", "start_price", "end_price", "year", "year_start", "tech", "size", "height", "width", "img"]: 
    clean_df[col] = tested_df[col]
    
for col in tech_df.columns:
    clean_df[col] = tech_df[col].fillna(0)
print(clean_df.shape)
clean_df.head()

Index(['author', 'end_price', 'full', 'href', 'img', 'size', 'size_prices',
       'start_price', 'tech', 'title', 'year', 'height', 'width', 'year_start',
       'date', 'auction_name'],
      dtype='object')
(211, 60)


Unnamed: 0,url,src,auction_name,date,title,author,start_price,end_price,year,year_start,...,papp,pealekleebitud,pits,segatehnika,vineer,alghind:,haamrihind:,kleebitud,papile,vineeril
0,https://vernissage.ee/toode/ado-lill-kontrastn...,vern,2021,2021,Kontrastne,Ado Lill,,23000.0,1974,1974.0,...,0,0,0,0,0,0,0,0,0,0
1,https://vernissage.ee/toode/agur-kruusing-tenn...,vern,2021,2021,Tennis on parem kui seks,Agur Kruusing,,3300.0,2004,2004.0,...,0,0,0,0,0,0,0,0,0,0
2,https://vernissage.ee/toode/aili-vint-oo-merel...,vern,2021,2021,Öö merel,Aili Vint,,18500.0,1983,1983.0,...,0,0,0,0,0,0,0,0,0,0
3,https://vernissage.ee/toode/aleksander-moldroo...,vern,2021,2021,Vaade Rannavärava mäelt,Aleksander Möldroo,,900.0,1959,1959.0,...,0,0,0,0,0,0,0,0,0,0
4,https://vernissage.ee/toode/aleksander-vardi-i...,vern,2021,2021,Itaalia komöödia,Aleksander Vardi,,6000.0,1962,1962.0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
#TODO: Some rows incorrect
#TODO: Get auction name

## Connect and get existing database DataFrame, merge DataFrames

In [103]:
!pip install mysql-connector-python
import sqlalchemy
import mysql.connector as mysql
#from pyodbc import ProgrammingError
import getpass

db_username = 'admin'
db_password = getpass.getpass()
db_ip       = 'kanvas-auctions.cxljcprf9rvb.us-east-1.rds.amazonaws.com'
db_name     = 'kanvas'
db_connection = sqlalchemy.create_engine('mysql+mysqlconnector://{0}:{1}@{2}/{3}'.
                                               format(db_username, db_password, 
                                                      db_ip, db_name))
print(db_connection)
try:
    raise mysql.errors.ProgrammingError()
    #clean_df.to_sql(con=db_connection, name='art', if_exists='append')
except mysql.errors.Error:
    pass
db_data = pd.read_sql('SELECT * FROM test_5', db_connection)
merged_df = pd.concat([db_data,clean_df], sort=False)
print(clean_df.shape, db_data.shape, merged_df.shape)
merged_df

········
Engine(mysql+mysqlconnector://admin:***@kanvas-auctions.cxljcprf9rvb.us-east-1.rds.amazonaws.com/kanvas)


Unnamed: 0,url,src,auction_name,date,title,author,start_price,end_price,year,year_start,...,õlimaal,email,heegeldatud,jämedatoimne,metall,paks,pealekleebitud,pits,alghind:,haamrihind:
0,?c=teosed&l=et&id=2512&window=1&oid=3&form=0,haus,20.11.1997 14:01-20.11.1997 00:00,1997.0,NAINE PAABULINNUGA,Evald Okas,831.0,844.0,1939,1939.0,...,,,,,,,,,,
1,?c=teosed&l=et&id=2433&window=1&oid=3&form=0,haus,20.11.1997 14:01-20.11.1997 00:00,1997.0,TALLINNA RAEKOJA PLATS,Karl Burman (seenior),499.0,499.0,1920,1920.0,...,,,,,,,,,,
2,?c=teosed&l=et&id=2489&window=1&oid=5&form=0,haus,26.10.1998 14:16,1998.0,NATÜÜRMORT SIDRUNITEGA,Valerian Loik,1074.0,1074.0,1970,1970.0,...,,,,,,,,,,
3,?c=teosed&l=et&id=2509&window=1&oid=5&form=0,haus,26.10.1998 14:16,1998.0,MAASTIK TALUMAJAGA,Roman Nyman,1176.0,1176.0,1940,1940.0,...,,,,,,,,,,
4,?c=teosed&l=et&id=2568&window=1&oid=5&form=0,haus,26.10.1998 14:16,1998.0,Puulõike sünd,Eduard Wiiralt,230.0,556.0,1936,1936.0,...,,,,,,,,,,
5,?c=teosed&l=et&id=2428&window=1&oid=5&form=0,haus,26.10.1998 14:16,1998.0,ILLUSTRATSIOONID M. RAUA “VALITUD LUULETUSTELE,Aino Bach,920.0,920.0,1946,1946.0,...,,,,,,,,,,
6,?c=teosed&l=et&id=2429&window=1&oid=5&form=0,haus,26.10.1998 14:16,1998.0,LAPSED,Aino Bach,831.0,831.0,1950,1950.0,...,,,,,,,,,,
7,?c=teosed&l=et&id=2477&window=1&oid=5&form=0,haus,26.10.1998 14:16,1998.0,SUVI,Eduard Kutsar,1150.0,1150.0,1950,1950.0,...,,,,,,,,,,
8,?c=teosed&l=et&id=2530&window=1&oid=5&form=0,haus,26.10.1998 14:16,1998.0,LAMAV AKT,Richard Sagrits,575.0,671.0,1958,1958.0,...,,,,,,,,,,
9,?c=teosed&l=et&id=2523&window=1&oid=5&form=0,haus,26.10.1998 14:16,1998.0,LILLED AKNAL,Oskar Raunam,1470.0,1643.0,1941,1941.0,...,,,,,,,,,,


## Continue merging DataFrames, change conflicting word forms

In [106]:
last_non_tech = list(merged_df.columns).index("img")
print("### starting after", merged_df.columns[last_non_tech], "###")

# map tech columns that cause duplicate columns (db converts š->s, ž->z and so on)
mappings = {"guašš":["guašš", "guaśś", "guašs"], "kollaaž":["kollaaž", "kollaaź"], "tušš":["tušš", "tušs", "tuśś", "tuss"]}
for k, val in mappings.items():
    print(k, val)
    remaining_val = []
    for v in val:
        if v in merged_df.columns:
            remaining_val.append(v)
    if len(remaining_val) == 0:
        continue
        
    merged_df[k] = merged_df[remaining_val].max()
    for v in remaining_val:
        print(v)
        if v == k:
            continue
        merged_df.drop(v, axis=1, inplace=True)
print(list(merged_df.columns[:]))
for col in merged_df.columns[last_non_tech + 1:]:
    merged_df[col] = merged_df[col].fillna(0)
#merged_df = merged_df.loc[:,~tech_df.columns.duplicated()]
print(clean_df.shape, db_data.shape, merged_df.shape)


### starting after img ###
guašš ['guašš', 'guaśś', 'guašs']
guašš
kollaaž ['kollaaž', 'kollaaź']
kollaaž
tušš ['tušš', 'tušs', 'tuśś', 'tuss']
tušš
['url', 'src', 'auction_name', 'date', 'title', 'author', 'start_price', 'end_price', 'year', 'year_start', 'year_end', 'tech', 'size', 'height', 'width', 'img', 'akrüül', 'akvarell', 'akvatinta', 'autolito', 'autoritehnika', 'awagami', 'diatüüpia', 'digiprint', 'digitaalne', 'digitrükk', 'foto', 'giclee', 'graafika', 'grafiit', 'guašš', 'hanga', 'itaalia', 'joonistus', 'kartongtrükk', 'kastitaolises', 'keraamika', 'kips', 'kipsvaland', 'kivilito', 'kleebitud', 'kollaaž', 'koloreeritud', 'kriidiga', 'kriit', 'kuivnõel', 'käärilõige', 'kõrgtrükk', 'lametrükk', 'linool', 'linoolgravüür', 'linooll', 'linoollõige', 'linoolsügavtrükk', 'lito', 'litograafia', 'lõuend', 'lõuendil', 'made', 'marker', 'metsotinto', 'moku', 'monotüüpia', 'must', 'ofort', 'ofset-lito', 'oksüdograafia', 'paber', 'papile', 'papp', 'pastapliiats', 'all', 'betoon', 'cath

## Commit changes by overwriting table

In [None]:
merged_df.to_sql(con=db_connection, name="test_6", if_exists = 'replace', index=False)