# Data Cleaning

> In this notebook we will be blah blah blah

---

## Imports

In [1]:
import numpy as np
import pandas as pd

import langid
import requests
import googletrans
import copy
import urllib
import os
import io
import sys
import re

if sys.version_info < (3, 0):
    from urllib2 import urlopen
else:
    from urllib.request import urlopen

from googletrans import Translator
from colorthief import ColorThief

pd.set_option('max_colwidth', 100)

---

## Reading in Data

In [133]:
df = pd.read_csv('../data/wikiart_scraped.csv')
df

Unnamed: 0,Style,Artwork,Artist,Date,Link
0,Early-Dynastic,Narmer Palette,Ancient Egypt,3050 BC,https://uploads3.wikiart.org/00265/images/ancient-egyptian-painting/narmer-palette.jpg
1,Early-Dynastic,Box Inlay with a Geometric Pattern,Ancient Egypt,3100-2900 BC,https://uploads2.wikiart.org/00244/images/ancient-egyptian-painting/box-inlay-with-a-geometric-p...
2,Old-Kingdom,Khafre Enthroned,Ancient Egypt,2570 BC,https://uploads2.wikiart.org/00305/images/ancient-egyptian-painting/330px-khafre-statue.jpg
3,Middle-Kingdom,Stele of the Serpent King (Stela of Djet),Ancient Egypt,3000 BC,https://uploads7.wikiart.org/00305/images/ancient-egyptian-painting/louvre-stele-roi-serpent.jpg
4,Middle-Kingdom,"Laden Donkeys and Ploughing, Tomb of Djar",Ancient Egypt,2060-2010 BC,https://uploads8.wikiart.org/00244/images/ancient-egyptian-painting/laden-donkeys-and-ploughing-...
...,...,...,...,...,...
124165,Street-Photography,Portrait of the corn stalk,Alfred Freddy Krupa,2019,https://uploads5.wikiart.org/00241/images/alfred-freddy-krupa/wp-20190217-004.jpg
124166,Street-Photography,The other side of life,Alfred Freddy Krupa,2019,https://uploads7.wikiart.org/00241/images/alfred-freddy-krupa/wp-20190620-004-1.jpg
124167,Street-Photography,The bonfire during construction,Alfred Freddy Krupa,2019,https://uploads7.wikiart.org/00242/images/alfred-freddy-krupa/wp-20190618-002-1.jpg
124168,Street-Photography,Limpidity,Alfred Freddy Krupa,2019,https://uploads7.wikiart.org/00248/images/alfred-freddy-krupa/wp-20190805-002-1.jpg


## Cleaning Data Function

> This function will clean the data by step1, step2, step3. It takes in such and such argument and returns such and such value.

In [134]:
# Function that takes in Dataframe, returns numeric values for dates
def to_date(dataframe):

    #This converts all roman numbers to century
    dataframe['Date'] = dataframe['Date'].replace(to_replace=['XIX-XX cent.'], value='1800')
    dataframe['Date'] = dataframe['Date'].replace(to_replace=['XX cent.'],value='1900')
    dataframe['Date'] = dataframe['Date'].replace(to_replace=['XVIII cent.'],value='1700')
    dataframe['Date'] = dataframe['Date'].replace(to_replace=['XIX cent.'],value='1800')
    dataframe['Date'] = dataframe['Date'].replace(to_replace=['XX-XXI cent.'],value='1900')
    dataframe['Date'] = dataframe['Date'].replace(to_replace=['XVI-XVII cent.'],value='1500')
    dataframe['Date'] = dataframe['Date'].replace(to_replace=['XV-XVI cent.'],value='1400')
    dataframe['Date'] = dataframe['Date'].replace(to_replace=['XVII-XVIII cent.'],value='1600')
    dataframe['Date'] = dataframe['Date'].replace(to_replace=['XVI cent.'],value='1500')
    dataframe['Date'] = dataframe['Date'].replace(to_replace=['XV cent.'],value='1400')
    dataframe['Date'] = dataframe['Date'].replace(to_replace=['XIV-XV cent.'],value='1300')
    dataframe['Date'] = dataframe['Date'].replace(to_replace=['XVII cent.'],value='1600')
    dataframe['Date'] = dataframe['Date'].replace(to_replace=['XII-XIII cent.'],value='1100')
    dataframe['Date'] = dataframe['Date'].replace(to_replace=['X cent.'],value='900')
    dataframe['Date'] = dataframe['Date'].replace(to_replace=['XIII-XIV cent.'],value='1200')
    dataframe['Date'] = dataframe['Date'].replace(to_replace=['VIII cent.'],value='700')
    dataframe['Date'] = dataframe['Date'].replace(to_replace=['VII-VIII cent.'],value='600')
    dataframe['Date'] = dataframe['Date'].replace(to_replace=['XIV cent.'],value='1300')
    dataframe['Date'] = dataframe['Date'].replace(to_replace=['XI cent.'],value='1000')
    dataframe['Date'] = dataframe['Date'].replace(to_replace=['XI-XII cent.'],value='1000')
    dataframe['Date'] = dataframe['Date'].replace(to_replace=['XXI cent.'],value='2000')
    dataframe['Date'] = dataframe['Date'].replace(to_replace=['XIX-XX cent.'],value='1800')
    dataframe['Date'] = dataframe['Date'].replace(to_replace=['XVIII-XIX cent.'],value='1800')
    dataframe['Date'] = dataframe['Date'].replace(to_replace=['47'], value='1447')
    dataframe['Date'] = dataframe['Date'].replace(to_replace=['48'],value='1448')

    ## Hyphen remover
    for index, row in dataframe.iterrows():
        date_range = row['Date']
        for c in date_range:
            if c == '-':
                val = date_range[:date_range.index(c)]
                dataframe.at[index, 'Date'] = (val)
                break

    #This converts all date's to numeric values
    dataframe['Date'] = dataframe['Date'].astype(int)

    return dataframe

In [135]:
def cleaned_data(dataframe):
    """
    cleaned_data is a function that does blah blah blah and returns blah blah blah
    
    args: 
        dataframe: the dataframe that the user wants to clean
    """
    print('Before Cleaning')
    print('='*20)
    print('Columns')
    print(dataframe.columns)
    print('_'*20)
    print('Dataframe Size')
    print(dataframe.shape)
    print('_'*20)
    print('Dataframe Unique Values')
    print(dataframe.nunique())
    print('_'*20)
    print('Null Values in Each Column')
    print(dataframe.isna().sum())
    print('_'*20)
    print('Data Types in Each Column')
    print(dataframe.dtypes)
    print('='*20)
    
    #This drops cuplicates of any work of art to decrease chances of sketches
    dataframe = dataframe.drop_duplicates(subset=['Artwork', 'Artist', 'Date'], 
                       keep='last')
    
    #This drops any values in the style columns that have less than a count of 500
    #So the model can properly train on the style
    s = dataframe['Style'].value_counts() > 500
    s = list(s[s == True].index)
    dataframe = dataframe[dataframe['Style'].isin(s)]
    
    dataframe = to_date(dataframe)
    
    print('After Cleaning')
    print('='*20)
    print('Columns')
    print(dataframe.columns)
    print('_'*20)
    print('Dataframe Size')
    print(dataframe.shape)
    print('_'*20)
    print('Dataframe Unique Values')
    print(dataframe.nunique())
    print('_'*20)
    print('Null Values in Each Column')
    print(dataframe.isna().sum())
    print('_'*20)
    print('Data Types in Each Column')
    print(dataframe.dtypes)
    print('='*20)
    
    dataframe.loc[dataframe['Style'].str.lower().str.contains('renaissance'), 'Style'] = 'renaissance'
    dataframe.loc[dataframe['Style'].str.lower().str.contains('impressionism'), 'Style'] = 'impressionism'
    dataframe.loc[dataframe['Style'].str.lower().str.contains('romanticism'), 'Style'] = 'romanticism'
    dataframe.loc[dataframe['Style'].str.contains('Realism'), 'Style'] = 'realism'
    dataframe.loc[dataframe['Style'].str.lower().str.contains('expressionism'), 'Style'] = 'expressionism'
    dataframe.loc[dataframe['Style'].str.lower().str.contains('impressionism'), 'Style'] = 'impressionism'
    dataframe.loc[dataframe['Style'].str.lower().str.contains('abstract'), 'Style'] = 'abstract'
    dataframe.loc[dataframe['Style'].str.lower().str.contains('modern'), 'Style'] = 'modern'
    dataframe.loc[dataframe['Style'].str.lower().str.contains('primitivism'), 'Style'] = 'primitivism'
    
    s = dataframe['Style'].value_counts() > 1000
    s = list(s[s == True].index)
    dataframe = dataframe[dataframe['Style'].isin(s)]
    
    #returning the dataframe with an index reset
    return dataframe.reset_index(drop=True)

In [136]:
def drop_dead_rows(dataframe):

    bad_URLs = [
    'https://uploads2.wikiart.org/images/henri-rousseau/view-of-the-bridge-at-sevres-and-the-hills-at-clamart-st-cloud-and-bellevue-1908.jpg',
    'https://uploads8.wikiart.org/images/jean-arp/abstract-composition.jpg',
    'https://uploads2.wikiart.org/images/franz-marc/sleeping-animals-1913.jpg',
    'https://uploads5.wikiart.org/images/el-lissitzky/central-park-of-culture-and-leisure-sparrow-hills.jpg',
    'https://uploads1.wikiart.org/images/juan-gris/glass-and-carafe-1917.jpg',
    'https://uploads6.wikiart.org/images/juan-gris/landscape-at-beaulieu-1918.jpg',
    'https://uploads8.wikiart.org/images/pablo-picasso/untitled-1920-2.jpg',
    'https://uploads0.wikiart.org/images/juan-gris/the-open-window-1921.jpg',
    'https://uploads0.wikiart.org/images/georgia-o-keeffe/special-no-32.jpg'
    ]
    
    dataframe = dataframe.drop(dataframe[dataframe['Link'].isin(bad_URLs)].index)

    return dataframe.reset_index(drop=True)

df = drop_dead_rows(df)

In [137]:
clean_df = drop_dead_rows(df)

In [138]:
clean_df = cleaned_data(df)

Before Cleaning
Columns
Index(['Style', 'Artwork', 'Artist', 'Date', 'Link'], dtype='object')
____________________
Dataframe Size
(124161, 5)
____________________
Dataframe Unique Values
Style         217
Artwork     92044
Artist       3052
Date         2959
Link       116658
dtype: int64
____________________
Null Values in Each Column
Style      0
Artwork    0
Artist     0
Date       0
Link       0
dtype: int64
____________________
Data Types in Each Column
Style      object
Artwork    object
Artist     object
Date       object
Link       object
dtype: object
After Cleaning
Columns
Index(['Style', 'Artwork', 'Artist', 'Date', 'Link'], dtype='object')
____________________
Dataframe Size
(89686, 5)
____________________
Dataframe Unique Values
Style         51
Artwork    76170
Artist      2632
Date         621
Link       89638
dtype: int64
____________________
Null Values in Each Column
Style      0
Artwork    0
Artist     0
Date       0
Link       0
dtype: int64
____________________
Dat

In [140]:
clean_df['Style'].value_counts()

renaissance             8543
realism                 8043
impressionism           7908
expressionism           7783
romanticism             4256
Baroque                 3312
Rococo                  3298
Symbolism               3172
Neoclassicism           3141
Cubism                  3097
modern                  3066
primitivism             3043
Surrealism              2960
abstract                2563
Academicism             2438
Pop-Art                 2361
Minimalism              2020
Ukiyo-e                 1719
Art-Informel            1648
Conceptual-Art          1609
Color-Field-Painting    1364
Orientalism             1324
Op-Art                  1105
Name: Style, dtype: int64

Code is clean!

In [84]:
def lang_column(df):
    df['Language'] = None
    for i in range(0, len(df['Artwork'])):
        df['Language'][i] = langid.classify(df['Artwork'][i])[0]    
    return df

In [85]:
clean_df = lang_column(clean_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Language'][i] = langid.classify(df['Artwork'][i])[0]


KeyboardInterrupt: 

In [None]:
clean_df.to_csv('../data/clean_art_lang.csv', index = False)

In [None]:
# Add columns of translated text
def trans_column(clean_df):
    translator = Translator()
    clean_df['Artwork'] = clean_df['Artwork'].astype(str)
    clean_df['translated'] = clean_df.loc[clean_df.Language != 'en']['Artwork'].apply(translator.translate, 
                                                                                  src='auto', 
                                                                                  dest='en').apply(getattr, 
                                                                                                   args=('text',))
    return clean_df

In [None]:
def split_column_for_trans(clean_df):
    clean_df_not_en = clean_df.loc[clean_df.Language != 'en']
    bruh = int(clean_df_not_en.shape[0]/10)
    clean_df1 = clean_df_not_en.iloc[:bruh]
    clean_df2 = clean_df_not_en.iloc[bruh+1:bruh*2]
    clean_df3 = clean_df_not_en.iloc[(bruh*2)+1:bruh*3]
    clean_df4 = clean_df_not_en.iloc[(bruh*3)+1:bruh*4]
    clean_df5 = clean_df_not_en.iloc[(bruh*4)+1:bruh*5]
    clean_df6 = clean_df_not_en.iloc[(bruh*5)+1:bruh*6]
    clean_df7 = clean_df_not_en.iloc[(bruh*6)+1:bruh*7]
    clean_df8 = clean_df_not_en.iloc[(bruh*7)+1:bruh*8]
    clean_df9 = clean_df_not_en.iloc[(bruh*8)+1:bruh*9]
    clean_df10 = clean_df_not_en.iloc[(bruh*9)+1:]
    new_df = pd.concat([clean_df1, clean_df2, clean_df3, clean_df4, clean_df5, clean_df6, clean_df7, clean_df8, clean_df9, clean_df10])
    return new_df

In [None]:
clean_df1 = trans_column(clean_df1)

In [None]:
clean_df2 = trans_column(clean_df2)

In [None]:
clean_df3 = trans_column(clean_df3)

In [None]:
clean_df4 = trans_column(clean_df4)

In [None]:
clean_df5 = trans_column(clean_df5)

In [None]:
clean_df6 = trans_column(clean_df6)

In [None]:
clean_df7 = trans_column(clean_df7)

In [None]:
clean_df8 = trans_column(clean_df8)

In [None]:
clean_df9 = trans_column(clean_df9)

In [None]:
clean_df10 = trans_column(clean_df10)

In [None]:
new_df = pd.concat([clean_df1, clean_df2, clean_df3, clean_df4, clean_df5, clean_df6, clean_df7, clean_df8, clean_df9, clean_df10])

In [None]:
clean_df_new = clean_df.loc[clean_df.Language == 'en']

In [None]:
clean_df_new['translated'] = clean_df_new['Artwork']

In [None]:
new_df2 = pd.concat([new_df, clean_df_new])

In [None]:
new_df2.shape

In [None]:
new_df2 = new_df2.drop_duplicates(subset=['Link'])

In [185]:
new_df2.to_csv('../data/clean_art_trans.csv', index = False)

In [2]:
new_df2

Unnamed: 0,Style,Artwork,Artist,Date,Link,Language,translated,dom_color,rgbl
0,Early-Renaissance,Virgen Con El Niño,Álvaro Pires de Évora,1447,https://uploads6.wikiart.org/00312/images/alvaro-pires-de-evora/alvaro-pirez-pisa.jpg,es,Madonna and Child,,
1,Early-Renaissance,A Virgem em Majestade,Álvaro Pires de Évora,1448,https://uploads3.wikiart.org/00312/images/alvaro-pires-de-evora/alvaro-pirez-d-vora-a-virgem-em-...,pt,The Virgin in Majesty,,
2,Early-Renaissance,Christus Am Ölberg Und Die Marien Am Grabe,Lorenzo Monaco,1408,https://uploads0.wikiart.org/00207/images/lorenzo-monaco/christus-am-lberg-und-die-marien-am-gra...,de,Christ on the Mount of Olives and Mary at the Tomb,,
3,Early-Renaissance,San Giovanni Evangelista,Donatello,1408,https://uploads8.wikiart.org/00145/images/donatello/800px-sangiovannievangelista.jpg,nl,St. John the Evangelist,,
4,Early-Renaissance,Annunciation Triptych,Lorenzo Monaco,1410,https://uploads2.wikiart.org/00206/images/lorenzo-monaco/annunciation.jpg,es,Annunciation Triptych,,
...,...,...,...,...,...,...,...,...,...
89624,Ukiyo-e,Lucy's Mutation,Takato Yamamoto,2014,https://uploads3.wikiart.org/00147/images/takato-yamamoto/larger-1.jpg,en,Lucy's Mutation,,
89625,Ukiyo-e,Common Sense,Takato Yamamoto,2014,https://uploads3.wikiart.org/00147/images/takato-yamamoto/larger-9.jpg,en,Common Sense,,
89626,Ukiyo-e,Sacred Circulation,Takato Yamamoto,2015,https://uploads3.wikiart.org/00147/images/takato-yamamoto/larger-2.jpg,en,Sacred Circulation,,
89627,Ukiyo-e,Ecstasy of Linked Circles,Takato Yamamoto,2015,https://uploads3.wikiart.org/00147/images/takato-yamamoto/larger-8.jpg,en,Ecstasy of Linked Circles,,


In [6]:
from __future__ import print_function
import binascii
import struct
from PIL import Image
import scipy
import scipy.misc
import scipy.cluster
from scipy.spatial import KDTree
from webcolors import CSS3_HEX_TO_NAMES, hex_to_rgb

def dom_color(image):
    # for loop that grabs each image
    NUM_CLUSTERS = 5

    # reading image
    im = Image.open('../images/' + image, mode='r')
    im = im.resize((150, 150))      # optional, to reduce time
    ar = np.asarray(im)
    shape = ar.shape
    ar = ar.reshape(scipy.product(shape[:2]), shape[2]).astype(float)

    # finding clusters
    codes, dist = scipy.cluster.vq.kmeans(ar, NUM_CLUSTERS)
    # print('Top 5 Colors:\n', codes)

    vecs, dist = scipy.cluster.vq.vq(ar, codes)         # assign codes
    counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences

    index_max = scipy.argmax(counts)                    # find most frequent
    peak = codes[index_max]
    colour = binascii.hexlify(bytearray(int(c) for c in peak)).decode('ascii')
    
    # print('Most dominant color: %s (#%s)' % (peak, colour))
    # print(colour)
    
    
    peak = tuple(peak)
    
    # returns top color
    def convert_rgb_to_names(peak):
    
        # a dictionary of all the hex and their respective names in css3
        css3_db = CSS3_HEX_TO_NAMES
        names = []
        rgb_values = []
        for color_hex, color_name in css3_db.items():
            names.append(color_name)
            rgb_values.append(hex_to_rgb(color_hex))
    
        kdt_db = KDTree(rgb_values)
        distance, index = kdt_db.query(peak)
        
        top_color = names[index]
        return top_color
    
    
             
    return colour, convert_rgb_to_names(peak)

# source: https://medium.com/codex/rgb-to-color-names-in-python-the-robust-way-ec4a9d97a01f


In [7]:
%%time

dom_color('../images/image_0.jpg')

CPU times: total: 141 ms
Wall time: 149 ms


  ar = ar.reshape(scipy.product(shape[:2]), shape[2]).astype(float)
  counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences
  index_max = scipy.argmax(counts)                    # find most frequent


('1d1c26', 'black')

In [151]:
list_imgs = os.listdir('../images/')
new_list_imgs = ["../images/" + f for f in list_imgs]

In [152]:
# split up image folder into 20. 
chunks = [new_list_imgs[x:x+4500] for x in range(0, len(new_list_imgs), 4500)]

In [153]:
def iter_colors(chunk):
    # Run function for every image based on 4500 chunks
    res0 = [dom_color(i) for i in chunks[chunk]]

    # Create DataFrame
    test = pd.DataFrame(res0, columns=['Hex', 'Color'])

    # Identifies which number the image is
    num_list = []
    for images in chunks[chunk]:
        num = ""
        for c in images:
            if c.isdigit():
                num = num + c
        num_list.append(int(num))
    # Appends num_list values to dataframe
    test['Image'] = num_list
    
    return test

In [154]:
chunk0 = iter_colors(0)

  ar = ar.reshape(scipy.product(shape[:2]), shape[2]).astype(float)
  counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences
  index_max = scipy.argmax(counts)                    # find most frequent


In [155]:
chunk1 = iter_colors(1)

  ar = ar.reshape(scipy.product(shape[:2]), shape[2]).astype(float)
  counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences
  index_max = scipy.argmax(counts)                    # find most frequent


In [156]:
chunk2 = iter_colors(2)

  ar = ar.reshape(scipy.product(shape[:2]), shape[2]).astype(float)
  counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences
  index_max = scipy.argmax(counts)                    # find most frequent


In [157]:
chunk3 = iter_colors(3)

  ar = ar.reshape(scipy.product(shape[:2]), shape[2]).astype(float)
  counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences
  index_max = scipy.argmax(counts)                    # find most frequent


In [158]:
chunk4 = iter_colors(4)

  ar = ar.reshape(scipy.product(shape[:2]), shape[2]).astype(float)
  counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences
  index_max = scipy.argmax(counts)                    # find most frequent


In [159]:
chunk5 = iter_colors(5)

  ar = ar.reshape(scipy.product(shape[:2]), shape[2]).astype(float)
  counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences
  index_max = scipy.argmax(counts)                    # find most frequent


In [160]:
chunk6 = iter_colors(6)

  ar = ar.reshape(scipy.product(shape[:2]), shape[2]).astype(float)
  counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences
  index_max = scipy.argmax(counts)                    # find most frequent


In [161]:
chunk7 = iter_colors(7)

  ar = ar.reshape(scipy.product(shape[:2]), shape[2]).astype(float)
  counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences
  index_max = scipy.argmax(counts)                    # find most frequent


In [162]:
chunk8 = iter_colors(8)

  ar = ar.reshape(scipy.product(shape[:2]), shape[2]).astype(float)
  counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences
  index_max = scipy.argmax(counts)                    # find most frequent


In [163]:
chunk9 = iter_colors(9)

  ar = ar.reshape(scipy.product(shape[:2]), shape[2]).astype(float)
  counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences
  index_max = scipy.argmax(counts)                    # find most frequent


In [164]:
chunk10 = iter_colors(10)

  ar = ar.reshape(scipy.product(shape[:2]), shape[2]).astype(float)
  counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences
  index_max = scipy.argmax(counts)                    # find most frequent


In [165]:
chunk11 = iter_colors(11)

  ar = ar.reshape(scipy.product(shape[:2]), shape[2]).astype(float)
  counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences
  index_max = scipy.argmax(counts)                    # find most frequent


In [166]:
chunk12 = iter_colors(12)

  ar = ar.reshape(scipy.product(shape[:2]), shape[2]).astype(float)
  counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences
  index_max = scipy.argmax(counts)                    # find most frequent


In [167]:
chunk13 = iter_colors(13)

  ar = ar.reshape(scipy.product(shape[:2]), shape[2]).astype(float)
  counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences
  index_max = scipy.argmax(counts)                    # find most frequent


In [168]:
chunk14 = iter_colors(14)

  ar = ar.reshape(scipy.product(shape[:2]), shape[2]).astype(float)
  counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences
  index_max = scipy.argmax(counts)                    # find most frequent


In [169]:
chunk15 = iter_colors(15)

  ar = ar.reshape(scipy.product(shape[:2]), shape[2]).astype(float)
  counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences
  index_max = scipy.argmax(counts)                    # find most frequent


In [170]:
chunk16 = iter_colors(16)

  ar = ar.reshape(scipy.product(shape[:2]), shape[2]).astype(float)
  counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences
  index_max = scipy.argmax(counts)                    # find most frequent


In [171]:
chunk17 = iter_colors(17)

  ar = ar.reshape(scipy.product(shape[:2]), shape[2]).astype(float)
  counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences
  index_max = scipy.argmax(counts)                    # find most frequent


In [172]:
chunk18 = iter_colors(18)

  ar = ar.reshape(scipy.product(shape[:2]), shape[2]).astype(float)
  counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences
  index_max = scipy.argmax(counts)                    # find most frequent


In [173]:
chunk19 = iter_colors(19)

  ar = ar.reshape(scipy.product(shape[:2]), shape[2]).astype(float)
  counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences
  index_max = scipy.argmax(counts)                    # find most frequent


In [218]:
chunky_df = pd.concat([chunk0, 
                      chunk1,
                      chunk2,
                      chunk3,
                      chunk4,
                      chunk5,
                      chunk6,
                      chunk7,
                      chunk8,
                      chunk9,
                      chunk10,
                      chunk11,
                      chunk12,
                      chunk13,
                      chunk14,
                      chunk15,
                      chunk16,
                      chunk17,
                       chunk18,
                       chunk19])

In [219]:
chunky_df = chunky_df.set_index('Image')

In [220]:
chunky_df.sort_index(inplace = True)

In [221]:
brand_new_df = clean_df.join(chunky_df, how='left')
brand_new_df.dropna(inplace=True)

In [8]:
clean_df['hex'] = None
clean_df['color'] = None

# iterate over files in that directory
def iter_files(chunks):
    for images in chunks:
        print(images)
        num = ""
        for c in images:
            if c.isdigit():
                num = num + c
        num = int(num)
        # applies dominant color function
        dom_color(images)    
        # assigns dominant color hex to new column based on index
        clean_df['hex'][num] = dom_color(images)[0]
        clean_df['color'][num] = dom_color(images)[1]
        print(dom_color(images))
        print('---' *5)
    return clean_df

In [3]:
color_df = pd.read_csv('../data/color_column.csv')
trans_df = pd.read_csv('../data/clean_art_trans.csv')
dfinal = color_df.merge(trans_df, on="Link", how = 'inner')
dfinal = dfinal.drop(columns=['Style_x', 'Artwork_x', 'Artist_x', 'Date_x', 'dom_color', 'rgbl'], inplace=True)
dfinal.columns = ['link', 'hex', 'color', 'style', 'artwork', 'artist', 'date', 'language', 'translated']
dfinal = dfinal[['artwork', 'style', 'artist', 'date', 'link', 'hex', 'color', 'language', 'translated']]
dfinal.to_csv('../data/final_df.csv', index = False)

## Saving the Cleaned Dataframe

> We save the data as blah blah so that in later notebooks it can be used for eda and modeling prurposes much easier.

In [16]:
clean_df.to_csv('../data/clean_art.csv', index = False)

In [223]:
brand_new_df.to_csv('../data/color_column.csv', index = False)