# Color Names

Dataset from: https://en.wikipedia.org/wiki/List_of_colors_(compact)

## Parse data

In [1]:
import pandas as pd
import re
from pathlib import Path

In [2]:
# Load the saved wiki source as a list of lines. read_text() opens, reads and
# closes the file in one call, so no file handle is left open in the kernel.
raw_text_lines = Path('./wiki_list_of_colors_compact.txt').read_text(encoding='utf-8').splitlines()

In [3]:
# Wiki link such as [[target|label]] or [[label]]. The quantifier is lazy so a
# line holding several links yields only the first one, not everything up to
# the last "]]" on the line.
re_link = re.compile(r"\[\[.+?\]\]")
# Parenthesised text plus trailing whitespace; greedy, so it runs from the
# first "(" to the last ")" in the searched string.
re_parenthesis = re.compile(r"(\(.+\)\s*)+")
# The whole word "grey" in any letter case ("Grey", "GREY", ...), so the
# spelling is normalized even when it is applied before lower-casing.
re_gray = re.compile(r"\bgrey\b", re.IGNORECASE)

In [4]:
def parse_row(text):
    """Parse one raw line into a dict of normalized attributes.

    The line is treated as pipe-separated ``key=value`` fields, optionally
    wrapped in curly braces. Before splitting, the first match of ``re_link``
    (a ``[[target|label]]`` or ``[[label]]`` wiki link) is replaced by its
    label; when a target is present it is stored under ``'link'``.

    For the ``name`` field, text matched by ``re_parenthesis`` is removed from
    the value and stored, exactly as matched, under ``'tag'``.

    Args:
        text: One line of the raw wiki source.

    Returns:
        dict mapping lower-cased, stripped keys to lower-cased, stripped
        values. Fields without ``=`` are ignored, so a line with no fields
        yields an empty dict (or only ``'link'``).
    """
    record = dict()
    re_result = re_link.search(text)
    if re_result:
        match_text = re_result.group()
        t = match_text.strip('[]')
        if '|' not in t:
            label = t
        else:
            # maxsplit=1: a link body with extra pipes no longer raises
            # "too many values to unpack"; the remainder stays in the label.
            link, label = t.split('|', 1)
            record['link'] = link
        text = text.replace(match_text, label)

    attrs = [t.strip() for t in text.strip('{}').split('|')]
    for attr in attrs:
        if '=' in attr:
            # Split on the first '=' only, so values that contain '=' are kept whole.
            key, value = attr.split('=', 1)
            # Normalize the key before comparing it, so "name = ..." and
            # "Name=..." get the same parenthesis clean-up as "name=...".
            key = key.lower().strip()
            value = re_gray.sub('gray', value)
            if key == 'name':
                re_result = re_parenthesis.search(value)
                if re_result:
                    match_text = re_result.group()
                    value = value.replace(match_text, '').strip()
                    record['tag'] = match_text
            record[key] = value.lower().strip()

    return record

In [5]:
# Parse every raw line, keep only the listed columns, and keep the first row
# for each distinct name. drop_duplicates is chained (no inplace=True) so
# df_color is bound once to its final value.
table = [parse_row(line) for line in raw_text_lines]
df_color = (
    pd.DataFrame.from_records(table, columns=['name','hex','r','g','b','h','s','v'])
    .drop_duplicates('name')
)
df_color.describe()

Unnamed: 0,name,hex,r,g,b,h,s,v
count,1370,1370,1370,1370,1370,1370,1370,1370
unique,1370,1249,242,249,250,312,105,97
top,antique white,967117,255,0,0,0,100,100
freq,1,5,156,69,99,59,243,235


In [6]:
# Preview the first rows of the parsed color table.
df_color.head()

Unnamed: 0,name,hex,r,g,b,h,s,v
0,absolute zero,0048ba,0,72,186,217,100,73
1,acid green,b0bf1a,176,191,26,65,86,75
2,aero,7cb9e8,124,185,232,206,47,91
3,aero blue,c9ffe5,201,255,229,151,21,100
4,african violet,b284be,178,132,190,288,31,75


In [7]:
# Write the de-duplicated table to CSV; index=False omits the DataFrame index column.
df_color.to_csv('./color_names.csv', index=False)

## Find the main colors

In [8]:
# Break every color name into its space-separated words (one list per row).
se_tokens = df_color.loc[:, 'name'].str.split(pat=' ')

In [9]:
# Preview the token lists for the first few names.
se_tokens.head()

0     [absolute, zero]
1        [acid, green]
2               [aero]
3         [aero, blue]
4    [african, violet]
Name: name, dtype: object

In [10]:
# Keep only the final word of each name (e.g. "acid green" -> "green").
# NOTE: despite "lead" in the variable name, this is the LAST token.
se_lead_color = pd.Series([name_tokens[-1] for name_tokens in se_tokens])

In [11]:
# Keep the final words that occur more than 5 times. value_counts() is sorted
# by descending frequency, so this filter keeps the same leading rows, in the
# same order, as slicing with head(k), while counting only once.
se_main_color = se_lead_color.value_counts().loc[lambda counts: counts > 5]

In [12]:
# Display the frequency of each main color word.
se_main_color

blue         131
green        111
pink          75
red           67
yellow        49
orange        36
purple        35
brown         33
gray          25
violet        21
rose          21
gold          14
lavender      13
silver        12
magenta       11
taupe         10
white         10
crimson        9
carmine        8
lime           8
black          8
orchid         7
fuchsia        7
cyan           7
bronze         7
turquoise      6
raspberry      6
indigo         6
azure          6
maroon         6
plum           6
dtype: int64