GIST Playground

In [17]:
%cd ..
import os
import glob
import gist
import tqdm
import json
from tqdm.notebook import tqdm

import numpy as np
import matplotlib.image as mpimg

/Users/kx/Docs/github/git_chinese_calligraphy-recognition


## Clustering

In [2]:
# read data
# Expected layout: <data_dir>/<char>/<group>/<image file>.
data_dir = 'data/clustering'
images = {}  # char -> group -> list of image arrays

# sorted() keeps the per-group image order reproducible; glob order is arbitrary.
for file in sorted(glob.glob(os.path.join(data_dir, '*', '*', '*'))):
    # Read char/group from the two directories directly above the file so the
    # parsing does not depend on the path separator or on how deep data_dir is.
    group_dir = os.path.dirname(file)
    group = os.path.basename(group_dir)
    char = os.path.basename(os.path.dirname(group_dir))
    img = mpimg.imread(file)
    images.setdefault(char, {}).setdefault(group, []).append(img)
    
    
def estimate_features(images, extractor):
    """Average the extracted features of every image group.

    Args:
        images: Nested mapping ``char -> group -> list of images``.
        extractor: Callable applied to each image; its results are added
            together and divided by the group size (e.g. NumPy vectors).

    Returns:
        Nested mapping ``char -> group -> list`` holding the element-wise mean
        of the feature vectors extracted from that group's images.
    """
    def mean_vector(group_images):
        # Element-wise mean of the extracted vectors, returned as a plain list.
        vectors = [extractor(picture) for picture in group_images]
        return list(sum(vectors) / len(vectors))

    return {
        char: {group: mean_vector(members) for group, members in groups.items()}
        for char, groups in images.items()
    }

In [7]:
def gist_extractor(image):
    """Compute a truncated GIST descriptor for one image.

    Args:
        image: Array holding a grayscale ``(H, W)`` or multi-channel
            ``(H, W, C)`` image. Floating-point images whose maximum is at
            most 1.0 (what ``matplotlib.image.imread`` returns for PNG files)
            are rescaled to 0-255 before the cast to ``uint8``.

    Returns:
        The first ``nblocks * sum(orientations_per_scale)`` (= 80) entries of
        whatever ``gist.extract`` returns for the image.
    """
    image = np.asarray(image)
    if image.ndim == 2:
        # Grayscale: append a channel axis and replicate it to three channels.
        # Axis 3 (used previously) is out of range for a 2-D input and raises
        # AxisError on current NumPy; the new trailing axis is axis 2.
        image = np.expand_dims(image, 2).repeat(3, axis=2)

    # A bare astype('uint8') truncates floats in [0, 1] to 0 or 1 and throws
    # away nearly all intensity information, so rescale such images first.
    if np.issubdtype(image.dtype, np.floating) and image.size and image.max() <= 1.0:
        image = np.clip(np.rint(image * 255.0), 0, 255)

    nblocks = 4
    ops = (8, 8, 4)
    # NOTE(review): only the first nblocks * sum(ops) values are kept; confirm
    # that this prefix is the intended part of the descriptor.
    return gist.extract(
        image.astype('uint8'),
        nblocks=nblocks,
        orientations_per_scale=ops)[:nblocks * sum(ops)]
    

In [8]:
# Mean GIST descriptor per character and group for the clustering images.
gists = estimate_features(images, gist_extractor)
# Show the result as text; an output of '{}' means `images` held no characters,
# i.e. the glob in the loading cell matched no files.
str(gists)

'{}'

## Clean

In [14]:
# read data
# Both sources are expected to be laid out as <dir>/<char>/<name>.png.
# NOTE: this rebinds `images` to a flat char -> list mapping, unlike the nested
# char -> group -> list mapping built in the Clustering section, so
# estimate_features cannot be applied to it.
data_dir = 'data/shufadict/clean'
ext_data_dir = 'data/hanwen360/clean'
images = {}  # char -> list of image arrays (primary source first, then extended)

for source_dir in (data_dir, ext_data_dir):
    # sorted() keeps the per-character image order reproducible across runs.
    for file in sorted(glob.glob(os.path.join(source_dir, '*', '*.png'))):
        # The character label is the name of the directory containing the file;
        # os.path handles whichever separator the platform uses.
        char = os.path.basename(os.path.dirname(file))
        img = mpimg.imread(file)
        images.setdefault(char, []).append(img)

In [15]:
# char -> list of GIST descriptors, each serialised as a "[v0, v1, ...]" string.
# The text is built from str() of each value instead of str(list(...)): the
# latter embeds the repr of every element, which for NumPy >= 2 scalars looks
# like "np.float32(0.1)" and would make the stored strings unparseable.
img_gists = {}
for i, char in enumerate(images):
    img_gists[char] = [
        '[' + ', '.join(str(value) for value in gist_extractor(img)) + ']'
        for img in tqdm(images[char], desc=str(i))  # one progress bar per character
    ]

0:   0%|          | 0/96 [00:00<?, ?it/s]

1:   0%|          | 0/96 [00:00<?, ?it/s]

2:   0%|          | 0/96 [00:00<?, ?it/s]

3:   0%|          | 0/96 [00:00<?, ?it/s]

4:   0%|          | 0/96 [00:00<?, ?it/s]

5:   0%|          | 0/96 [00:00<?, ?it/s]

6:   0%|          | 0/96 [00:00<?, ?it/s]

7:   0%|          | 0/96 [00:00<?, ?it/s]

8:   0%|          | 0/96 [00:00<?, ?it/s]

9:   0%|          | 0/96 [00:00<?, ?it/s]

10:   0%|          | 0/96 [00:00<?, ?it/s]

11:   0%|          | 0/96 [00:00<?, ?it/s]

12:   0%|          | 0/104 [00:00<?, ?it/s]

13:   0%|          | 0/96 [00:00<?, ?it/s]

14:   0%|          | 0/80 [00:00<?, ?it/s]

15:   0%|          | 0/96 [00:00<?, ?it/s]

16:   0%|          | 0/96 [00:00<?, ?it/s]

17:   0%|          | 0/96 [00:00<?, ?it/s]

18:   0%|          | 0/172 [00:00<?, ?it/s]

19:   0%|          | 0/96 [00:00<?, ?it/s]

20:   0%|          | 0/96 [00:00<?, ?it/s]

21:   0%|          | 0/96 [00:00<?, ?it/s]

22:   0%|          | 0/96 [00:00<?, ?it/s]

23:   0%|          | 0/96 [00:00<?, ?it/s]

24:   0%|          | 0/96 [00:00<?, ?it/s]

25:   0%|          | 0/169 [00:00<?, ?it/s]

26:   0%|          | 0/103 [00:00<?, ?it/s]

27:   0%|          | 0/96 [00:00<?, ?it/s]

28:   0%|          | 0/107 [00:00<?, ?it/s]

29:   0%|          | 0/96 [00:00<?, ?it/s]

30:   0%|          | 0/96 [00:00<?, ?it/s]

31:   0%|          | 0/104 [00:00<?, ?it/s]

32:   0%|          | 0/116 [00:00<?, ?it/s]

33:   0%|          | 0/96 [00:00<?, ?it/s]

34:   0%|          | 0/96 [00:00<?, ?it/s]

35:   0%|          | 0/163 [00:00<?, ?it/s]

36:   0%|          | 0/96 [00:00<?, ?it/s]

37:   0%|          | 0/96 [00:00<?, ?it/s]

38:   0%|          | 0/96 [00:00<?, ?it/s]

39:   0%|          | 0/96 [00:00<?, ?it/s]

40:   0%|          | 0/96 [00:00<?, ?it/s]

41:   0%|          | 0/96 [00:00<?, ?it/s]

42:   0%|          | 0/95 [00:00<?, ?it/s]

43:   0%|          | 0/96 [00:00<?, ?it/s]

44:   0%|          | 0/96 [00:00<?, ?it/s]

45:   0%|          | 0/96 [00:00<?, ?it/s]

46:   0%|          | 0/96 [00:00<?, ?it/s]

47:   0%|          | 0/95 [00:00<?, ?it/s]

48:   0%|          | 0/96 [00:00<?, ?it/s]

49:   0%|          | 0/96 [00:00<?, ?it/s]

50:   0%|          | 0/96 [00:00<?, ?it/s]

51:   0%|          | 0/96 [00:00<?, ?it/s]

52:   0%|          | 0/96 [00:00<?, ?it/s]

53:   0%|          | 0/111 [00:00<?, ?it/s]

54:   0%|          | 0/96 [00:00<?, ?it/s]

55:   0%|          | 0/89 [00:00<?, ?it/s]

56:   0%|          | 0/96 [00:00<?, ?it/s]

57:   0%|          | 0/96 [00:00<?, ?it/s]

58:   0%|          | 0/96 [00:00<?, ?it/s]

59:   0%|          | 0/96 [00:00<?, ?it/s]

60:   0%|          | 0/96 [00:00<?, ?it/s]

61:   0%|          | 0/96 [00:00<?, ?it/s]

62:   0%|          | 0/71 [00:00<?, ?it/s]

63:   0%|          | 0/96 [00:00<?, ?it/s]

64:   0%|          | 0/96 [00:00<?, ?it/s]

65:   0%|          | 0/96 [00:00<?, ?it/s]

66:   0%|          | 0/96 [00:00<?, ?it/s]

67:   0%|          | 0/96 [00:00<?, ?it/s]

68:   0%|          | 0/123 [00:00<?, ?it/s]

69:   0%|          | 0/96 [00:00<?, ?it/s]

70:   0%|          | 0/157 [00:00<?, ?it/s]

71:   0%|          | 0/94 [00:00<?, ?it/s]

72:   0%|          | 0/95 [00:00<?, ?it/s]

73:   0%|          | 0/154 [00:00<?, ?it/s]

74:   0%|          | 0/96 [00:00<?, ?it/s]

75:   0%|          | 0/111 [00:00<?, ?it/s]

76:   0%|          | 0/79 [00:00<?, ?it/s]

77:   0%|          | 0/96 [00:00<?, ?it/s]

78:   0%|          | 0/96 [00:00<?, ?it/s]

79:   0%|          | 0/141 [00:00<?, ?it/s]

80:   0%|          | 0/96 [00:00<?, ?it/s]

81:   0%|          | 0/96 [00:00<?, ?it/s]

82:   0%|          | 0/96 [00:00<?, ?it/s]

83:   0%|          | 0/96 [00:00<?, ?it/s]

84:   0%|          | 0/87 [00:00<?, ?it/s]

85:   0%|          | 0/96 [00:00<?, ?it/s]

86:   0%|          | 0/96 [00:00<?, ?it/s]

87:   0%|          | 0/96 [00:00<?, ?it/s]

88:   0%|          | 0/96 [00:00<?, ?it/s]

89:   0%|          | 0/112 [00:00<?, ?it/s]

90:   0%|          | 0/96 [00:00<?, ?it/s]

91:   0%|          | 0/96 [00:00<?, ?it/s]

92:   0%|          | 0/96 [00:00<?, ?it/s]

93:   0%|          | 0/104 [00:00<?, ?it/s]

94:   0%|          | 0/96 [00:00<?, ?it/s]

95:   0%|          | 0/96 [00:00<?, ?it/s]

96:   0%|          | 0/95 [00:00<?, ?it/s]

97:   0%|          | 0/96 [00:00<?, ?it/s]

98:   0%|          | 0/96 [00:00<?, ?it/s]

99:   0%|          | 0/96 [00:00<?, ?it/s]

In [18]:
# Write the descriptors to ~/Desktop/gist.json. expanduser() resolves the home
# directory at run time instead of hard-coding one user's absolute path.
gist_json_path = os.path.expanduser('~/Desktop/gist.json')
with open(gist_json_path, 'w') as f:
    json.dump(img_gists, f)