In [118]:
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
from skimage import color, io

from sklearn.model_selection import train_test_split
import numpy
import os
from PIL import Image
PHOTO_PREFIX="photo_"
BLANK=""

ps = PorterStemmer()
wl = WordNetLemmatizer()

In [269]:
picture_tags = pd.read_csv("./Metadata.csv")
picture_tags.columns = ["tagId","photoId","tagName"]
print(picture_tags.head())
#Normalize words to lower case
picture_tags['tagName'] = picture_tags['tagName'].apply(lambda x:x.lower())
#Stemming words
picture_tags['tagName'] = picture_tags['tagName'].apply(lambda x:ps.stem(x))
#Lemming words
picture_tags['tagName'] = picture_tags['tagName'].apply(lambda x:wl.lemmatize(x))
#Tokenize words - Not Required?
# picture_tags['tagName'] = picture_tags['tagName'].apply(lambda x:(x[:5]))
print("Unique list size %i" % len(picture_tags['tagId'].unique()))
print("Complete list size %i" % len(picture_tags['tagId']))


   tagId  photoId      tagName
0    166      169          Bag
1    167      169       Bucket
2    308     1059          Pen
3    309     1068   Home decor
4    310     1068  Inspiration
Unique list size 2229
Complete list size 2229


In [270]:
gphotoId= picture_tags.groupby(['photoId']).agg({"tagName":lambda x: ",".join(x.tolist())}).reset_index()
gtagsId= picture_tags.groupby(['photoId']).agg({"tagId":lambda x: ",".join(str(v) for v in x.tolist())})


In [271]:
picture_tags.head()



Unnamed: 0,tagId,photoId,tagName
0,166,169,bag
1,167,169,bucket
2,308,1059,pen
3,309,1068,home decor
4,310,1068,inspir


In [146]:
def get_broken_refs(folder, data):
    broken_refs_photos=[]
    for index, row in data.iterrows():
        if not os.path.exists(os.path.join(folder, str(row['photoId'])+".jpg")):
            broken_refs_photos.append(row['photoId'])
    return broken_refs_photos
def rename_images(folder):
    files = os.listdir(folder)
    for f in files:
        f_new = f.lower().replace(PHOTO_PREFIX, BLANK)
        if f_new != "" and os.path.isfile(os.path.join(folder, f_new)):
            f_new = f_new.split(".")[0]+".jpg"
        else:
            continue
        if f_new:
            os.rename(os.path.join(folder, f), os.path.join(folder, f_new))
    print("Images renamed")
    
def resize_images(folder):
    files=os.listdir(folder)
    for f in files:
            im = Image.open(folder+"/"+f).convert('RGB')
            im=im.resize((350,350),  Image.ANTIALIAS)
            im.save(folder+"/resized/"+f, dpi=(350,350) )
    print("Images resized")

def get_images(folder,data):
    images=[]
    for index, row in data.iterrows():
        im = io.imread(folder+"/"+str(row['photoId'])+".jpg")
        images.append(preprocess_img(im))
    return numpy.array(images)
def preprocess_img(img):
    # Histogram normalization in v channel
    hsv = color.rgb2hsv(img)
    img = color.hsv2rgb(hsv)
    # roll color axis to axis 0
    img = numpy.rollaxis(img, -1)

    return img

In [127]:
current_folder = os.getcwd()
workspace = current_folder +"/Images"
# rename_images(workspace)
# resize_images(workspace)

In [272]:
missing_refs = get_broken_refs(workspace+"/resized", picture_tags)
picture_tags = picture_tags[picture_tags['photoId'].isin(missing_refs) == False]
gphotoId.shape

(905, 2)

In [273]:
nltk.edit_distance("home deco", "home decor")
images = get_images(current_folder +"/Images/resized",picture_tags)

# images = get_images(current_folder +"/Images/resized",gphotoId)
# gphotoId['picture']=images
# gphotoId.shape


In [265]:
# import numpy as np
# tags = gphotoId['tagName'].apply(lambda x : x.split(','))
# alltagsList = []
# for i in tags:
#     for _ in i:
#         alltagsList.append(_)
        
# from collections import Counter

# tagsCounter = Counter(alltagsList)
# tagsCounter = dict(tagsCounter)
# count = 0
# for key,value in tagsCounter.items():
#     tagsCounter[key] = count
#     count += 1

# listtags = {}


# for i in tags:
#     for ch in i:
#         listtags[ch]= tagsCounter[ch]
# #     listIdx += 1
# pprint.pprint(listtags)

In [274]:
picture_tags.head()

Unnamed: 0,tagId,photoId,tagName
0,166,169,bag
1,167,169,bucket
2,308,1059,pen
3,309,1068,home decor
4,310,1068,inspir


In [275]:
X_train, X_test, y_train, y_test = train_test_split(images,picture_tags['tagId'], test_size=0.2)


In [276]:
import numpy
from keras.datasets import cifar10
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.constraints import maxnorm
from keras.optimizers import SGD
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.utils import np_utils
from keras import backend as K
K.set_image_dim_ordering('th')

In [277]:
X_train
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
# X_train = X_train / 255.0
# X_test = X_test / 255.0

In [283]:
y_train = y_train/7278

In [279]:
def img_model():
    model = Sequential()

    model.add(Conv2D(10, (3, 3), padding='same',
         input_shape=(3, 350, 350),
         activation='relu'))
    model.add(Conv2D(10, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.2))

    model.add(Conv2D(20, (3, 3), padding='same',
         activation='relu'))
    model.add(Conv2D(20, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.2))

    model.add(Conv2D(40, (3, 3), padding='same',
         activation='relu'))
    model.add(Conv2D(40, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.2))

    model.add(Flatten())
    model.add(Dense(80, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='softmax'))
    return model


In [280]:
lr = 0.5
sgd = SGD(lr=lr, decay=1e-6, momentum=0.9, nesterov=True)
model = img_model()
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=sgd,
              metrics=['accuracy'])

In [None]:
from keras.callbacks import LearningRateScheduler, ModelCheckpoint

def lr_schedule(epoch):
    return lr * (0.1 ** int(epoch / 10))
batch_size = 32
epochs = 10

model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2,
          callbacks=[LearningRateScheduler(lr_schedule)]
          )

Train on 1401 samples, validate on 351 samples
Epoch 1/30
  96/1401 [=>............................] - ETA: 520s - loss: 0.0000e+00 - acc: 0.0000e+00

### 