In [118]:
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
from skimage import color, io

from sklearn.model_selection import train_test_split
import numpy
import os
from PIL import Image
PHOTO_PREFIX="photo_"
BLANK=""

ps = PorterStemmer()
wl = WordNetLemmatizer()

In [119]:
picture_tags = pd.read_csv("./Metadata.csv")
picture_tags.columns = ["tagId","photoId","tagName"]
print(picture_tags.head())
#Normalize words to lower case
picture_tags['tagName'] = picture_tags['tagName'].apply(lambda x:x.lower())
#Stemming words
picture_tags['tagName'] = picture_tags['tagName'].apply(lambda x:ps.stem(x))
#Lemming words
picture_tags['tagName'] = picture_tags['tagName'].apply(lambda x:wl.lemmatize(x))
#Tokenize words - Not Required?
# picture_tags['tagName'] = picture_tags['tagName'].apply(lambda x:(x[:5]))
print("Unique list size %i" % len(picture_tags['tagId'].unique()))
print("Complete list size %i" % len(picture_tags['tagId']))


   tagId  photoId      tagName
0    166      169          Bag
1    167      169       Bucket
2    308     1059          Pen
3    309     1068   Home decor
4    310     1068  Inspiration
Unique list size 2229
Complete list size 2229


In [120]:
gphotoId= picture_tags.groupby(['photoId']).agg({"tagName":lambda x: ",".join(x.tolist())}).reset_index()
gtagsId= picture_tags.groupby(['photoId']).agg({"tagId":lambda x: ",".join(str(v) for v in x.tolist())})


In [121]:
gphotoId['photoId'].head()



0     169
1    1059
2    1068
3    1069
4    1073
Name: photoId, dtype: int64

In [126]:
def get_broken_refs(folder, data):
    broken_refs_photos=[]
    for index, row in data.iterrows():
        if not os.path.exists(os.path.join(folder, str(row['photoId'])+".jpg")):
            broken_refs_photos.append(row['photoId'])
    return broken_refs_photos
def rename_images(folder):
    files = os.listdir(folder)
    for f in files:
        f_new = f.lower().replace(PHOTO_PREFIX, BLANK)
        if f_new != "" and os.path.isfile(os.path.join(folder, f_new)):
            f_new = f_new.split(".")[0]+".jpg"
        else:
            continue
        if f_new:
            os.rename(os.path.join(folder, f), os.path.join(folder, f_new))
    print("Images renamed")
    
def resize_images(folder):
    files=os.listdir(folder)
    for f in files:
            im = Image.open(folder+"/"+f).convert('RGB')
            im=im.resize((350,350),  Image.ANTIALIAS)
            im.save(folder+"/resized/"+f, dpi=(350,350) )
    print("Images resized")

def get_images(folder,data):
    images=[]
    for index, row in data.iterrows():
        im = io.imread(folder+"/"+str(row['photoId'])+".jpg")
        images.append(preprocess_img(im))
    return images
def preprocess_img(img):
    # Histogram normalization in v channel
    hsv = color.rgb2hsv(img)
    img = color.hsv2rgb(hsv)
    # roll color axis to axis 0
    img = numpy.rollaxis(img, -1)

    return img

In [127]:
current_folder = os.getcwd()
workspace = current_folder +"/Images"
# rename_images(workspace)
# resize_images(workspace)

In [128]:
missing_refs = get_broken_refs(workspace+"/resized", gphotoId)
gphotoId = gphotoId[gphotoId['photoId'].isin(missing_refs) == False]
gphotoId.shape

(892, 2)

In [141]:
nltk.edit_distance("home deco", "home decor")
images = get_images(current_folder +"/Images/resized",gphotoId)
gphotoId['picture']=images
gphotoId.shape


[array([[[ 0.49019608,  0.36470588,  0.34117647, ...,  0.49803922,
           0.49019608,  0.48627451],
         [ 0.48235294,  0.34901961,  0.34509804, ...,  0.48627451,
           0.47843137,  0.4745098 ],
         [ 0.46666667,  0.32941176,  0.34509804, ...,  0.47843137,
           0.4745098 ,  0.4745098 ],
         ..., 
         [ 0.07058824,  0.10588235,  0.19215686, ...,  0.16078431,
           0.16078431,  0.16078431],
         [ 0.0627451 ,  0.10588235,  0.21568627, ...,  0.16078431,
           0.16078431,  0.15686275],
         [ 0.05882353,  0.11372549,  0.21960784, ...,  0.16470588,
           0.16078431,  0.15686275]],
 
        [[ 0.49411765,  0.36470588,  0.33333333, ...,  0.43529412,
           0.42745098,  0.42352941],
         [ 0.48627451,  0.34509804,  0.32941176, ...,  0.42352941,
           0.41568627,  0.41176471],
         [ 0.46666667,  0.3254902 ,  0.3254902 , ...,  0.41568627,
           0.41176471,  0.41176471],
         ..., 
         [ 0.11372549,  0.08627

In [142]:
X_train, X_test, y_train, y_test = train_test_split(images, gphotoId['tagName'], test_size=0.2)


[array([[[ 0.96862745,  0.96862745,  0.96862745, ...,  0.98039216,
           0.98039216,  0.98039216],
         [ 0.96862745,  0.96862745,  0.96862745, ...,  0.98039216,
           0.98039216,  0.98039216],
         [ 0.96862745,  0.96862745,  0.96862745, ...,  0.98039216,
           0.98039216,  0.98039216],
         ..., 
         [ 0.        ,  0.        ,  0.00784314, ...,  0.7372549 ,
           0.77254902,  0.77647059],
         [ 0.00784314,  0.01176471,  0.01960784, ...,  0.73333333,
           0.77254902,  0.77254902],
         [ 0.01176471,  0.01176471,  0.01176471, ...,  0.74117647,
           0.77254902,  0.75686275]],
 
        [[ 0.93333333,  0.93333333,  0.93333333, ...,  0.96078431,
           0.96078431,  0.96078431],
         [ 0.93333333,  0.93333333,  0.93333333, ...,  0.96078431,
           0.96078431,  0.96078431],
         [ 0.93333333,  0.93333333,  0.93333333, ...,  0.96078431,
           0.96078431,  0.96078431],
         ..., 
         [ 0.03137255,  0.03921

In [117]:
import numpy
from keras.datasets import cifar10
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.constraints import maxnorm
from keras.optimizers import SGD
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.utils import np_utils
from keras import backend as K
K.set_image_dim_ordering('th')

In [144]:
# X_train
X_train = X_train.astype('float32')
# X_test = X_test.astype('float32')
# X_train = X_train / 255.0
# X_test = X_test / 255.0

AttributeError: 'list' object has no attribute 'astype'

In [84]:
X_train.head()

824    [[[255, 255, 255], [255, 255, 255], [255, 255,...
749    [[[201, 198, 193], [201, 198, 193], [201, 198,...
792    [[[191, 191, 181], [192, 192, 182], [193, 193,...
521    [[[242, 242, 242], [242, 242, 242], [242, 242,...
519    [[[216, 234, 246], [216, 234, 246], [216, 234,...
Name: picture, dtype: object

In [132]:
def img_model():
    model = Sequential()

    model.add(Conv2D(10, (3, 3), padding='same',
         input_shape=(3, 350, 350),
         activation='relu'))
    model.add(Conv2D(10, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.2))

    model.add(Conv2D(20, (3, 3), padding='same',
         activation='relu'))
    model.add(Conv2D(20, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.2))

    model.add(Conv2D(40, (3, 3), padding='same',
         activation='relu'))
    model.add(Conv2D(40, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.2))

    model.add(Flatten())
    model.add(Dense(80, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='softmax'))
    return model


In [133]:
lr = 0.01
sgd = SGD(lr=lr, decay=1e-6, momentum=0.9, nesterov=True)
model = img_model()
model.compile(loss='categorical_crossentropy',
              optimizer=sgd,
              metrics=['accuracy'])

In [143]:
from keras.callbacks import LearningRateScheduler, ModelCheckpoint

def lr_schedule(epoch):
    return lr * (0.1 ** int(epoch / 10))
batch_size = 32
epochs = 30

model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2,
          callbacks=[LearningRateScheduler(lr_schedule),
                     ModelCheckpoint('model.h5', save_best_only=True)]
          )

ValueError: Error when checking model input: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 1 arrays but instead got the following list of 713 arrays: [array([[[ 0.96862745,  0.96862745,  0.96862745, ...,  0.98039216,
          0.98039216,  0.98039216],
        [ 0.96862745,  0.96862745,  0.96862745, ...,  0.98039216,
          0.98039216,  0.980392...