In [1]:
# Train a multilabel classifier: input = description, output = tags.
# For each image, use KNN to find the top 20 most visually similar images based on ResNet features.
# At query time, predict tags from the description.
# Find the most similar tag vector.
# Find the corresponding image.
# Find the top 20 images related to it.

from pprint import pprint
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import pos_tag
from autocorrect import spell
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.neighbors import NearestNeighbors as KNN
import csv
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression

### Tags 0-1

In [3]:
def process_tags(in_path, out_name, isTrain):
    """Build a per-image tag count matrix from tag files, save it and return it.

    Reads ``in_path + str(i) + '.txt'`` for ``i`` in ``range(cnt)``, where
    ``cnt`` is 10000 when ``isTrain`` is true and 2000 otherwise. For every
    non-blank line, the second ``:``-separated field (with spaces removed) is
    taken as the tag.

    Parameters
    ----------
    in_path : str
        Directory prefix of the tag files (must end with a path separator).
    out_name : str
        Target passed to ``np.save`` (numpy appends ``.npy`` if it is missing).
    isTrain : bool
        Selects how many files are read (10000 vs. 2000).

    Returns
    -------
    numpy.ndarray
        One row per image and one column per distinct tag, in order of first
        appearance. Entries are CountVectorizer counts, i.e. 0/1 as long as a
        tag is listed at most once per image.
    """
    cnt = 10000 if isTrain else 2000

    tags_pool = []      # distinct tags, in order of first appearance
    seen_tags = set()   # same content as tags_pool, for O(1) membership tests
    tags = []           # one list of tags per image
    for i in range(cnt):
        tag_path = in_path + str(i) + '.txt'
        # Context manager so the handle is closed even if parsing fails.
        with open(tag_path, 'r') as tag_file:
            lines = tag_file.readlines()

        img_tag = []
        for line in lines:
            if not line.strip():
                # A blank line has no ':' field and would raise IndexError.
                continue
            tag = line.strip("\n").split(":")[1].replace(' ', '')
            img_tag.append(tag)
            if tag not in seen_tags:
                seen_tags.add(tag)
                tags_pool.append(tag)
        tags.append(img_tag)
    print(tags_pool)
    print(len(tags_pool))
    # Only show a sample: printing every image's tag list floods the output.
    pprint(tags[:5])
    print(len(tags))

    # A fixed vocabulary pins the column order to tags_pool.
    # NOTE(review): CountVectorizer lowercases documents and drops 1-character
    # tokens by default - confirm every tag survives that tokenisation.
    cv = CountVectorizer(vocabulary = tags_pool)
    final_tags = [' '.join(tag) for tag in tags]
    print(final_tags[:5])
    tags_0_1 = cv.fit_transform(final_tags).toarray()
    print(tags_0_1[:5])

    np.save(out_name, tags_0_1)
    print('Saved tags_0_1 to ' + out_name + '.')

    # Callers assign the result of this function, so hand the matrix back.
    return tags_0_1

In [5]:
# Load the cached tag matrices, or build them from the raw tag files.
# The fallback writes directly to the cache path that is checked here (np.save
# keeps a name that already ends in '.npy'), then reads the file back, so the
# variable is populated regardless of what process_tags returns.
train_tags_path = 'preprocessing/tags_train_0_1.npy'
if os.path.exists(train_tags_path):
    tags_train_0_1 = np.load(train_tags_path)
    print('Loaded tags_0_1 from ' + train_tags_path + '.')
else:
    os.makedirs('preprocessing', exist_ok=True)
    process_tags('data/tags_train/', train_tags_path, True)
    tags_train_0_1 = np.load(train_tags_path)

test_tags_path = 'preprocessing/tags_2000_0_1.npy'
if os.path.exists(test_tags_path):
    tags_2000_0_1 = np.load(test_tags_path)
    print('Loaded tags_0_1 from ' + test_tags_path + '.')
else:
    os.makedirs('preprocessing', exist_ok=True)
    # isTrain must be False here: process_tags then reads 2000 files instead
    # of 10000.
    process_tags('data/tags_test/', test_tags_path, False)
    tags_2000_0_1 = np.load(test_tags_path)

Loaded tags_0_1 from preprocessing/tags_train_0_1.npy.
Loaded tags_0_1 from preprocessing/tags_2000_0_1.npy.


### Load TFIDF all

In [6]:
def _normalize_description(text, tokenizer, lmtzr, stop_words, isNoun):
    """Turn one raw description into a space-joined string of cleaned tokens."""
    tokens = tokenizer.tokenize(text)
    if isNoun:
        # Keep only the tokens that the POS tagger labels 'NN'.
        words = [word.lower() for word, pos in pos_tag(tokens) if pos == 'NN']
    else:
        # Spell-correct every lower-cased token; POS tagging is not needed here.
        words = [spell(token.lower()) for token in tokens]

    # Lemmatize successively as verb, noun, adjective and adverb.
    for pos in ('v', 'n', 'a', 'r'):
        words = [lmtzr.lemmatize(word, pos) for word in words]
    return ' '.join(word for word in words if word not in stop_words)


def _read_descriptions(dir_path, count, tokenizer, lmtzr, stop_words, isNoun):
    """Read ``dir_path + '<i>.txt'`` for i < count and normalize each file."""
    docs = []
    for i in range(count):
        # Context manager so every description file is closed after reading.
        with open(dir_path + str(i) + '.txt', 'r') as desc_file:
            text = ' '.join(desc_file.readlines())
        docs.append(_normalize_description(text, tokenizer, lmtzr, stop_words, isNoun))
    return docs


def process_corpus(train_path, test_path, out_name, isNoun):
    """Preprocess the train and test descriptions, save and return the corpus.

    Parameters
    ----------
    train_path : str
        Directory prefix holding ``0.txt`` .. ``9999.txt``.
    test_path : str
        Directory prefix holding ``0.txt`` .. ``1999.txt``.
    out_name : str
        Target passed to ``np.save`` (numpy appends ``.npy`` if it is missing).
    isNoun : bool
        If true keep only tokens tagged 'NN'; otherwise keep every token after
        spell correction.

    Returns
    -------
    list of str
        The 10000 processed training descriptions followed by the 2000
        processed test descriptions.
    """
    n_train, n_test = 10000, 2000

    tokenizer = RegexpTokenizer(r'\w+')
    lmtzr = WordNetLemmatizer()
    # Load the stop-word list once instead of once per token.
    stop_words = set(stopwords.words('english'))

    corpus = _read_descriptions(train_path, n_train, tokenizer, lmtzr, stop_words, isNoun)
    print(corpus[:5])
    print('Done processing training descriptions.')

    query_corpus = _read_descriptions(test_path, n_test, tokenizer, lmtzr, stop_words, isNoun)
    print(query_corpus[:5])
    print('Done processing query descriptions.')

    corpus_all = corpus + query_corpus
    print('Merged.')

    np.save(out_name, corpus_all)
    print('Saved corpus to ' + out_name + '.')

    # Callers assign the result of this function, so hand the corpus back.
    return corpus_all

In [7]:
def process_tfidf(corpus, out_name):
    """Compute a dense TF-IDF matrix for ``corpus``, save it and return it.

    Parameters
    ----------
    corpus : iterable of str
        One preprocessed document per entry.
    out_name : str
        Target passed to ``np.save`` (numpy appends ``.npy`` if it is missing).

    Returns
    -------
    numpy.ndarray
        Dense TF-IDF matrix with one row per document and one column per
        vocabulary term that occurs in at least 3 documents (``min_df = 3``).
    """
    cv = CountVectorizer(min_df = 3)
    # Keep the counts sparse; TfidfTransformer accepts the sparse matrix
    # directly, so only the final result is densified.
    X_all_bow = cv.fit_transform(corpus)

    # get_feature_names() was removed from recent scikit-learn releases in
    # favour of get_feature_names_out(); support both.
    if hasattr(cv, 'get_feature_names_out'):
        vocab = np.asarray(cv.get_feature_names_out())
    else:
        vocab = np.array(cv.get_feature_names())

    transformer = TfidfTransformer()
    X_all_tfidf = transformer.fit_transform(X_all_bow).toarray()

    print(vocab)
    print('vocab.shape:', vocab.shape)
    print(X_all_tfidf[:10])

    np.save(out_name, X_all_tfidf)
    print('Saved TFIDF to ' + out_name + '.')

    # Callers assign the result of this function, so hand the matrix back.
    return X_all_tfidf

In [10]:
# Load the cached TF-IDF matrix, or rebuild it from the raw descriptions.
# The fallback writes both intermediate files under preprocessing/ (the
# directory the cache check looks in) and reads them back, so tfidf_all is a
# real array regardless of what the helper functions return.
tfidf_path = 'preprocessing/X_all_tfidf.npy'
if os.path.exists(tfidf_path):
    tfidf_all = np.load(tfidf_path)
    print('Loaded TFIDF from file.')
    print(tfidf_all[:5])
else:
    os.makedirs('preprocessing', exist_ok=True)
    corpus_path = 'preprocessing/corpus_all.npy'
    process_corpus('data/descriptions_train/', 'data/descriptions_test/', corpus_path, False)
    corpus_all = np.load(corpus_path)
    process_tfidf(corpus_all, tfidf_path)
    tfidf_all = np.load(tfidf_path)

# The corpus is built as 10000 training descriptions followed by the test ones.
tfidf_train = tfidf_all[:10000]
tfidf_test = tfidf_all[10000:]
print(tfidf_train.shape, tfidf_test.shape)

Loaded TFIDF from file.
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(10000, 3311) (2000, 3311)


In [9]:
# opt
# Optional: project the TF-IDF features onto 1000 principal components.
from sklearn.decomposition import PCA
# With svd_solver='auto' scikit-learn may choose the randomized solver for a
# matrix of this size; fixing random_state keeps the projection reproducible
# across kernel restarts.
pca_tfidf = PCA(n_components=1000, svd_solver='auto', random_state=0)
pca_tfidf.fit(tfidf_all)
X_all_tfidf_pca = pca_tfidf.transform(tfidf_all)
# Same row layout as tfidf_all: first 10000 rows are training descriptions.
X_tfidf_pca = X_all_tfidf_pca[:10000]
X_query_tfidf_pca = X_all_tfidf_pca[10000:]

### Logistic regression, in: description TFIDFs, out: pred tags

In [11]:
# Train one independent binary logistic regression per tag column (80 tags)
# on the training TF-IDF features and predict that tag for every test
# description.
predict_tags = []
for i in range(80):
    clf = LogisticRegression().fit(tfidf_train, tags_train_0_1[:, i])
    predict_tags.append(clf.predict(tfidf_test))

# Stack the per-tag predictions and transpose to (n_descriptions, n_tags).
predict_tags_trans = np.transpose(np.asarray(predict_tags))
print(predict_tags_trans.shape)

(2000, 80)


In [51]:
# opt
# Same per-tag logistic regression as above, but trained on the PCA-reduced
# TF-IDF features.
predict_tags_pca = []
for i in range(80):
    clf = LogisticRegression().fit(X_tfidf_pca, tags_train_0_1[:, i])
    predict_tags_pca.append(clf.predict(X_query_tfidf_pca))

# Stack the per-tag predictions and transpose to (n_descriptions, n_tags).
predict_tags_pca_trans = np.transpose(np.asarray(predict_tags_pca))
print(predict_tags_pca_trans.shape)

(2000, 80)


### KNN, fit with all test tags, in: query tag, out: img id

In [12]:
# Index the test images by their tag vectors, then look up the 20 images
# whose tags are closest to each description's predicted tag vector.
near = KNN(n_neighbors = 20)
near.fit(tags_2000_0_1)
preds = near.kneighbors(predict_tags_trans, return_distance = False)
print(preds)
print(preds.shape)

[[1862 1698  359 ...  388  281  465]
 [ 589  833 1341 ...  171  948  763]
 [1840  649 1724 ... 1089  283  953]
 ...
 [ 104  199 1481 ...  674   58  429]
 [1429  141 1660 ...  269  151  145]
 [ 926 1342  729 ...  308  468 1135]]
(2000, 20)


In [52]:
# opt
# Same 20-nearest-neighbour lookup on the test tag vectors, queried with the
# tags predicted from the PCA-reduced features.
near_pca = KNN(n_neighbors = 20)
near_pca.fit(tags_2000_0_1)
preds_pca = near_pca.kneighbors(predict_tags_pca_trans, return_distance = False)
print(preds_pca.shape)

(2000, 20)


### (bad) 1NN, in: query tag, out: nearest img id; KNN, fit with imgf, in: nearest imgf, out: 20 img ids

In [13]:
def process_imgf(in_name, out_name):
    """Load image features from a CSV file, order them by image id and save them.

    Each non-blank line is expected to look like ``<dir>/<id>.<ext>,f1,f2,...``:
    the integer id is parsed from the part of the first field between the
    first ``/`` and the following ``.``; the remaining fields are floats.

    Parameters
    ----------
    in_name : str
        Path of the CSV feature file.
    out_name : str
        Target passed to ``np.save`` (numpy appends ``.npy`` if it is missing).

    Returns
    -------
    numpy.ndarray
        Feature rows sorted by ascending image id.
    """
    imgf = {}
    # Context manager closes the file; iterate lazily instead of readlines().
    with open(in_name, 'r') as csvfile:
        for line in csvfile:
            if not line.strip():
                # Blank lines (e.g. a trailing one) carry no record.
                continue
            fields = line.strip().split(",")
            iid = int(fields[0].split("/")[1].split(".")[0])
            imgf[iid] = np.asarray([float(s) for s in fields[1:]])

    sorted_imgf = np.asarray([imgf[key] for key in sorted(imgf)])
    np.save(out_name, sorted_imgf)
    print('Sorted ' + in_name + ' saved to ' + out_name + '.')

    return sorted_imgf

In [14]:
def _cached_imgf(cache_path, csv_path, out_name):
    """Return the features stored at cache_path, or build them with process_imgf."""
    if not os.path.exists(cache_path):
        return process_imgf(csv_path, out_name)
    features = np.load(cache_path)
    print('Loaded image feature from ' + cache_path + '.')
    return features


# NOTE(review): the cache files are named sorted_fc1000*, but the fallback
# reads the "intermediate" CSVs and saves under sorted_pool5* in the working
# directory, so a rebuilt file is never found by the cache check - confirm
# which feature set is intended and align the names.
train_imf_path = 'preprocessing/sorted_fc1000.npy'
sorted_fc1000 = _cached_imgf(
    train_imf_path,
    'data/features_train/features_resnet1000intermediate_train.csv',
    'sorted_pool5',
)

test_imgf_path = 'preprocessing/sorted_fc1000_test.npy'
sorted_fc1000_test = _cached_imgf(
    test_imgf_path,
    'data/features_test/features_resnet1000intermediate_test.csv',
    'sorted_pool5_test',
)

Loaded image feature from preprocessing/sorted_fc1000.npy.
Loaded image feature from preprocessing/sorted_fc1000_test.npy.


In [15]:
# For every description, find the single test image whose tag vector is
# closest to the predicted tag vector.
nearest = KNN(n_neighbors = 1)
nearest.fit(tags_2000_0_1)
preds_1 = nearest.kneighbors(predict_tags_trans, return_distance = False)
print(preds_1.shape)

(2000, 1)


In [16]:
# For each description, take the image picked by the 1-NN tag lookup and
# retrieve the 20 test images whose features are closest to it.
near_img = KNN(n_neighbors = 20).fit(sorted_fc1000_test)
# preds_1 holds one neighbour index per description in a single column;
# flatten it so all anchor images can be queried in one batched call instead
# of a Python loop over a hard-coded 2000 rows.
nearest_img_ids = preds_1[:, 0]
preds_img = near_img.kneighbors(sorted_fc1000_test[nearest_img_ids], return_distance = False)
print(preds_img[:10])

[[1698   68 1243  916 1816 1092  359 1451  912  721 1384  292  670 1231
   742  164  637  416  808 1076]
 [ 589 1068 1147  818  835 1199 1143 1714 1054 1645  201  107  261 1184
    80  482  457  668 1733 1102]
 [ 184 1027 1475 1648 1471  860  904 1421  696 1482  496  338 1962   42
  1437 1023 1040 1183 1084 1413]
 [  75 1035 1112  799 1513 1315  427 1486 1850  153  677 1279 1394  686
   661  770  451  417   66  930]
 [  36  669  683    3 1040  360   46  511 1919  153 1479  969  881 1413
  1902  604   68  781  588 1180]
 [ 274 1913 1558 1886 1837 1882 1254  829   77  846   81 1974 1206 1850
  1339  268 1633  947 1018  673]
 [  33 1265 1191 1020 1885  244 1153  367 1651  489  249  977  309 1655
  1044  899 1940  458 1889  414]
 [1949  975 1363 1857  932 1961 1694  689 1085 1123 1900 1488 1249  770
   409  448  685  987 1270  805]
 [ 266 1600 1031 1644 1348 1389  848  719 1004 1896 1261  923 1901 1336
  1557  459 1656  343 1010 1847]
 [ 274 1913 1558 1886 1837 1882 1254  829   77  846   8

### Format submission

In [17]:
def format_submission(preds, out_name):
    """Write predictions to a two-column CSV submission file.

    Parameters
    ----------
    preds : iterable of iterables
        One sequence of image ids per description, in description order.
    out_name : str
        Path of the CSV file to write.

    Row ``i`` of the file is ``'<i>.txt'`` followed by the ids of ``preds[i]``
    rendered as ``'<id>.jpg'`` and joined with single spaces.
    """
    out = [' '.join(str(iid) + '.jpg' for iid in pred) for pred in preds]
    print(out[:10])

    # One description file name per prediction row; sizing this from the data
    # avoids silently dropping rows beyond a hard-coded count.
    out_files = [str(i) + '.txt' for i in range(len(out))]

    # newline='' is what the csv module requires so that it controls line
    # endings itself (otherwise extra blank rows appear on some platforms).
    with open(out_name, 'w', newline='') as f:
        writer = csv.writer(f)
        # NOTE(review): 'Descritpion_ID' is misspelled but kept byte-for-byte;
        # the consumer of this file may require exactly this header - confirm
        # before changing it.
        writer.writerow(['Descritpion_ID', 'Top_20_Image_IDs'])
        writer.writerows(zip(out_files, out))
    print('Submission:', out_name)
                         

In [18]:
# Write the predictions from the "1-NN on tags, then 20-NN on image features"
# approach (preds_img) to the submission CSV.
format_submission(preds_img, '1tag20imgs.csv')

['1698.jpg 68.jpg 1243.jpg 916.jpg 1816.jpg 1092.jpg 359.jpg 1451.jpg 912.jpg 721.jpg 1384.jpg 292.jpg 670.jpg 1231.jpg 742.jpg 164.jpg 637.jpg 416.jpg 808.jpg 1076.jpg', '589.jpg 1068.jpg 1147.jpg 818.jpg 835.jpg 1199.jpg 1143.jpg 1714.jpg 1054.jpg 1645.jpg 201.jpg 107.jpg 261.jpg 1184.jpg 80.jpg 482.jpg 457.jpg 668.jpg 1733.jpg 1102.jpg', '184.jpg 1027.jpg 1475.jpg 1648.jpg 1471.jpg 860.jpg 904.jpg 1421.jpg 696.jpg 1482.jpg 496.jpg 338.jpg 1962.jpg 42.jpg 1437.jpg 1023.jpg 1040.jpg 1183.jpg 1084.jpg 1413.jpg', '75.jpg 1035.jpg 1112.jpg 799.jpg 1513.jpg 1315.jpg 427.jpg 1486.jpg 1850.jpg 153.jpg 677.jpg 1279.jpg 1394.jpg 686.jpg 661.jpg 770.jpg 451.jpg 417.jpg 66.jpg 930.jpg', '36.jpg 669.jpg 683.jpg 3.jpg 1040.jpg 360.jpg 46.jpg 511.jpg 1919.jpg 153.jpg 1479.jpg 969.jpg 881.jpg 1413.jpg 1902.jpg 604.jpg 68.jpg 781.jpg 588.jpg 1180.jpg', '274.jpg 1913.jpg 1558.jpg 1886.jpg 1837.jpg 1882.jpg 1254.jpg 829.jpg 77.jpg 846.jpg 81.jpg 1974.jpg 1206.jpg 1850.jpg 1339.jpg 268.jpg 1633.jpg 947