In [1]:
# train multilabel classifier, input = description, output = tags
# for each image, use KNN on ResNet features to find the top 20 most visually similar images
# query time, predict tags based on description
# find most similar tag
# find the corresponding image
# find top 20 images related

from pprint import pprint
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import pos_tag
from autocorrect import spell
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.neighbors import NearestNeighbors as KNN
import csv
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

### Tags 0-1

In [2]:
def process_tags(in_path, out_name, isTrain):
    """Build an image-by-tag count matrix from per-image tag files and save it.

    Reads ``<in_path><i>.txt`` for ``i`` in ``range(cnt)``, where ``cnt`` is
    10000 when ``isTrain`` is true and 2000 otherwise. Every line containing a
    ``:`` contributes one tag: the second ``:``-separated field with all
    spaces removed.

    Parameters
    ----------
    in_path : str
        Prefix of the tag files. It is concatenated directly with the file
        name, so it must end with a path separator.
    out_name : str
        Destination handed to ``np.save`` (NumPy appends ``.npy`` when the
        name does not already end with it).
    isTrain : bool
        Selects how many files are read (10000 vs. 2000).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(cnt, n_tags)`` holding, for each image, how often
        each tag occurs (0/1 when a tag appears at most once per file).
        Columns follow the alphabetically sorted tag vocabulary.
    """
    cnt = 10000 if isTrain else 2000

    tags_seen = set()   # set membership is O(1); the vocabulary is sorted below
    tags = []
    for i in range(cnt):
        tag_path = in_path + str(i) + '.txt'
        img_tag = []
        # Context manager so the handle is closed even if parsing fails.
        with open(tag_path, 'r') as tag_file:
            for line in tag_file:
                if ':' not in line:
                    # Blank or malformed line: there is no second field to read.
                    continue
                tag = line.strip("\n").split(":")[1].replace(' ', '')
                img_tag.append(tag)
                tags_seen.add(tag)
        tags.append(img_tag)

    # Sort so the column order does not depend on the order in which tags are
    # first encountered; two splits containing the same tag set then produce
    # identically ordered columns.
    tags_pool = sorted(tags_seen)
    print(tags_pool)
    print(len(tags_pool))
    pprint(tags[:5])    # preview only; printing every image floods the output
    print(len(tags))

    cv = CountVectorizer(vocabulary = tags_pool)
    final_tags = [' '.join(tag) for tag in tags]
    print(final_tags[:5])
    tags_0_1 = cv.fit_transform(final_tags).toarray()
    print(tags_0_1[:5])

    np.save(out_name, tags_0_1)
    print('Saved tags_0_1 to ' + out_name + '.')
    # Return the matrix so callers can use it without reloading from disk.
    return tags_0_1

In [3]:
# Load the cached tag matrices, or build them from the raw tag files on a
# cache miss. The matrix is written to the same path the cache check looks at
# and then read back from it, so a later run hits the cache.
train_tags_path = 'preprocessing/tags_train_0_1.npy'
if os.path.exists(train_tags_path):
    tags_train_0_1 = np.load(train_tags_path)
    print('Loaded tags_0_1 from ' + train_tags_path + '.')
else:
    os.makedirs(os.path.dirname(train_tags_path), exist_ok=True)
    process_tags('data/tags_train/', train_tags_path, True)
    tags_train_0_1 = np.load(train_tags_path)

test_tags_path = 'preprocessing/tags_2000_0_1.npy'
if os.path.exists(test_tags_path):
    tags_2000_0_1 = np.load(test_tags_path)
    print('Loaded tags_0_1 from ' + test_tags_path + '.')
else:
    os.makedirs(os.path.dirname(test_tags_path), exist_ok=True)
    # isTrain=False: the test split has 2000 tag files, not 10000.
    process_tags('data/tags_test/', test_tags_path, False)
    tags_2000_0_1 = np.load(test_tags_path)

Loaded tags_0_1 from preprocessing/tags_train_0_1.npy.
Loaded tags_0_1 from preprocessing/tags_2000_0_1.npy.


### Load TFIDF all

In [4]:
def _clean_description(path, tokenizer, lmtzr, stop_words, isNoun):
    """Read one description file and return its normalised text as one string."""
    with open(path, 'r') as desc_file:
        desc = ' '.join(desc_file.readlines())

    tokens = tokenizer.tokenize(desc)
    if isNoun:
        # POS tagging is only needed on this branch.
        words = [word.lower() for word, pos in pos_tag(tokens) if pos == 'NN']
    else:
        words = [spell(token.lower()) for token in tokens]

    # Lemmatise as verb, noun, adjective, then adverb, in that order.
    for pos in ("v", "n", "a", "r"):
        words = [lmtzr.lemmatize(word, pos) for word in words]
    return ' '.join(word for word in words if word not in stop_words)


def process_corpus(train_path, test_path, out_name, isNoun, n_train=10000, n_test=2000):
    """Normalise the training and query descriptions and save them as one corpus.

    Each file ``<path><i>.txt`` is tokenised on ``\\w+``. The tokens are either
    reduced to lower-cased words tagged ``NN`` (``isNoun`` true) or lower-cased
    and passed through ``spell`` (``isNoun`` false), then lemmatised and
    stripped of English stop words.

    Parameters
    ----------
    train_path, test_path : str
        Prefixes of the training and query description files. They are
        concatenated directly with the file name, so they must end with a
        path separator.
    out_name : str
        Destination handed to ``np.save``.
    isNoun : bool
        Keep only ``NN``-tagged tokens instead of spell-correcting every token.
    n_train, n_test : int, optional
        Number of training and query files to read (default 10000 and 2000).

    Returns
    -------
    list of str
        The ``n_train`` training documents followed by the ``n_test`` query
        documents.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    lmtzr = WordNetLemmatizer()
    # Built once: calling stopwords.words() per token re-creates the list
    # every time and then scans it linearly.
    stop_words = set(stopwords.words('english'))

    corpus = [
        _clean_description(train_path + str(i) + '.txt', tokenizer, lmtzr, stop_words, isNoun)
        for i in range(n_train)
    ]
    print(corpus[:5])
    print('Done processing training descriptions.')

    query_corpus = [
        _clean_description(test_path + str(i) + '.txt', tokenizer, lmtzr, stop_words, isNoun)
        for i in range(n_test)
    ]
    print(query_corpus[:5])
    print('Done processing query descriptions.')

    corpus_all = corpus + query_corpus
    print('Merged.')

    np.save(out_name, corpus_all)
    print('Saved corpus to ' + out_name + '.')
    # Return the corpus so callers can use it without reloading from disk.
    return corpus_all

In [5]:
def process_tfidf(corpus, out_name):
    """Compute a dense TF-IDF matrix for ``corpus`` and save it.

    Parameters
    ----------
    corpus : iterable of str
        One whitespace-joined document per entry.
    out_name : str
        Destination handed to ``np.save``.

    Returns
    -------
    numpy.ndarray
        Dense TF-IDF matrix with one row per document and one column per
        vocabulary term that occurs in at least 3 documents (``min_df = 3``).
    """
    cv = CountVectorizer(min_df = 3)
    # Keep the counts sparse; TfidfTransformer accepts sparse input, so only
    # the final result needs to be densified.
    X_all_bow = cv.fit_transform(corpus)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older releases.
    if hasattr(cv, 'get_feature_names_out'):
        vocab = np.asarray(cv.get_feature_names_out())
    else:
        vocab = np.array(cv.get_feature_names())

    transformer = TfidfTransformer()
    X_all_tfidf = transformer.fit_transform(X_all_bow).toarray()

    print(vocab)
    print('vocab.shape:', vocab.shape)
    print(X_all_tfidf[:10])

    np.save(out_name, X_all_tfidf)
    print('Saved TFIDF to ' + out_name + '.')
    # Return the matrix so callers can use it without reloading from disk.
    return X_all_tfidf

In [6]:
# Load the cached TF-IDF matrix, or rebuild it from the raw descriptions on a
# cache miss. Artifacts are written to the same location the cache check
# looks at and then read back from it, so a later run hits the cache.
tfidf_path = 'preprocessing/X_all_tfidf.npy'
if os.path.exists(tfidf_path):
    tfidf_all = np.load(tfidf_path)
    print('Loaded TFIDF from file.')
    print(tfidf_all[:5])
else:
    os.makedirs(os.path.dirname(tfidf_path), exist_ok=True)
    corpus_path = 'preprocessing/corpus_all.npy'
    process_corpus('data/descriptions_train/', 'data/descriptions_test/', corpus_path, False)
    corpus_all = np.load(corpus_path)
    process_tfidf(corpus_all, tfidf_path)
    tfidf_all = np.load(tfidf_path)

# process_corpus stores the 10000 training documents first, then the queries.
tfidf_train = tfidf_all[:10000]
tfidf_test = tfidf_all[10000:]
print(tfidf_train.shape, tfidf_test.shape)

Loaded TFIDF from file.
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(10000, 3311) (2000, 3311)


### SVM, in: description TFIDFs, out: pred tags

In [8]:
# Train one independent binary LinearSVC per tag column on the training TF-IDF
# features and predict that tag for every query description.
predict_tags = []
# Use the label matrix's own width instead of a hard-coded tag count.
for i in range(tags_train_0_1.shape[1]):
    y = tags_train_0_1[:,i]
    # Fixed seed so repeated runs give the same predictions.
    clf = LinearSVC(random_state = 0)
    clf.fit(tfidf_train, y)
    tag = clf.predict(tfidf_test)
    predict_tags.append(tag)

# Stacked predictions are (n_tags, n_queries); transpose to one row per query.
predict_tags_trans = np.asarray(predict_tags).T
print(predict_tags_trans.shape)

(2000, 80)


### KNN, fit with all test tags, in: query tag, out: img id

In [9]:
# Index the test-image tag vectors, then look up the 20 nearest images for
# every predicted tag vector (row indices only, no distances).
near = KNN(n_neighbors = 20)
near.fit(tags_2000_0_1)
preds = near.kneighbors(predict_tags_trans, return_distance = False)
print(preds)
print(preds.shape)

[[1862 1698  359 ...  388  281  465]
 [ 589  833 1341 ...  171  948  763]
 [ 720  598 1107 ... 1464  228 1493]
 ...
 [ 104  199 1481 ...  674   58  429]
 [1660 1135 1535 ...  210  226  243]
 [ 926 1342  729 ...  308  468 1135]]
(2000, 20)


### Format submission

In [10]:
def format_submission(preds, out_name):
    """Write ranked image predictions to a two-column submission CSV.

    Parameters
    ----------
    preds : sequence of sequences
        One entry per query description, in query order; each entry holds the
        ranked image ids for that query.
    out_name : str
        Path of the CSV file to create (overwritten if it exists).

    Row ``i`` of the file pairs ``"<i>.txt"`` with the space-separated list of
    ``"<id>.jpg"`` names built from ``preds[i]``.
    """
    out = [' '.join(str(iid) + '.jpg' for iid in pred) for pred in preds]
    print(out[:10])

    # One description id per prediction row; a fixed count of 2000 would make
    # zip() silently drop any rows beyond it.
    out_files = [str(i) + '.txt' for i in range(len(out))]

    # newline='' is what the csv module requires of the file object; without
    # it, platforms that translate line endings emit blank rows.
    with open(out_name, 'w', newline='') as f:
        writer = csv.writer(f)
        # Header spelling is kept byte-for-byte.
        # NOTE(review): presumably matches the header the grader expects - confirm before correcting.
        writer.writerow(['Descritpion_ID', 'Top_20_Image_IDs'])
        writer.writerows(zip(out_files, out))
    print('Submission:', out_name)
                         

In [12]:
# Write the retrieved image ids for every query to the submission file.
format_submission(preds, out_name='tfidf_svc.csv')

['1862.jpg 1698.jpg 359.jpg 1231.jpg 1131.jpg 1743.jpg 942.jpg 1479.jpg 36.jpg 124.jpg 292.jpg 425.jpg 381.jpg 141.jpg 250.jpg 484.jpg 364.jpg 388.jpg 281.jpg 465.jpg', '589.jpg 833.jpg 1341.jpg 1907.jpg 1156.jpg 1529.jpg 1071.jpg 562.jpg 121.jpg 607.jpg 753.jpg 758.jpg 956.jpg 809.jpg 869.jpg 860.jpg 452.jpg 171.jpg 948.jpg 763.jpg', '720.jpg 598.jpg 1107.jpg 953.jpg 1713.jpg 159.jpg 1955.jpg 545.jpg 1067.jpg 272.jpg 456.jpg 755.jpg 662.jpg 1909.jpg 43.jpg 1651.jpg 1060.jpg 1464.jpg 228.jpg 1493.jpg', '654.jpg 1865.jpg 75.jpg 1513.jpg 1922.jpg 800.jpg 836.jpg 1035.jpg 1486.jpg 397.jpg 799.jpg 90.jpg 686.jpg 930.jpg 897.jpg 1135.jpg 1220.jpg 469.jpg 152.jpg 66.jpg', '292.jpg 1231.jpg 1743.jpg 942.jpg 36.jpg 359.jpg 1131.jpg 311.jpg 95.jpg 152.jpg 381.jpg 388.jpg 141.jpg 760.jpg 250.jpg 1026.jpg 50.jpg 897.jpg 949.jpg 484.jpg', '51.jpg 1322.jpg 1118.jpg 1335.jpg 887.jpg 1045.jpg 771.jpg 1287.jpg 554.jpg 703.jpg 354.jpg 814.jpg 334.jpg 46.jpg 699.jpg 274.jpg 492.jpg 614.jpg 526.jpg 330.j

### Pairwise distance 2000 * 2000

In [11]:
# Euclidean distance between every predicted tag vector (rows) and every
# test-image tag vector (columns); np.save writes the result to dist_svc.npy.
from sklearn.metrics.pairwise import euclidean_distances as ed
# `tags_test` is not defined anywhere in this notebook; the test-image tag
# matrix loaded from the test tag path is `tags_2000_0_1`.
dist_svc = ed(predict_tags_trans, tags_2000_0_1)
print(dist_svc[:5])
print(dist_svc.shape)
np.save('dist_svc', dist_svc)

[[2.64575131 2.         1.73205081 ... 2.23606798 2.         2.82842712]
 [2.44948974 1.73205081 2.         ... 2.         1.73205081 2.23606798]
 [2.64575131 2.         2.23606798 ... 2.23606798 1.41421356 2.82842712]
 [2.44948974 1.73205081 1.41421356 ... 2.         1.73205081 2.64575131]
 [2.44948974 1.73205081 1.41421356 ... 2.         1.73205081 2.64575131]]
(2000, 2000)
