In [4]:
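# Pipeline overview:
# 1. TF-IDF encode each description (nouns only, or all words).
# 2. Fit PLSR from the TF-IDF vectors to image features (noun/all -> fc1000/pool5).
# 3. At query time, predict an image-feature vector for each test description with the PLSR.
# 4. Fit KNN on the ground-truth test image features.
# 5. Retrieve the 20 test images nearest to each predicted feature vector.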

from pprint import pprint
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import pos_tag
from autocorrect import spell
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.neighbors import NearestNeighbors as KNN
import csv
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
import pickle

### Load TFIDF nouns

In [1]:
def _normalize_description(text, tokenizer, lmtzr, stop_words, isNoun):
    """Turn one raw description into a space-joined string of cleaned tokens."""
    tokens = tokenizer.tokenize(text)
    if isNoun:
        # POS tagging is only needed for the noun filter. Only tokens tagged
        # exactly 'NN' are kept.
        words = [word.lower() for word, pos in pos_tag(tokens) if pos == 'NN']
    else:
        words = [spell(token.lower()) for token in tokens]

    cleaned = []
    for word in words:
        # Lemmatize successively as verb, noun, adjective and adverb.
        for wn_pos in ('v', 'n', 'a', 'r'):
            word = lmtzr.lemmatize(word, wn_pos)
        if word not in stop_words:
            cleaned.append(word)
    return ' '.join(cleaned)


def _read_descriptions(dir_path, count, tokenizer, lmtzr, stop_words, isNoun):
    """Read '<dir_path><i>.txt' for i in range(count) and normalize each file."""
    docs = []
    for i in range(count):
        # The context manager closes each file; the directory may hold
        # thousands of them.
        with open(dir_path + str(i) + '.txt', 'r') as desc_file:
            text = ' '.join(desc_file.readlines())
        docs.append(_normalize_description(text, tokenizer, lmtzr, stop_words, isNoun))
    return docs


def process_corpus(train_path, test_path, out_name, isNoun, n_train=10000, n_test=2000):
    """Build the cleaned text corpus for training and query descriptions.

    Parameters
    ----------
    train_path, test_path : str
        Path prefixes; file ``i`` is read from ``<prefix><i>.txt``.
    out_name : str
        Target passed to ``np.save`` for the merged corpus.
    isNoun : bool
        If true, keep only tokens whose POS tag is 'NN'; otherwise keep every
        token after passing it through ``spell``.
    n_train, n_test : int
        Number of training / query description files to read.

    Returns
    -------
    list of str
        Training documents followed by query documents.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    lmtzr = WordNetLemmatizer()
    # Build the stopword lookup once instead of re-reading the list per token.
    stop_words = frozenset(stopwords.words('english'))

    corpus = _read_descriptions(train_path, n_train, tokenizer, lmtzr, stop_words, isNoun)
    print(corpus[:5])
    print('Done processing training descriptions.')

    query_corpus = _read_descriptions(test_path, n_test, tokenizer, lmtzr, stop_words, isNoun)
    print(query_corpus[:5])
    print('Done processing query descriptions.')

    corpus_all = corpus + query_corpus
    print('Merged.')

    np.save(out_name, corpus_all)
    print('Saved corpus to ' + out_name + '.')

    # Callers assign the result of this function, so hand the corpus back.
    return corpus_all

In [2]:
def process_tfidf(corpus, out_name):
    """Vectorize a corpus with TF-IDF, save the dense matrix and return it.

    Parameters
    ----------
    corpus : iterable of str
        One whitespace-joined document per entry.
    out_name : str
        Target passed to ``np.save`` for the dense TF-IDF matrix.

    Returns
    -------
    numpy.ndarray
        Dense TF-IDF matrix with one row per document.
    """
    cv = CountVectorizer(min_df=3)
    # Keep the counts sparse; only the final TF-IDF matrix is densified.
    X_all_bow = cv.fit_transform(corpus)

    # Newer scikit-learn exposes get_feature_names_out; older releases only
    # have get_feature_names. Support both.
    if hasattr(cv, 'get_feature_names_out'):
        vocab = np.asarray(cv.get_feature_names_out())
    else:
        vocab = np.array(cv.get_feature_names())

    X_all_tfidf = TfidfTransformer().fit_transform(X_all_bow).toarray()

    print(vocab)
    print('vocab.shape:', vocab.shape)
    print(X_all_tfidf[:10])

    np.save(out_name, X_all_tfidf)
    print('Saved TFIDF to ' + out_name + '.')

    # Callers assign the result of this function, so hand the matrix back.
    return X_all_tfidf

In [5]:
# Load the cached TF-IDF matrix, or rebuild it (and the cache) from the raw
# description files.
tfidf_path = 'preprocessing/X_nouns_tfidf.npy'
if os.path.exists(tfidf_path):
    tfidf_all = np.load(tfidf_path)
    print('Loaded TFIDF from ' + tfidf_path + '.')
    print(tfidf_all[:5])
else:
    # np.save appends '.npy' itself, so pass the targets without the extension
    # and write them where the cache check above looks on the next run.
    corpus_base = 'preprocessing/corpus_nouns'
    tfidf_base = os.path.splitext(tfidf_path)[0]
    os.makedirs(os.path.dirname(tfidf_path), exist_ok=True)

    # NOTE(review): isNoun is False although every artifact is named "nouns";
    # confirm which variant the cached matrix was built with.
    process_corpus('data/descriptions_train/', 'data/descriptions_test/', corpus_base, False)
    # Both helpers persist their result with np.save; reading the saved files
    # back does not depend on what the helpers return.
    corpus_all = np.load(corpus_base + '.npy')
    process_tfidf(corpus_all, tfidf_base)
    tfidf_all = np.load(tfidf_path)

# The first 10000 rows are training descriptions, the rest are queries.
tfidf_train = tfidf_all[:10000]
tfidf_test = tfidf_all[10000:]
print(tfidf_train.shape, tfidf_test.shape)

Loaded TFIDF from preprocessing/X_nouns_tfidf.npy.
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(10000, 1994) (2000, 1994)


### Load fc1000, PCA on TFIDF

In [6]:
def process_imgf(in_name, out_name):
    """Load per-image feature rows from a CSV and return them ordered by image id.

    Each non-blank line is expected to look like ``<dir>/<id>.<ext>,f1,f2,...``:
    the integer between the first '/' and the following '.' of the first field
    is the image id, and the remaining fields are parsed as floats.

    Parameters
    ----------
    in_name : str
        Path of the feature CSV.
    out_name : str
        Target passed to ``np.save`` for the sorted feature matrix.

    Returns
    -------
    numpy.ndarray
        Feature rows stacked in ascending image-id order.
    """
    imgf = {}
    # Stream the file and close it deterministically.
    with open(in_name, 'r') as csvfile:
        for line in csvfile:
            # Skip blank lines (e.g. a trailing newline at end of file), which
            # would otherwise fail the id parsing below.
            if not line.strip():
                continue
            fields = line.split(",")
            iid = int(fields[0].split("/")[1].split(".")[0])
            imgf[iid] = np.asarray([float(s) for s in fields[1:]])

    sorted_imgf = np.asarray([imgf[key] for key in sorted(imgf)])
    np.save(out_name, sorted_imgf)
    print('Sorted ' + in_name + ' saved to ' + out_name + '.')

    return sorted_imgf

In [16]:
# Load cached, id-sorted image features for train and test, rebuilding the
# cache from the raw CSVs when it is missing.
# NOTE(review): the fallback CSVs are named "...intermediate..." and the
# fallback output was previously named "sorted_pool5", while everything else
# in this cell says fc1000. Confirm these CSVs really hold the fc1000 features.
train_imf_path = 'preprocessing/sorted_fc1000.npy'
if os.path.exists(train_imf_path):
    sorted_fc1000 = np.load(train_imf_path)
    print('Loaded image feature from ' + train_imf_path + '.')
else:
    os.makedirs(os.path.dirname(train_imf_path), exist_ok=True)
    # np.save appends '.npy'; save to the same location the cache check reads.
    sorted_fc1000 = process_imgf(
        'data/features_train/features_resnet1000intermediate_train.csv',
        os.path.splitext(train_imf_path)[0],
    )

test_imgf_path = 'preprocessing/sorted_fc1000_test.npy'
if os.path.exists(test_imgf_path):
    sorted_fc1000_test = np.load(test_imgf_path)
    print('Loaded image feature from ' + test_imgf_path + '.')
else:
    os.makedirs(os.path.dirname(test_imgf_path), exist_ok=True)
    sorted_fc1000_test = process_imgf(
        'data/features_test/features_resnet1000intermediate_test.csv',
        os.path.splitext(test_imgf_path)[0],
    )

Loaded image feature from preprocessing/sorted_fc1000.npy.
Loaded image feature from preprocessing/sorted_fc1000_test.npy.


In [11]:
# Reduce the TF-IDF vectors to 1000 principal components. The PCA is fit on
# the training rows only and then applied to both splits.
# svd_solver='auto' may select a randomized solver, so pin random_state to get
# the same projection on every run.
# NOTE(review): the pickled PLSR loaded later consumes these projections; to
# guarantee they match what it was trained on, persist the fitted PCA
# alongside the PLSR model.
pca_noun_fc1000 = PCA(n_components=1000, svd_solver='auto', random_state=0)
pca_noun_fc1000.fit(tfidf_train)
tfidf_noun_train_PCA = pca_noun_fc1000.transform(tfidf_train)
tfidf_noun_test_PCA = pca_noun_fc1000.transform(tfidf_test)

print(tfidf_noun_train_PCA.shape)

(10000, 1000)


### Load PLSR 400

In [12]:
def fit_plsr(train, preds, n_components, max_iter, out_name):
    """Fit a PLS regression from `train` to `preds`, pickle it and return it.

    Parameters
    ----------
    train : array-like
        Input features (X) for the regression.
    preds : array-like
        Regression targets (Y).
    n_components : int
        Number of PLS components.
    max_iter : int
        Iteration cap forwarded to ``PLSRegression``.
    out_name : str
        Path of the pickle file the fitted model is written to.

    Returns
    -------
    PLSRegression
        The fitted model.
    """
    plsr = PLSRegression(n_components=n_components, max_iter=max_iter)
    # Fit against the `preds` argument; the previous code referenced an
    # undefined name `pred` here.
    plsr.fit(train, preds)
    print('Done fitting PLSR.')
    # Close the pickle file deterministically so the dump is fully flushed.
    with open(out_name, 'wb') as model_file:
        pickle.dump(plsr, model_file)
    print('Saved PLSR ' + str(n_components) + ' to ' + out_name + '.')
    return plsr

In [18]:
# Load the previously fitted PLSR model (TF-IDF PCA space -> image features).
# SECURITY: pickle.load executes arbitrary code from the file; only load model
# files you produced yourself.
plsr_path = 'models/pls_noun_fc1000_400c.sav'
if os.path.exists(plsr_path):
    with open(plsr_path, 'rb') as f:
        pls_noun_fc1000 = pickle.load(f)
    print('Loaded PLSR from ' + plsr_path + '.')
else:
    # Fail here with a clear message instead of leaving pls_noun_fc1000
    # undefined and hitting a NameError in a later cell.
    raise FileNotFoundError(
        'PLSR model not found at ' + plsr_path
        + '; fit and save it first (e.g. with fit_plsr).'
    )

Loaded PLSR from models/pls_noun_fc1000_400c.sav.


### KNN 20

In [19]:
# Index the test image features, then look up the 20 closest images (cosine
# distance) to the image-feature vector the PLSR predicts for each query.
near = KNN(n_neighbors=20, metric='cosine')
near.fit(sorted_fc1000_test)

predicted_imgf = pls_noun_fc1000.predict(tfidf_noun_test_PCA)
preds = near.kneighbors(predicted_imgf, return_distance=False)

print(preds[:10])
print(preds.shape)

[[1161  922 1629 1273 1011 1969  619 1510 1480  112 1028 1276  700 1284
  1535   95 1045 1069 1868 1181]
 [ 331 1559 1733  179 1742  428  144 1824   80 1871 1029 1601  571  529
  1806  850 1199 1714  885 1006]
 [1183  634  838  579  159  953 1471  511  445 1724 1144  928  322  372
  1084 1574  696 1716 1292  600]
 [ 825  770 1052  249 1215 1097 1594 1700  829 1429 1318 1423  358 1207
   222  236 1396 1620 1765 1145]
 [1011  985  597 1262  849 1161 1069  619   46 1969 1510 1298 1384 1273
   922  604  231  360 1452  132]
 [1335   51 1630  630  846 1700 1118 1088 1693  728  249 1429 1533  600
  1292 1943  771 1889 1355 1913]
 [1660  429 1265 1874 1216 1855 1275 1790 1472  458 1693  771 1630  814
   262 1943  623 1701 1584 1815]
 [ 975 1900 1857 1961  685 1249 1949 1270  689 1488 1085  448 1694 1285
   987  409 1363  168  481  932]
 [1283  781 1927  537 1048 1004 1031  923  826 1197  419  403  727 1834
   208 1348 1064    3  245 1896]
 [1700  940 1426 1429  845 1292  351 1479 1818 1355 163

### Format submission

In [20]:
def format_submission(preds, out_name):
    """Write ranked image ids to a two-column submission CSV.

    Parameters
    ----------
    preds : iterable of iterables
        One sequence of image ids per query, in rank order. Row ``i`` is
        labelled ``'<i>.txt'``.
    out_name : str
        Path of the CSV file to write.
    """
    # One space-separated string of '<id>.jpg' names per query.
    out = [' '.join(str(iid) + '.jpg' for iid in pred) for pred in preds]
    print(out[:10])

    # Label rows by position for however many queries were passed in; a fixed
    # count would silently drop rows beyond it when zipped below.
    out_files = [str(i) + '.txt' for i in range(len(out))]

    # newline='' is what the csv module requires for files handed to a writer;
    # without it some platforms emit an extra blank line after each row.
    with open(out_name, 'w', newline='') as f:
        writer = csv.writer(f)
        # Header spelling is kept byte-for-byte: whatever consumes this file
        # may match on the exact column names.
        writer.writerow(['Descritpion_ID', 'Top_20_Image_IDs'])
        writer.writerows(zip(out_files, out))
    print('Submission:', out_name)
                         

In [21]:
# Write the top-20 retrieval results for every query to the submission CSV.
format_submission(preds=preds, out_name='pls_noun_fc1000_400c.csv')

['1161.jpg 922.jpg 1629.jpg 1273.jpg 1011.jpg 1969.jpg 619.jpg 1510.jpg 1480.jpg 112.jpg 1028.jpg 1276.jpg 700.jpg 1284.jpg 1535.jpg 95.jpg 1045.jpg 1069.jpg 1868.jpg 1181.jpg', '331.jpg 1559.jpg 1733.jpg 179.jpg 1742.jpg 428.jpg 144.jpg 1824.jpg 80.jpg 1871.jpg 1029.jpg 1601.jpg 571.jpg 529.jpg 1806.jpg 850.jpg 1199.jpg 1714.jpg 885.jpg 1006.jpg', '1183.jpg 634.jpg 838.jpg 579.jpg 159.jpg 953.jpg 1471.jpg 511.jpg 445.jpg 1724.jpg 1144.jpg 928.jpg 322.jpg 372.jpg 1084.jpg 1574.jpg 696.jpg 1716.jpg 1292.jpg 600.jpg', '825.jpg 770.jpg 1052.jpg 249.jpg 1215.jpg 1097.jpg 1594.jpg 1700.jpg 829.jpg 1429.jpg 1318.jpg 1423.jpg 358.jpg 1207.jpg 222.jpg 236.jpg 1396.jpg 1620.jpg 1765.jpg 1145.jpg', '1011.jpg 985.jpg 597.jpg 1262.jpg 849.jpg 1161.jpg 1069.jpg 619.jpg 46.jpg 1969.jpg 1510.jpg 1298.jpg 1384.jpg 1273.jpg 922.jpg 604.jpg 231.jpg 360.jpg 1452.jpg 132.jpg', '1335.jpg 51.jpg 1630.jpg 630.jpg 846.jpg 1700.jpg 1118.jpg 1088.jpg 1693.jpg 728.jpg 249.jpg 1429.jpg 1533.jpg 600.jpg 1292.jpg 1

### Pairwise distance 2000 * 2000

In [11]:
from sklearn.metrics.pairwise import euclidean_distances as ed

# Euclidean distance from every predicted query feature vector to every test
# image feature vector, saved for later use.
query_imgf_pred = pls_noun_fc1000.predict(tfidf_noun_test_PCA)
dist_fc = ed(query_imgf_pred, sorted_fc1000_test)

print(dist_fc[:5])
print(dist_fc.shape)
np.save('dist_fc', dist_fc)

[[ 55.78720095  79.36845265  49.34022448 ...  85.9364658   75.19343979
   93.23058402]
 [ 74.2828085   85.92724626  71.88138139 ... 100.69292505  88.54927705
   50.86052267]
 [ 72.48326068  79.35288668  81.50713678 ...  64.38733666  50.45962306
  115.50852714]
 [ 66.90411366  60.35452377  66.04305061 ...  63.36043533  41.21986485
   94.82950316]
 [ 53.39099784  62.9862572   48.22853323 ...  69.91044423  55.18732816
   83.12352766]]
(2000, 2000)
