In [22]:
from pprint import pprint
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import pos_tag
from autocorrect import spell
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.neighbors import NearestNeighbors as KNN
import csv
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
import pickle
from sklearn.metrics.pairwise import euclidean_distances as ed

### Load TF-IDF features for all descriptions (train + test)

In [13]:
def _clean_description(text, isNoun, tokenizer, lmtzr, stop_words):
    """Turn one raw description into a space-joined string of normalized words."""
    tokens = tokenizer.tokenize(text)

    if isNoun:
        # POS tagging is only needed when filtering on the 'NN' tag.
        words = [word.lower() for word, pos in pos_tag(tokens) if pos == 'NN']
    else:
        words = [spell(token.lower()) for token in tokens]

    # Lemmatize under each WordNet POS in turn: verb, noun, adjective, adverb.
    for wordnet_pos in ('v', 'n', 'a', 'r'):
        words = [lmtzr.lemmatize(word, wordnet_pos) for word in words]

    return ' '.join(word for word in words if word not in stop_words)


def _load_descriptions(dir_path, count, isNoun, tokenizer, lmtzr, stop_words):
    """Read `<dir_path>0.txt` .. `<dir_path>{count-1}.txt` and clean each one."""
    docs = []
    for i in range(count):
        # `with` guarantees the handle is closed even if processing fails.
        with open(dir_path + str(i) + '.txt', 'r') as desc_file:
            text = ' '.join(desc_file.readlines())
        docs.append(_clean_description(text, isNoun, tokenizer, lmtzr, stop_words))
    return docs


def process_corpus(train_path, test_path, out_name, isNoun, n_train=10000, n_test=2000):
    """Preprocess the training and query descriptions into one merged corpus.

    Each description file is tokenized on `\\w+`, lower-cased, lemmatized and
    stripped of English stopwords. The training documents come first in the
    result, followed by the query documents.

    Parameters
    ----------
    train_path : str
        Path prefix of the training description files (`<prefix><i>.txt`).
    test_path : str
        Path prefix of the query description files (`<prefix><i>.txt`).
    out_name : str
        Target passed to `np.save` for the merged corpus.
    isNoun : bool
        If true, keep only tokens tagged 'NN'; otherwise keep every token
        after passing it through `spell`.
    n_train : int, optional
        Number of training files to read (default 10000).
    n_test : int, optional
        Number of query files to read (default 2000).

    Returns
    -------
    list of str
        The merged corpus (training documents followed by query documents).
    """
    tokenizer = RegexpTokenizer(r'\w+')
    lmtzr = WordNetLemmatizer()
    # Build the stopword set once instead of re-reading the list per token.
    stop_words = set(stopwords.words('english'))

    corpus = _load_descriptions(train_path, n_train, isNoun, tokenizer, lmtzr, stop_words)
    print(corpus[:5])
    print('Done processing training descriptions.')

    query_corpus = _load_descriptions(test_path, n_test, isNoun, tokenizer, lmtzr, stop_words)
    print(query_corpus[:5])
    print('Done processing query descriptions.')

    corpus_all = corpus + query_corpus
    print('Merged.')

    np.save(out_name, corpus_all)
    print('Saved corpus to ' + out_name + '.')

    # Return the corpus so callers can use it without reloading from disk.
    return corpus_all

In [14]:
def process_tfidf(corpus, out_name):
    """Compute a dense TF-IDF matrix for `corpus`, save it, and return it.

    Parameters
    ----------
    corpus : iterable of str
        One preprocessed document per entry.
    out_name : str
        Target passed to `np.save` for the TF-IDF matrix.

    Returns
    -------
    numpy.ndarray
        Dense TF-IDF matrix with one row per document and one column per
        vocabulary term (terms must appear in at least 3 documents).
    """
    cv = CountVectorizer(min_df = 3)
    # Keep the counts sparse; TfidfTransformer accepts sparse input directly.
    X_all_bow = cv.fit_transform(corpus)

    # `get_feature_names` was removed in newer scikit-learn releases in
    # favour of `get_feature_names_out`; support both.
    if hasattr(cv, 'get_feature_names_out'):
        vocab = np.asarray(cv.get_feature_names_out())
    else:
        vocab = np.array(cv.get_feature_names())

    transformer = TfidfTransformer()
    X_all_tfidf = transformer.fit_transform(X_all_bow).toarray()

    print(vocab)
    print('vocab.shape:', vocab.shape)
    print(X_all_tfidf[:10])

    np.save(out_name, np.asarray(X_all_tfidf))
    print('Saved TFIDF to ' + out_name + '.')

    # Return the matrix so callers can use it without reloading from disk.
    return X_all_tfidf

In [23]:
# Load the cached TF-IDF matrix if present; otherwise rebuild it and cache it
# at the same path that is checked here, so the next run hits the cache.
tfidf_path = 'preprocessing/X_all_tfidf.npy'
corpus_path = 'preprocessing/corpus_all.npy'
if os.path.exists(tfidf_path):
    tfidf_all = np.load(tfidf_path)
    print('Loaded TFIDF from file.')
    print(tfidf_all[:5])
else:
    os.makedirs(os.path.dirname(tfidf_path), exist_ok=True)
    # Reload each artefact from the file just written rather than relying on
    # the helpers' return values.
    process_corpus('data/descriptions_train/', 'data/descriptions_test/', corpus_path, False)
    corpus_all = np.load(corpus_path)
    process_tfidf(corpus_all, tfidf_path)
    tfidf_all = np.load(tfidf_path)

# The first 10000 rows are the training descriptions, the rest the queries.
tfidf_train = tfidf_all[:10000]
tfidf_test = tfidf_all[10000:]
print(tfidf_train.shape, tfidf_test.shape)

Loaded TFIDF from file.
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(10000, 3311) (2000, 3311)


### Load pool5

In [8]:
def process_imgf(in_name, out_name):
    """Load per-image feature rows from a CSV, order them by image id, and save.

    Each non-blank line has the form `<dir>/<id>.<ext>,<f1>,<f2>,...`; the
    integer `<id>` is taken from the second '/'-separated component of the
    first field.

    Parameters
    ----------
    in_name : str
        Path of the feature CSV to read.
    out_name : str
        Target passed to `np.save` for the sorted feature matrix.

    Returns
    -------
    numpy.ndarray
        Feature rows stacked in ascending image-id order.
    """
    imgf = {}
    # `with` closes the file even if a row fails to parse.
    with open(in_name, 'r') as csvfile:
        for line in csvfile:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            fields = line.split(",")
            iid = int(fields[0].split("/")[1].split(".")[0])
            imgf[iid] = np.asarray([float(s) for s in fields[1:]])

    sorted_imgf = np.asarray([imgf[key] for key in sorted(imgf.keys())])
    np.save(out_name, sorted_imgf)
    print('Sorted ' + in_name + ' saved to ' + out_name + '.')

    return sorted_imgf

In [12]:
# Load the cached, id-sorted pool5 features if present; otherwise build them
# from the raw CSVs and cache them at the same paths that are checked here.
pool5_path = 'preprocessing/sorted_pool5.npy'
pool5_test_path = 'preprocessing/sorted_pool5_test.npy'
os.makedirs('preprocessing', exist_ok=True)

if os.path.exists(pool5_path):
    sorted_pool5 = np.load(pool5_path)
    print('Loaded sorted_pool5 from file.')
else:
    sorted_pool5 = process_imgf('data/features_train/features_resnet1000intermediate_train.csv', pool5_path)

if os.path.exists(pool5_test_path):
    sorted_pool5_test = np.load(pool5_test_path)
    print('Loaded sorted_pool5_test from file.')
else:
    sorted_pool5_test = process_imgf('data/features_test/features_resnet1000intermediate_test.csv', pool5_test_path)

Loaded sorted_pool5 from file.
Loaded sorted_pool5_test from file.


### Load PLSR 2048

In [16]:
def fit_plsr(train, preds, n_components, max_iter, out_name):
    """Fit a PLS regression from `train` to `preds`, pickle it, and return it.

    Parameters
    ----------
    train : array-like
        Input matrix passed as `X` to `PLSRegression.fit`.
    preds : array-like
        Target matrix passed as `Y` to `PLSRegression.fit`.
    n_components : int
        Number of PLS components.
    max_iter : int
        Iteration limit handed to `PLSRegression`.
    out_name : str
        Path of the pickle file the fitted model is written to.

    Returns
    -------
    PLSRegression
        The fitted model.
    """
    plsr = PLSRegression(n_components=n_components, max_iter=max_iter)
    # Fit against the `preds` argument (the previous code referenced an
    # undefined name `pred`).
    plsr.fit(train, preds)
    print('Done fitting PLSR.')
    # `with` closes the pickle file so it is fully flushed to disk.
    with open(out_name, 'wb') as f:
        pickle.dump(plsr, f)
    print('Saved PLSR ' + str(n_components) + ' to ' + out_name + '.')
    return plsr


In [17]:
# Load the previously fitted PLSR model.
# NOTE: pickle.load executes arbitrary code from the file; only load model
# files you created yourself.
plsr_path = 'models/pls_all_pool5_2048c.sav'
if os.path.exists(plsr_path):
    with open(plsr_path, 'rb') as f:
        pls_all_pool5 = pickle.load(f)
        print('Loaded PLSR from ' + plsr_path + '.')
else:
    # Fail here with a clear message instead of a NameError in a later cell.
    raise FileNotFoundError(
        'PLSR model not found at ' + plsr_path + '; fit and save it with fit_plsr first.'
    )

Loaded PLSR from models/pls_all_pool5_2048c.sav.


### KNN: 20 nearest test images per description

In [18]:
# Index the test image features, then look up the 20 nearest images (cosine
# metric) for each PLSR-projected test description.
near = KNN(metric='cosine', n_neighbors=20)
near.fit(sorted_pool5_test)
preds = near.kneighbors(
    pls_all_pool5.predict(tfidf_test),
    return_distance=False,
)
print(preds[:10], preds.shape, sep='\n')

[[1161 1380  329  714    3  760  290  153  105 1011  985  540  931  231
    46 1479 1982  683 1107 1367]
 [1989 1714 1907  355  674  331 1871  820 1615 1134 1559 1742  997  107
  1199   57  428  482 1068  833]
 [ 634  838 1183 1292 1724 1471  159   42  904  511  781  445  598  600
  1896  953  696 1866  979  249]
 [ 770 1514 1620  829  469  799 1513   75 1315 1145 1207 1486 1609  451
  1396 1035 1727   26 1404 1414]
 [ 808 1384  112  608 1161  849  150 1629 1606  216 1273 1597  956 1603
  1446 1015  206 1913   21   46]
 [1335  771 1693 1145 1292 1837 1943  814   51  249 1913 1630  846 1088
    77  330  492  829 1429 1700]
 [1855 1701 1660 1275 1216  414 1472  583  429 1151  655 1311  262 1289
   718  217 1815 1377  996 1458]
 [ 975 1857  689 1961  685 1488 1249 1949 1694 1085 1302 1270  448  481
   987  168 1328 1239 1285  528]
 [1446  537  781  923  775  597 1700 1004 1927 1283  361 1779 1228  659
   719  132    3 1261 1292  103]
 [1913 1837 1145 1446 1700 1429 1943  845   77 1292 133

### Format submission

In [20]:
def format_submission(preds, out_name):
    """Write ranked image predictions to a two-column submission CSV.

    Row `i` pairs the description file name `<i>.txt` with the space-joined
    image file names `<iid>.jpg` built from `preds[i]`.

    Parameters
    ----------
    preds : iterable of iterables
        For each description, the predicted image ids in ranked order.
    out_name : str
        Path of the CSV file to write.
    """
    out = [' '.join(str(iid) + '.jpg' for iid in pred) for pred in preds]
    print(out[:10])

    # One row per prediction instead of a hard-coded 2000, so longer inputs
    # are not silently truncated.
    out_files = [str(i) + '.txt' for i in range(len(out))]

    # newline='' is what the csv module requires for files given to csv.writer;
    # without it, platforms that translate newlines emit blank rows.
    with open(out_name, 'w', newline='') as f:
        writer = csv.writer(f)
        # NOTE(review): 'Descritpion_ID' looks misspelled but is kept as-is in
        # case the submission format expects exactly this name; confirm first.
        writer.writerow(['Descritpion_ID', 'Top_20_Image_IDs'])
        writer.writerows(zip(out_files, out))
    print('Submission:', out_name)
                         

In [21]:
# Write the KNN predictions out as the submission CSV.
format_submission(preds=preds, out_name='pls_all_pool5_2048c.csv')

['1161.jpg 1380.jpg 329.jpg 714.jpg 3.jpg 760.jpg 290.jpg 153.jpg 105.jpg 1011.jpg 985.jpg 540.jpg 931.jpg 231.jpg 46.jpg 1479.jpg 1982.jpg 683.jpg 1107.jpg 1367.jpg', '1989.jpg 1714.jpg 1907.jpg 355.jpg 674.jpg 331.jpg 1871.jpg 820.jpg 1615.jpg 1134.jpg 1559.jpg 1742.jpg 997.jpg 107.jpg 1199.jpg 57.jpg 428.jpg 482.jpg 1068.jpg 833.jpg', '634.jpg 838.jpg 1183.jpg 1292.jpg 1724.jpg 1471.jpg 159.jpg 42.jpg 904.jpg 511.jpg 781.jpg 445.jpg 598.jpg 600.jpg 1896.jpg 953.jpg 696.jpg 1866.jpg 979.jpg 249.jpg', '770.jpg 1514.jpg 1620.jpg 829.jpg 469.jpg 799.jpg 1513.jpg 75.jpg 1315.jpg 1145.jpg 1207.jpg 1486.jpg 1609.jpg 451.jpg 1396.jpg 1035.jpg 1727.jpg 26.jpg 1404.jpg 1414.jpg', '808.jpg 1384.jpg 112.jpg 608.jpg 1161.jpg 849.jpg 150.jpg 1629.jpg 1606.jpg 216.jpg 1273.jpg 1597.jpg 956.jpg 1603.jpg 1446.jpg 1015.jpg 206.jpg 1913.jpg 21.jpg 46.jpg', '1335.jpg 771.jpg 1693.jpg 1145.jpg 1292.jpg 1837.jpg 1943.jpg 814.jpg 51.jpg 249.jpg 1913.jpg 1630.jpg 846.jpg 1088.jpg 77.jpg 330.jpg 492.jpg 829

### Pairwise Euclidean distances (2000 × 2000)

In [9]:
# Euclidean distance from every PLSR-projected test description (rows) to
# every test image feature vector (columns); the matrix is saved for reuse.
dist_pool = ed(
    X=pls_all_pool5.predict(tfidf_test),
    Y=sorted_pool5_test,
)
print(dist_pool[:5], dist_pool.shape, sep='\n')
np.save('dist_pool', dist_pool)

[[19.64242728 22.74472159 18.80300624 ... 21.92569311 21.69261115
  25.36181337]
 [25.86516698 27.40234023 27.99033553 ... 29.47187955 28.39413123
  23.46640576]
 [22.62722082 22.1714028  23.53803141 ... 21.78323414 18.3948048
  28.21897581]
 [22.53819982 18.96064358 22.29525381 ... 21.57259255 16.73993491
  26.92714481]
 [23.16323099 24.0392422  21.80058644 ... 24.55130498 23.17148436
  26.97140383]]
(2000, 2000)
