In [1]:
import csv
import math
import string
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from PIL import Image
from IPython.display import display, Markdown

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn import svm
from sklearn import preprocessing
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.neighbors import NearestNeighbors as KNN
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Perceptron

from nltk.stem import PorterStemmer
import nltk
from nltk.corpus import stopwords

import gensim

import torch
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.utils.data as Data
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
from torch.autograd import Variable

import utils.BoW_data as dp

In [2]:
ddir = 'data/'

In [3]:
mlb_tags_train, mlb_tags_test, df = dp.load_dataframe()

In [None]:
# Have to do this because load_dataframe() is messing up resnet_vectors somehow
with open('data/features_train/features_resnet1000_train.csv', 'r') as fp:
    features_train = [x.strip().split(',') for x in fp.readlines()]
    
features_train = {x[0]: np.array([np.float(n) for n in x[1:]]) for x in features_train}

with open('data/features_test/features_resnet1000_test.csv', 'r') as fp:
    features_test = [x.strip().split(',') for x in fp.readlines()]
    
features_test = {x[0]: np.array([np.float(n) for n in x[1:]]) for x in features_test}

In [None]:
# Have to do this because load_dataframe() is messing up resnet_vectors somehow
with open('data/features_train/features_resnet1000intermediate_train.csv', 'r') as fp:
    int_features_train = [x.strip().split(',') for x in fp.readlines()]
    
int_features_train = {x[0]: np.array([np.float(n) for n in x[1:]]) for x in int_features_train}

# Have to do this because load_dataframe() is messing up resnet_vectors somehow
with open('data/features_test/features_resnet1000intermediate_test.csv', 'r') as fp:
    int_features_test = [x.strip().split(',') for x in fp.readlines()]
    
int_features_test = {x[0]: np.array([np.float(n) for n in x[1:]]) for x in int_features_test}

# W2V

In [6]:
df['desc_joined'] = df['descriptions'].apply(lambda x: " ".join(x).split(" "))

In [7]:
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('./models/GoogleNews-vectors-negative300.bin', binary=True)

In [8]:
ps = PorterStemmer()

stop_words = set(stopwords.words('english'))
stop_words.remove('not')

In [9]:
def get_word2vec(df_in, model):
    
    def prep_doc(doc):

        # Stem and lower all the words
        sentence = [
            ps.stem(word).lower() for word in doc
        ]

        # Remove trailing commas and periods
        sentence = [
            "".join([char for char in word if char not in string.punctuation]) for word in sentence
        ]

        # Remove stopwords, remove any blank words 
        sentence = [
            word for word in sentence if word not in stop_words and word not in  [" ", ""]
        ]
        
        return sentence
    
    def try_get_vec(word):
        try:
            word = model.get_vector(word)
            can_do = True
        except:
            word = np.zeros(300)
            can_do = False
        return can_do
    
    def get_vector(word):
        try:
            word = model.get_vector(word)
        except:
            word = np.zeros(300)
        return word
    
    stops = set(stopwords.words('english'))
    vecs = df_in['desc_joined'].apply(
        lambda desc: np.average([
            get_vector(word) for word in prep_doc(desc)
            if try_get_vec(word)
        ], axis=0)
    )
    
    return np.asarray([vec for vec in vecs], dtype=np.float64)

# BERT

In [10]:
# Return train and test dicts for looking up BERT vector from filename

bert_train = torch.load(ddir + 'bert/bert_train_avg_last_4.pt')
bert_test = torch.load(ddir + 'bert/bert_test_avg_last_4.pt')

for root, directory, files in os.walk(os.path.join('data/','descriptions_train')):
    bert_train_filenames = files
    
for root, directory, files in os.walk(os.path.join('data/','descriptions_test')):
    bert_test_filenames = files
    
bert_train_lookup = {bert_train_filenames[ind]: bert_vector.numpy() for ind, bert_vector in enumerate(bert_train)}
bert_test_lookup = {bert_test_filenames[ind]: bert_vector.numpy() for ind, bert_vector in enumerate(bert_test)}

In [10]:
df['text_file'] = df['image_file'].apply(lambda x: x.split('/')[-1].replace('jpg', 'txt'))
df.head()

Unnamed: 0,image_file,resnet_vector,descriptions,tags,tags_values,tags_vec,word_list,word_vector,desc_joined,text_file
0,images_train/5373.jpg,"[-0.8994496464729309, -0.9304700493812561, -2....","[a red train is docked at the station, Several...","[vehicle:train, person:person, indoor:clock, a...","[train, person, clock, handbag]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[red, train, dock, station, sever, peopl, stan...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[a, red, train, is, docked, at, the, station, ...",5373.txt
1,images_train/984.jpg,"[-1.3469539880752563, -3.1194605827331543, -0....",[A man with blue jersey holding a baseball bat...,"[person:person, sports:baseball bat]","[person, baseball bat]","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[A, man, blue, jersey, hold, basebal, bat, clo...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[A, man, with, blue, jersey, holding, a, baseb...",984.txt
2,images_train/7127.jpg,"[-3.44549822807312, -1.5245732069015503, -1.00...",[A kitchen decorated in red and white with acc...,"[appliance:refrigerator, appliance:oven, appli...","[refriger, oven, sink, cup, cake, vase]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[A, kitchen, decor, red, white, accessori, A, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[A, kitchen, decorated, in, red, and, white, w...",7127.txt
3,images_train/9609.jpg,"[1.1146496534347534, -2.1671018600463867, 0.09...",[A black and white dog chasing sheep in a fiel...,"[animal:dog, animal:sheep]","[dog, sheep]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[A, black, white, dog, chase, sheep, field, sm...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[A, black, and, white, dog, chasing, sheep, in...",9609.txt
4,images_train/5293.jpg,"[1.6026496887207031, -1.5058174133300781, 3.02...",[Two bears with their mouths open in the water...,[animal:bear],[bear],"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[two, bear, mouth, open, water, A, coupl, bear...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Two, bears, with, their, mouths, open, in, th...",5293.txt


# Training / test data prep

In [39]:
# X = np.asarray([bert_train_lookup[file] for file in df['text_file']], dtype=np.float64)
X = get_word2vec(df, w2v_model)
# y = df['resnet_vector'].to_numpy()
y = np.asarray([int_features_train[x] for x in df['image_file']], dtype=np.float64)

y_true = df['image_file'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# KNN Classifier

In [None]:
neigh = KNN(n_neighbors=3)
clf.fit(X_train, y_train)

# Random Forest Regressor

In [230]:
clf = RandomForestRegressor(max_depth=2, random_state=0, n_estimators=10)
clf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [231]:
clf.score(X_test, y_test)



0.1361870464321501

# Lasso Regression

In [12]:
clf = Lasso(alpha=0.1)
clf.fit(X_train, y_train)

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [13]:
clf.score(X_test, y_test)



0.006503694181384273

# Linear Regression

In [57]:
clf = LinearRegression()
clf.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [58]:
clf.score(X_test, y_test)



-0.1100622412332056

# SKLearn NN

At first tried with final layer at 512 but it was not converging, probably due to compression of the high dimensionality output vector which is 1000

Using final feature vecs with layers [2048, 1024] => 0.41

Need to try with intermediate feature vector, shouldnt be too hard to pull out

In [40]:
solvers = ['lbfgs', 'sgd', 'adam']

nn = MLPRegressor(
    hidden_layer_sizes=(512, 1024, 2048),
    max_iter=500,
    solver='adam',
    activation='relu',
    verbose=True
)

In [41]:
nn.fit(X_train, y_train)

Iteration 1, loss = 0.10001750
Iteration 2, loss = 0.08059242
Iteration 3, loss = 0.07116178
Iteration 4, loss = 0.06654204
Iteration 5, loss = 0.06339833
Iteration 6, loss = 0.06135377
Iteration 7, loss = 0.05994939
Iteration 8, loss = 0.05865161
Iteration 9, loss = 0.05769213
Iteration 10, loss = 0.05686973
Iteration 11, loss = 0.05606289
Iteration 12, loss = 0.05523872
Iteration 13, loss = 0.05464951
Iteration 14, loss = 0.05391150
Iteration 15, loss = 0.05342735
Iteration 16, loss = 0.05283329
Iteration 17, loss = 0.05230680
Iteration 18, loss = 0.05179332
Iteration 19, loss = 0.05126710
Iteration 20, loss = 0.05073719
Iteration 21, loss = 0.05016288
Iteration 22, loss = 0.04974743
Iteration 23, loss = 0.04919897
Iteration 24, loss = 0.04881318
Iteration 25, loss = 0.04833816
Iteration 26, loss = 0.04768792
Iteration 27, loss = 0.04720919
Iteration 28, loss = 0.04682583
Iteration 29, loss = 0.04646709
Iteration 30, loss = 0.04600533
Iteration 31, loss = 0.04540529
Iteration 32, los

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(512, 1024, 2048), learning_rate='constant',
             learning_rate_init=0.001, max_iter=500, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='adam', tol=0.0001,
             validation_fraction=0.1, verbose=True, warm_start=False)

In [None]:
cvnn = cross_val_score(
    nn,
    X,
    y, 
    cv=3
)

Iteration 1, loss = 0.10213039
Iteration 2, loss = 0.08289681
Iteration 3, loss = 0.07352271
Iteration 4, loss = 0.06841964
Iteration 5, loss = 0.06490570
Iteration 6, loss = 0.06271097
Iteration 7, loss = 0.06090369
Iteration 8, loss = 0.05970405
Iteration 9, loss = 0.05848295
Iteration 10, loss = 0.05753701
Iteration 11, loss = 0.05669773
Iteration 12, loss = 0.05605665
Iteration 13, loss = 0.05534638
Iteration 14, loss = 0.05468103
Iteration 15, loss = 0.05414915
Iteration 16, loss = 0.05361813
Iteration 17, loss = 0.05305640
Iteration 18, loss = 0.05247406
Iteration 19, loss = 0.05196433
Iteration 20, loss = 0.05138178
Iteration 21, loss = 0.05087186
Iteration 22, loss = 0.05031324
Iteration 23, loss = 0.04983056
Iteration 24, loss = 0.04954192
Iteration 25, loss = 0.04884425
Iteration 26, loss = 0.04843693
Iteration 27, loss = 0.04820687
Iteration 28, loss = 0.04747635
Iteration 29, loss = 0.04711037
Iteration 30, loss = 0.04662736
Iteration 31, loss = 0.04619926




Iteration 1, loss = 0.10160625
Iteration 2, loss = 0.08326778
Iteration 3, loss = 0.07366414
Iteration 4, loss = 0.06833266


In [None]:
cvnn

In [42]:
nn.score(X_test, y_test)



0.08872857138188246

12:35 - 0.5231418465059838

Adam  - 0.25864671144762147

# Pytorch NN

In [263]:
class Net(torch.nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.hidden1 = torch.nn.Linear(300, 512)
        self.hidden2 = torch.nn.Linear(512, 1024)
        self.predict = torch.nn.Linear(1024, 1000)

    def forward(self, x):
        x = F.relu(self.hidden1(x))
        x = F.relu(self.hidden2(x))
        x = self.predict(x) 
        return x


net = Net()
print(net)

optimizer = torch.optim.Adam(net.parameters(), lr=0.01)
loss_func = torch.nn.MSELoss()

BATCH_SIZE = 64
EPOCH = 200

torch_dataset = Data.TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))

loader = Data.DataLoader(
    dataset=torch_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=True, num_workers=2,)

Net(
  (hidden1): Linear(in_features=300, out_features=512, bias=True)
  (hidden2): Linear(in_features=512, out_features=1024, bias=True)
  (predict): Linear(in_features=1024, out_features=1000, bias=True)
)


In [266]:
for epoch in range(EPOCH):
    
    for step, (batch_x, batch_y) in enumerate(loader):

        b_x = Variable(batch_x)
        b_y = Variable(batch_y)

        prediction = net(b_x)

        loss = loss_func(prediction, b_y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

RuntimeError: Expected object of scalar type Float but got scalar type Double for argument #2 'mat1' in call to _th_addmm

# Model Validation

In [43]:
model = nn

In [44]:
def score_prediction(true, pred):

    if true not in pred:
        return 0.0
    
    ind = pred.index(true)
    
    return (20 - ind) / 20

In [45]:
def nearest_neighbors(sample, targets):
    """Returns index positions of nearest neighbors"""
    distances = [np.linalg.norm(t-sample) for t in targets]
    distances_sorted = np.argsort(distances)
    return distances_sorted[:20]

In [46]:
predicted_fvs = [model.predict(desc.reshape(1, -1)) for desc in X_test]
pred_indices = [nearest_neighbors(sample, y_test) for sample in predicted_fvs]

preds = [
    [y_true[ind] for ind in pred]
    for pred in pred_indices
]

# score = sum([1 if y_true[ind] in pred else 0 for ind, pred in enumerate(preds)]) / len(preds)

In [47]:
score = sum(
    [score_prediction(y_true[ind], pred) for ind, pred in enumerate(preds)]
           ) / len(preds)

In [48]:
print(score)

0.3934250000000005


12:35 score - 0.37332099999999513

random forest - 0.0018503043118560125

# Making predictions

In [31]:
model = nn

In [32]:
mlb_tags_train2, mlb_tags_test2, df_test = dp.load_dataframe(train_or_test='test')

df_test['desc_joined'] = df_test['descriptions'].apply(lambda x: " ".join(x).split(" "))

In [206]:
df_test.head()

Unnamed: 0,image_file,resnet_vector,descriptions,tags,tags_values,tags_vec,word_list,word_vector,desc_joined
0,images_test/152.jpg,"[-0.148756742477417, -0.4813389182090759, -0.6...","[A sign sitting above a store front entrance.,...",[person:person],[person],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[A, sign, sit, store, front, entranc, small, a...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[A, sign, sitting, above, a, store, front, ent..."
1,images_test/901.jpg,"[-2.7743351459503174, -2.609677791595459, -2.2...",[A steam locamotive passes by some houses on a...,[indoor:clock],[clock],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[A, steam, locamot, pass, hous, track, A, smal...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[A, steam, locamotive, passes, by, some, house..."
2,images_test/1609.jpg,"[-4.7030134201049805, -3.1210086345672607, -0....",[a woman with purple hair is taking a picture ...,"[animal:dog, animal:horse, person:person]","[dog, hors, person]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[woman, purpl, hair, take, pictur, A, hipster,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[a, woman, with, purple, hair, is, taking, a, ..."
3,images_test/501.jpg,"[-4.7203192710876465, -3.283935546875, -3.4253...",[A GREEN AND BROWN SUITCASE LYING UP ON A LEDG...,[furniture:toilet],[toilet],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[A, green, and, brown, suitcas, ly, UP, ON, A,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[A, GREEN, AND, BROWN, SUITCASE, LYING, UP, ON..."
4,images_test/517.jpg,"[-2.5421407222747803, 0.7139833569526672, -4.4...",[A desk with a small white computer set up on ...,"[animal:cat, kitchen:bowl, furniture:bed]","[cat, bowl, bed]","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, ...","[A, desk, small, white, comput, set, A, desk, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[A, desk, with, a, small, white, computer, set..."


In [207]:
test_descriptions = get_word2vec(df_test, w2v_model)

y_resnet = np.asarray([int_features_test[x] for x in df_test['image_file']], dtype=np.float64)
y_targets = df_test['image_file'].to_numpy()

In [208]:
predicted_fvs = [model.predict(desc.reshape(1, -1)) for desc in test_descriptions]
pred_indices = [nearest_neighbors(sample, y_resnet) for sample in predicted_fvs]

preds = [
    [y_targets[ind] for ind in pred]
    for pred in pred_indices
]

preds = [[file.split('/')[-1] for file in pred] for pred in preds]

In [209]:
desc_txt_files = df_test['image_file'].apply(
        lambda x: x.split('/')[-1].replace('jpg', 'txt')
    )

In [210]:
out_preds = list(zip(desc_txt_files.to_numpy(), preds))

In [211]:
with open('sick_nn_preds.csv', 'w+') as f:
    f.write('Descritpion_ID,Top_20_Image_IDs\n')
    for pred in out_preds:
        f.write(f"{pred[0]},{' '.join(pred[1])}\n")