In [None]:
!ls data/features_train

In [291]:
import os

import numpy as np
import pandas as pd

import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score as cv
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression as Logit
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

from nltk.corpus import stopwords

import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import utils.data_parsers as dp

import random

In [2]:
df = dp.load_dataframe()

In [208]:
with open('data/features_train/features_resnet1000_train.csv', 'r') as fp:
    features = [x.strip().split(',') for x in fp.readlines()]
    
features = {x[0]: np.array([float(n) for n in x[1:]]) for x in features}

In [13]:
df['desc_joined'] = df['descriptions'].apply(lambda x: " ".join(x).split(" "))
df['desc_joined'] = df['desc_joined'].apply(lambda x: [word for word in x if word not in df.columns])

In [238]:
test_df = dp.load_dataframe().drop(columns=['descriptions'])

with open('data/features_test/features_resnet1000_test.csv', 'r') as fp:
    test_features = [x.strip().split(',') for x in fp.readlines()]
    
test_features = {x[0]: np.array([float(n) for n in x[1:]]) for x in test_features}

### Applying naive bayes to BOW representation of description

In [14]:
mlb = MultiLabelBinarizer()
one_hot_encodings = mlb.fit_transform(df['desc_joined'])
one_hot_columns = mlb.classes_

train_one_hot = df.join(
    pd.DataFrame(
        one_hot_encodings,
        columns=one_hot_columns,
        index=df.index
    ))

train_one_hot = train_one_hot

train_one_hot.head()

Unnamed: 0,image_file,resnet_vector,descriptions,tags,desc_joined,Unnamed: 6,"""","""1802"".","""3-way""","""30",...,zoo,"zoo,",zoo.,zoo..,zookeeper,zoom,zucchini,"zucchini,",zucchini.,"zucchinis,"
0,images_train/5373.jpg,"[-0.8994496464729309, -0.9304700493812561, -2....","[a red train is docked at the station, Several...","[vehicle:train, person:person, indoor:clock, a...","[a, red, train, is, docked, at, the, station, ...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,images_train/984.jpg,"[-1.3469539880752563, -3.1194605827331543, -0....",[A man with blue jersey holding a baseball bat...,"[person:person, sports:baseball bat]","[A, man, with, blue, jersey, holding, a, baseb...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,images_train/7127.jpg,"[-3.44549822807312, -1.5245732069015503, -1.00...",[A kitchen decorated in red and white with acc...,"[appliance:refrigerator, appliance:oven, appli...","[A, kitchen, decorated, in, red, and, white, w...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,images_train/9609.jpg,"[1.1146496534347534, -2.1671018600463867, 0.09...",[A black and white dog chasing sheep in a fiel...,"[animal:dog, animal:sheep]","[A, black, and, white, dog, chasing, sheep, in...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,images_train/5293.jpg,"[1.6026496887207031, -1.5058174133300781, 3.02...",[Two bears with their mouths open in the water...,[animal:bear],"[Two, bears, with, their, mouths, open, in, th...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Word2Vec Model Building

In [239]:
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('./models/GoogleNews-vectors-negative300.bin', binary=True)

In [283]:
def get_word2vec(df, model):
    
    def get_vector(word):
        try:
            word = model.get_vector(word)
        except:
            word = np.zeros(300)
        return word
    
    stops = set(stopwords.words('english'))
    vecs = df['descriptions'].apply(
        lambda x: np.average([
            get_vector(word) for doc in x for word in doc if word not in stops and word != ' '
        ], axis=0)
    )
    
    return vecs

### Applying and testing word2vec

In [284]:
# X = get_word2vec(df, w2v_model).to_numpy()
X = np.array([x for x in get_word2vec(df, w2v_model).to_numpy()])
## below code commented as for some reason feature vectors broken in df, not sure why
# Y = df['resnet_vector']
Y = np.array([features[x] for x in df['image_file']])

X_train, X_test, y_train, y_test = train_test_split(X, Y)

In [285]:
X_train

array([[-0.11532777,  0.11567486, -0.01257765, ..., -0.0095406 ,
        -0.15625956,  0.14615052],
       [-0.19988748,  0.11520342,  0.00887221, ..., -0.04419034,
        -0.14595894,  0.1737149 ],
       [-0.15152457,  0.13544861, -0.01098431, ..., -0.02271819,
        -0.15403054,  0.17622918],
       ...,
       [-0.1399013 ,  0.09768375, -0.0151166 , ..., -0.03291606,
        -0.13908235,  0.14061293],
       [-0.16748275,  0.08394502,  0.01038967, ..., -0.02948854,
        -0.16070899,  0.16407163],
       [-0.12525883,  0.07752539, -0.00024527, ..., -0.02982698,
        -0.1460543 ,  0.15044714]])

### LinearRegression

In [286]:
linr = LinearRegression()
linr.fit(X_train, y_train)
linr.score(X_test, y_test)



-29642800.424634855

In [287]:
cv(linr, X, Y)



array([-4.50647450e+07, -2.31554854e+07, -2.92848138e+17])

### RFRegression

In [None]:
fregr = RandomForestRegressor(max_depth=10, random_state=0, n_estimators=100)
fregr.fit(X_train, y_train)
fregr.score(X_train, y_train)

# Doc2Vec Model Building

In [38]:
descriptions_flattened = [doc.split(" ") for i, docs in enumerate(df['descriptions']) for doc in docs]

In [72]:
train_desc, test_desc = train_test_split(descriptions_flattened)

train_docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(train_desc)]
test_docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(test_desc)]

d2v_model = Doc2Vec(vector_size=300, min_count=1, epochs=40)

d2v_model.build_vocab(train_docs)

d2v_model.train(train_docs, total_examples=d2v_model.corpus_count)

In [86]:
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_docs) - 1)
inferred_vector = d2v_model.infer_vector(test_docs[doc_id].words)
sims = d2v_model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_docs[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % d2v_model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_docs[sims[index][0]].words)))

Test Document (8715): «a group of elephants are gathered together outside»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d300,n5,w5,s0.001,t3):

MOST (18590, 0.8614320755004883): «A herd of cattle grazing outside a barn.»

MEDIAN (13897, 0.264367938041687): «many different animals in a large field of green grass»

LEAST (37330, -0.36507540941238403): «A white clock tower at the top of a tiled building.»



In [92]:
inferred_vector = model.infer_vector(['a', 'child', 'holding', 'an', 'umbrella'])
train_docs[model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))[0][0]]

TaggedDocument(words=['A', 'child', 'looks', 'at', 'a', 'giant', 'birthday', 'cake'], tags=[20920])

### Applying and testing doc2vec

In [None]:
X = np.array([np.sum([d2v_model.infer_vector(doc.split(" ")) for doc in docs], axis=0) for docs in df['descriptions']])
## below code commented as for some reason feature vectors broken in df, not sure why
# Y = df['resnet_vector']
Y = np.array([features[x] for x in df['image_file']])

X_train, X_test, y_train, y_test = train_test_split(X, Y)

### Linear Regression

In [233]:
linr = LinearRegression()
linr.fit(X_train, y_train)
linr.score(X_test, y_test)



0.3396646796641405

In [230]:
cv(linr, X, Y)



array([0.33346605, 0.31798571, 0.32178002])

### Random Forest Regression

In [402]:
fregr = RandomForestRegressor(max_depth=10, random_state=0, n_estimators=100)
fregr.fit(X_train, y_train)
fregr.score(X_test, y_test)

KeyboardInterrupt: 

In [231]:
# cv(fregr, X, Y)



KeyboardInterrupt: 

# Testing model performance against training data

In [330]:
def nearest_neighbor(sample, targets):
    """Returns index position of nearest neighbor"""
    distances = [np.linalg.norm(t-sample) for t in targets]
    return np.argmin(distances)

In [309]:
df.head()

Unnamed: 0,image_file,resnet_vector,descriptions,tags,desc_joined
0,images_train/5373.jpg,"[-0.8994496464729309, -0.9304700493812561, -2....","[a red train is docked at the station, Several...","[vehicle:train, person:person, indoor:clock, a...","[a, red, train, is, docked, at, the, station, ..."
1,images_train/984.jpg,"[-1.3469539880752563, -3.1194605827331543, -0....",[A man with blue jersey holding a baseball bat...,"[person:person, sports:baseball bat]","[A, man, with, blue, jersey, holding, a, baseb..."
2,images_train/7127.jpg,"[-3.44549822807312, -1.5245732069015503, -1.00...",[A kitchen decorated in red and white with acc...,"[appliance:refrigerator, appliance:oven, appli...","[A, kitchen, decorated, in, red, and, white, w..."
3,images_train/9609.jpg,"[1.1146496534347534, -2.1671018600463867, 0.09...",[A black and white dog chasing sheep in a fiel...,"[animal:dog, animal:sheep]","[A, black, and, white, dog, chasing, sheep, in..."
4,images_train/5293.jpg,"[1.6026496887207031, -1.5058174133300781, 3.02...",[Two bears with their mouths open in the water...,[animal:bear],"[Two, bears, with, their, mouths, open, in, th..."


In [346]:
def predict_image_file(desc_vector, word_to_fv_model, df):
    predicted_fv = word_to_fv_model.predict(desc_vector.reshape(1, -1))
    targets = df['resnet_vector'].to_numpy()
    pred_index = nearest_neighbor(predicted_fv, targets)
    return df['image_file'].iloc[pred_index]

In [337]:
X = np.array([np.sum([d2v_model.infer_vector(doc.split(" ")) for doc in docs], axis=0) for docs in df['descriptions']])
y_true = df['image_file'].to_numpy()
y_pred = [predict_image_file(sample, linr, df) for sample in X]


# target_features = np.array([features[x] for x in df['image_file']])

# X_train, X_test, y_train, y_test = train_test_split(X, y)

In [344]:
target_features = np.array([features[x] for x in df['image_file']])

X_train, X_test, y_train, y_test = train_test_split(X, y)

NameError: name 'desc' is not defined

In [348]:
y_pred = []
for ind, sample in enumerate(X):
    y_pred.append(predict_image_file(sample, linr, df))
    if ind % 100 == 0:
        print(ind)
y_pred

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700


KeyboardInterrupt: 

# Predicting for test data

In [388]:
def nearest_neighbor_test(sample, targets):
    """Returns index position of nearest neighbor"""
    distances = [np.linalg.norm(t-sample) for t in targets]
    distances_with_indices = list(zip(range(len(distances)), distances))
    top_20 = sorted(distances_with_indices, reverse=False, key=lambda x: x[1])[:20]
    return [x[0] for x in top_20]


def predict_image_file_test(desc_vector, word_to_fv_model, df):
    predicted_fv = word_to_fv_model.predict(desc_vector.reshape(1, -1))
    targets = df['resnet_vector'].to_numpy()
    pred_index = nearest_neighbor_test(predicted_fv, targets)
    return df['image_file'].iloc[pred_index]

In [386]:
test_desc_dir = 'data/descriptions_test/'
test_desc_files = os.listbir(test_desc_dir)

test_descs = []

for file in test_desc_files:
    with open(test_desc_dir + file, 'r') as fp:
        test_descs.append([x.strip() for x in fp.readlines()])

AttributeError: module 'os' has no attribute 'listdbir'

In [296]:
embeddings = np.array([np.sum([d2v_model.infer_vector(doc.split(" ")) for doc in docs], axis=0) for docs in test_descs])

In [297]:
embeddings

array([[-0.1886175 , -0.04468335,  0.19973159, ..., -0.03871573,
         0.30764997,  0.09775054],
       [-0.5272241 ,  0.25031227, -0.4978695 , ...,  0.03143799,
        -0.73605955,  0.02075675],
       [-0.01428354, -0.15657303, -0.21891023, ...,  0.08309557,
         0.2509138 , -0.04662977],
       ...,
       [ 0.01299177,  0.07321606,  0.29401723, ..., -0.14092672,
         0.26749045,  0.16646394],
       [-0.32768595,  0.7706371 , -0.34445158, ...,  0.43872094,
         0.2969781 , -0.17775065],
       [-0.21630043, -0.8644986 ,  0.91080666, ...,  0.2970853 ,
        -0.3720528 , -0.27489305]], dtype=float32)

In [350]:
test_df.head()

Unnamed: 0,image_file,resnet_vector,tags
0,images_train/5373.jpg,"[-0.8994496464729309, -0.9304700493812561, -2....","[vehicle:train, person:person, indoor:clock, a..."
1,images_train/984.jpg,"[-1.3469539880752563, -3.1194605827331543, -0....","[person:person, sports:baseball bat]"
2,images_train/7127.jpg,"[-3.44549822807312, -1.5245732069015503, -1.00...","[appliance:refrigerator, appliance:oven, appli..."
3,images_train/9609.jpg,"[1.1146496534347534, -2.1671018600463867, 0.09...","[animal:dog, animal:sheep]"
4,images_train/5293.jpg,"[1.6026496887207031, -1.5058174133300781, 3.02...",[animal:bear]


In [389]:
X = embeddings
y_pred = [predict_image_file_test(sample, linr, test_df) for sample in X]

target_features = np.array([features[x] for x in df['image_file']])

(0, 563615194911409.9)
(0, 2006998104715309.2)
(0, 42608101804160.27)
(0, 948498853351448.5)
(0, 353002674688074.8)
(0, 233200965268840.0)
(0, 1500857691912692.8)
(0, 576649646863374.0)
(0, 3157176788847471.0)
(0, 1856843408943583.8)
(0, 1609495155949606.5)
(0, 1678019655985998.8)
(0, 99922743046711.31)
(0, 300316780019448.56)
(0, 4856520403909656.0)
(0, 2235700887448620.2)
(0, 3103974857748175.0)
(0, 735288930190957.9)
(0, 290956063166161.0)
(0, 3137711340786042.0)
(0, 245416404697016.4)
(0, 5236436864055780.0)
(0, 1613687319792749.0)
(0, 336731551991642.06)
(0, 1465981482258500.2)
(0, 1535104802057403.5)
(0, 2405592372335417.0)
(0, 52507392065972.26)
(0, 1503260409234905.8)
(0, 3283791243067347.0)
(0, 445046856836016.2)
(0, 2067038528959366.0)
(0, 669041621528381.5)
(0, 1475253828566452.2)
(0, 2589937859158808.0)
(0, 955359081391760.8)
(0, 645789043323380.1)
(0, 905065976921768.6)
(0, 690223777278804.1)
(0, 54157153214139.86)
(0, 451862719444142.5)
(0, 488309612572423.06)
(0, 3389742

(0, 839939878033754.8)
(0, 958824409358307.1)
(0, 589613085002920.1)
(0, 2150208500493538.2)
(0, 2080857658435267.5)
(0, 2220666209653394.8)
(0, 495949760683004.8)
(0, 1354225418741388.2)
(0, 2376883837425757.5)
(0, 144938930127380.78)
(0, 1134589569284008.2)
(0, 426437658256620.6)
(0, 1011362298107620.9)
(0, 1158513602091325.5)
(0, 2239262671391576.0)
(0, 1003527843269805.8)
(0, 1260975868348698.2)
(0, 1391148349570488.2)
(0, 3865297346833305.0)
(0, 1361823664680856.5)
(0, 1263741390329981.5)
(0, 1199865992673593.0)
(0, 2843132215845088.0)
(0, 256334163541372.94)
(0, 2874956354500672.5)
(0, 1506248791009938.2)
(0, 120108200748973.55)
(0, 749150279414446.1)
(0, 407100159415074.75)
(0, 1413620629819160.0)
(0, 2994269374128906.0)
(0, 610569872654807.8)
(0, 548055701096879.7)
(0, 1976771231084455.5)
(0, 711882470698327.0)
(0, 199764165563353.56)
(0, 700835930365915.8)
(0, 2187256955970769.5)
(0, 1645512847923258.8)
(0, 1263239083264084.8)
(0, 293431016532426.1)
(0, 2081471889297441.2)
(0,

(0, 1953466049245875.5)
(0, 1911735962785189.5)
(0, 801774975215874.2)
(0, 1715724499078389.2)
(0, 1683713134167624.5)
(0, 2354916339279578.5)
(0, 306639718983924.0)
(0, 15267159954758.1)
(0, 2241752609943135.8)
(0, 168593117748567.5)
(0, 1153023772561704.8)
(0, 1381386639809339.5)
(0, 1038956606954152.5)
(0, 593947296738814.0)
(0, 1283952767077492.8)
(0, 3677599766296905.5)
(0, 1858695981956396.0)
(0, 1137281396827979.8)
(0, 794998661194048.6)
(0, 65928478908349.414)
(0, 3710279424849928.5)
(0, 634382841388138.4)
(0, 347804319088176.2)
(0, 246216248868054.75)
(0, 349301838251550.25)
(0, 440684856138007.3)
(0, 1053427140820135.9)
(0, 1720462018312207.5)
(0, 150613444935367.9)
(0, 3836041003055496.5)
(0, 1678180075903611.2)
(0, 1429661182569488.0)
(0, 2683012633038913.0)
(0, 1914787691772303.2)
(0, 1616436386307891.8)
(0, 1935908697243447.2)
(0, 193938829186228.94)
(0, 160055341160318.88)
(0, 2342766622362363.0)
(0, 4729153388009599.0)
(0, 983999008801850.4)
(0, 3263291106593967.0)
(0, 

(0, 2322961781184981.0)
(0, 1938610812904699.0)
(0, 1559380015961766.0)
(0, 453816444418092.7)
(0, 3089123283704035.0)
(0, 1266171238550669.5)
(0, 3994912345800067.0)
(0, 2184523015034764.2)
(0, 625137741206814.5)
(0, 2367560241460789.0)
(0, 1293192544641063.0)
(0, 386698946129927.6)
(0, 2707812998240314.5)
(0, 1277731699745064.8)
(0, 1343297383207391.2)
(0, 659276884318212.4)
(0, 3358969900696234.5)
(0, 17143576155061.049)
(0, 642995805302297.2)
(0, 171997360531676.75)
(0, 368732525047093.2)
(0, 314998031069698.56)
(0, 3820563469938802.0)
(0, 479391566634055.2)
(0, 835615252554888.8)
(0, 704357261684424.2)
(0, 933518259087835.1)
(0, 511321331226097.9)
(0, 300385163651496.06)
(0, 2410935676323138.5)
(0, 671327723755951.2)
(0, 4046497123667516.0)
(0, 1909807745029127.8)
(0, 815689628284396.8)
(0, 1309790733469952.2)
(0, 2372529689573586.0)
(0, 2518068718734112.0)
(0, 1078887293245506.2)
(0, 929836468377524.0)
(0, 258347654810804.22)
(0, 3429976476609778.0)
(0, 2657203237905945.5)
(0, 13

(0, 1293837743434103.5)
(0, 2026316950097628.5)
(0, 831582645744411.1)
(0, 1511239673869289.5)
(0, 3314461500792979.0)
(0, 2039936956135617.5)
(0, 1420830531683863.2)
(0, 609693062547990.8)
(0, 1832454335097563.0)
(0, 1216443802216850.8)
(0, 797736367764172.4)
(0, 1473331268064575.0)
(0, 1177356993874727.5)
(0, 5900005736603758.0)
(0, 71736905542322.75)
(0, 1860892142582536.2)
(0, 798661675410518.5)
(0, 3758539300662986.0)
(0, 652856007413377.8)
(0, 3679782523855331.0)
(0, 1444136448513340.5)
(0, 1479496601910461.8)
(0, 1618744962629814.0)
(0, 3166374577221310.0)
(0, 2164591879946349.8)
(0, 1452099789797648.2)
(0, 2482462844486197.0)
(0, 3384443573783651.5)
(0, 4428626663675739.5)
(0, 1009332890482243.0)
(0, 451236792615770.8)
(0, 465472566718315.8)
(0, 4933250059605503.0)
(0, 1449404468879408.0)
(0, 73926865935846.67)
(0, 510267568575845.6)
(0, 1278464864706061.8)
(0, 651441624349605.0)
(0, 490989380090781.94)
(0, 866379414223642.9)
(0, 1513780697835216.2)
(0, 225449786774743.0)
(0, 1

(0, 493135229549117.9)
(0, 1039900426825836.1)
(0, 493674518816918.1)
(0, 703120032819550.5)
(0, 87025541676672.8)
(0, 959989914015178.8)
(0, 1496315159915493.5)
(0, 877503860811449.1)
(0, 5148163292881976.0)
(0, 695002488646389.1)
(0, 373323913903064.1)
(0, 1067409535098191.1)
(0, 4077439717162124.0)
(0, 1234698149047816.8)
(0, 342288067704865.2)
(0, 1571146538089234.2)
(0, 738322879189075.1)
(0, 2127995191371417.5)
(0, 7741867675757525.0)
(0, 586140718469753.8)
(0, 2035037693976125.5)
(0, 2786464851814984.5)
(0, 2322370349500415.5)
(0, 852006688809970.5)
(0, 1830754045958647.5)
(0, 3409302463041375.0)
(0, 85676876496347.88)
(0, 1256899152436002.2)
(0, 774563190238699.6)
(0, 88997695685606.6)
(0, 1184315966128624.8)
(0, 4442010015243884.0)
(0, 567029238909165.2)
(0, 1545695525344553.5)
(0, 2981605660254055.0)
(0, 2709696717680806.0)
(0, 984195774476961.0)
(0, 292316194196022.6)
(0, 1391875633622335.2)
(0, 439903903936446.4)
(0, 101560964592902.42)
(0, 7949476268013975.0)
(0, 186929837

In [399]:
predictions = []

for ind, test_file in enumerate(test_desc_files):
    p = [x.split('/')[-1] for x in y_pred[ind].to_numpy()]
    predictions.append(f'{test_file},{" ".join(p)}\n')

In [401]:
with open('pred_out.csv', 'w+') as f:
    for pred in predictions:
        f.write(pred)