In [167]:
!ls data

[34mdescriptions_test[m[m     [34mfeatures_train[m[m        sample_submission.csv
[34mdescriptions_train[m[m    [34mimages_test[m[m           [34mtags_test[m[m
[34mfeatures_test[m[m         [34mimages_train[m[m          [34mtags_train[m[m


In [176]:
import numpy as np
import pandas as pd

import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score as cv
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import MultiLabelBinarizer

import gensim

from utils.data_parsers import load_dataframe as ld

In [177]:
dff = ld()

TypeError: list indices must be integers or slices, not str

In [None]:
dff.head()

In [107]:
def extract_tags_to_list(fp):
    
    with open(fp) as f:
        tags = f.readlines()

    return [tag.strip() for tag in tags]


def get_tags(df, train_or_test='train', imfile_column='image_file'):
    """Returns pandas series of image tags"""
    
    return df[imfile_column].apply(
        lambda x: extract_tags_to_list(
            f'data/tags_{train_or_test}/' + x.split('/')[-1].replace('jpg', 'txt')
        )
    )



def extract_tags_to_list_split(fp):
    
    with open(fp) as f:
        tags = f.readlines()
        
    tag_pairs = [tag.strip().split(':') for tag in tags]
    
    higher_cat = [x[0] for x in tag_pairs]
    lower_cat = [x[0] for x in tag_pairs]

    return higher_cat, lower_cat


def get_tags_split(df, train_or_test='train', imfile_column='image_file'):
    """Returns two pandas series of image tags, higher and lower category"""
    
    return zip(*df[imfile_column].map(
        lambda x: extract_tags_to_list_split(
            f'data/tags_{train_or_test}/' + x.split('/')[-1].replace('jpg', 'txt')
        )
    ))

In [114]:
with open('data/descriptions_train/0.txt') as f:
    x = f.readlines()

In [162]:
def open_description(fp):
    with open(fp) as f:
        descriptions = [x.strip() for x in f.readlines()]
        
    return descriptions

def get_descriptions(df, train_or_test='train', imfile_column='image_file'):
    """Descriptions are independant? lists of line by line description"""
    return df[imfile_column].apply(
        lambda x: open_description(f'data/descriptions_{train_or_test}/' + x.split('/')[-1].replace('jpg', 'txt'))
    )

In [164]:
descs = get_descriptions(df)

In [73]:
def get_resnet_features(train_or_test='train'):
    """Gets image_id -> 1000 dim feature vector"""
    
    fp = f'data/features_{train_or_test}/features_resnet1000_{train_or_test}.csv'
    lines = []
    
    with open(fp) as f:
        line = f.readline()
        while line:
            line = line.split(',')
            lines.append({"image_file": line[0], "resnet_vector": np.array([np.double(x.strip()) for x in line[1:]])})
            line = f.readline()
            
    return lines


def get_resnet_intermediate_features(train_or_test='train'):
    """Gets image_id -> 2048 dim intermediate feature vector"""
    
    fp = f'data/features_{train_or_test}/features_resnet1000intermediate_{train_or_test}.csv'
    lines = []

    with open(fp) as f:
        line = f.readline()
        while line:
            line = line.split(',')
            lines.append({"image_file": line[0], "resnet_vector": np.array([np.double(x.strip()) for x in line[1:]])})
            line = f.readline()
            
    return lines

In [74]:
features = get_resnet_features()
features_int = get_resnet_intermediate_features()

In [75]:
df = pd.DataFrame(features)

In [100]:
get_tags(df)

0       [vehicle:train, person:person, indoor:clock, a...
1                    [person:person, sports:baseball bat]
2       [appliance:refrigerator, appliance:oven, appli...
3                              [animal:dog, animal:sheep]
4                                           [animal:bear]
                              ...                        
9995                      [sports:frisbee, person:person]
9996    [vehicle:car, vehicle:bus, person:person, acce...
9997                         [person:person, sports:skis]
9998                       [food:broccoli, food:sandwich]
9999         [person:person, animal:giraffe, animal:bird]
Name: image_file, Length: 10000, dtype: object

In [134]:
model = gensim.models.KeyedVectors.load_word2vec_format('./models/GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
def get_word2vec(df, model, desc_column='descriptions'):
    vecs = df[desc_column].apply(
        lambda x: model.get_vector(x)
    )

In [165]:
desc_join = descs.apply(lambda x: " ".join(x))

In [166]:
desc_join

0       a red train is docked at the station Several p...
1       A man with blue jersey holding a baseball bat....
2       A kitchen decorated in red and white with acce...
3       A black and white dog chasing sheep in a field...
4       Two bears with their mouths open in the water....
                              ...                        
9995    an image  of people outside playing frisbee a ...
9996    A red double decker bus driving down a city st...
9997    A man riding skis down a snow covered slope. a...
9998    a close up of a plate with broccoli a red plat...
9999    A large giraffe standing next to a man pushing...
Name: image_file, Length: 10000, dtype: object

### Applying naive bayes to BOW representation of description

In [169]:
mlb = MultiLabelBinarizer()
one_hot_encodings = mlb.fit_transform(desc_join)
one_hot_columns = mlb.classes_

train_one_hot = df.join(
    pd.DataFrame(
        one_hot_encodings,
        columns=one_hot_columns,
        index=df.index
    ))

train_one_hot = train_one_hot

train_one_hot.head()

Unnamed: 0,image_file,resnet_vector,Unnamed: 3,!,"""",#,&,',(,),...,q,r,s,t,u,v,w,x,y,z
0,images_train/5373.jpg,"[-0.8994496464729309, -0.9304700493812561, -2....",1,0,0,0,0,0,0,0,...,0,1,1,1,0,1,1,0,1,0
1,images_train/984.jpg,"[-1.3469539880752563, -3.1194605827331543, -0....",1,0,0,0,0,0,0,0,...,0,1,1,1,1,0,1,1,1,0
2,images_train/7127.jpg,"[-3.44549822807312, -1.5245732069015503, -1.00...",1,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,0,1,0
3,images_train/9609.jpg,"[1.1146496534347534, -2.1671018600463867, 0.09...",1,0,0,0,0,0,0,0,...,0,1,1,1,1,0,1,0,1,0
4,images_train/5293.jpg,"[1.6026496887207031, -1.5058174133300781, 3.02...",1,0,0,0,0,0,0,0,...,0,1,1,1,1,0,1,0,1,0
