In [84]:
import pandas as pd
import numpy as np

ResNet = pd.read_csv('./data/features_train/features_resnet1000_train.csv', header=None)
ResNet.columns = ['fnum'] + list(range(1000))
ResNet['fnum'] = ResNet['fnum'].apply(lambda x: int(x.split('/')[-1].split('.')[0]))
ResNet.sort_values('fnum', inplace = True)
ResNet.set_index('fnum', inplace = True)

# Manual implementation of Softmax
# Matrix of probabilities

# img num 1: [ P(class=1), P(class=2), ...]
# img num 2: [ P(class=1), P(class=2), ...]
probabilities = np.exp(ResNet.values)/np.exp(ResNet.values).sum(axis=1, keepdims=True)

In [141]:
from glob import glob
from collections import defaultdict, Counter

def counter_to_df_row(counter, index):
    if bool(counter):
        row = pd.DataFrame.from_dict(counter, orient='index').transpose()
        row.index = [index]
        return row
    else:
        return pd.DataFrame(index=[index])

def get_tags_from(fname):
    with open(fname) as f:
        fnum = int(fname.split('/')[-1].split('.')[0])
        tags = f.read().splitlines()
        categories = Counter([tag.split(':')[0] for tag in tags])
        subcategories = Counter([tag.split(':')[1] for tag in tags])
        
        cat_row = counter_to_df_row(categories, fnum)
        subcat_row = counter_to_df_row(subcategories, fnum)
        
    return cat_row, subcat_row
        

files = glob('./data/tags_train/*')
all_tags = [get_tags_from(file) for file in files]
cats, subcats = tuple(zip(*all_tags))

cats = reduce(lambda x, y: x.append(y), cats)
cats.fillna(0, inplace=True)

subcats = reduce(lambda x, y: x.append(y), subcats)
subcats.fillna(0, inplace=True)

In [162]:
# Matrix of probabilities

# category 1: [ P(class=1), P(class=2), ...]
# category 2: [ P(class=1), P(class=2), ...]
cats_to_probs = cats.values.transpose().dot(probabilities)
cats_to_probs = cats_to_probs/cats_to_probs.sum(axis=1, keepdims=True)
subcats_to_probs = subcats.values.transpose().dot(probabilities)
subcats_to_probs/subcats_to_probs.sum(axis=1, keepdims=True)

array([[  1.21305701e-04,   1.40984897e-04,   2.97007562e-04, ...,
          2.76207843e-05,   2.48732388e-03,   1.29685029e-04],
       [  1.69934765e-05,   1.60778673e-05,   2.31273623e-04, ...,
          1.31897164e-05,   2.56906690e-04,   1.55202307e-04],
       [  6.68842731e-05,   2.63773947e-05,   4.24204094e-05, ...,
          3.62173011e-05,   2.66744823e-04,   1.52151430e-03],
       ..., 
       [  9.62382692e-05,   6.10002604e-05,   2.49914923e-04, ...,
          1.08952755e-04,   4.33915352e-04,   2.94091406e-04],
       [  1.34492847e-04,   2.83187871e-05,   7.69276794e-05, ...,
          2.75692242e-04,   3.88887448e-04,   2.99520696e-04],
       [  4.68911309e-05,   3.55254613e-05,   2.73872479e-05, ...,
          1.35055768e-04,   2.94261187e-04,   6.58043543e-04]])

In [166]:
class_probs = probabilities.sum(axis=0)/probabilities.sum()
class_probs

array([  3.89798417e-04,   7.93448853e-05,   8.04735907e-05,
         4.91655990e-05,   7.67258482e-05,   1.47928225e-04,
         2.07902478e-05,   4.21038936e-05,   8.29219566e-05,
         5.86505107e-04,   5.46321996e-04,   2.78416173e-04,
         5.24198018e-04,   4.61787563e-05,   2.90393948e-04,
         1.46964827e-04,   2.86144788e-04,   7.87667450e-05,
         3.32525973e-05,   7.43686242e-05,   1.08164655e-04,
         7.25716764e-04,   3.42249438e-05,   1.28132746e-04,
         2.52651220e-05,   1.08912751e-05,   6.79277303e-05,
         1.26886316e-05,   8.83177374e-06,   1.40138019e-04,
         1.42861211e-05,   2.22134786e-05,   6.60237378e-05,
         6.46027340e-05,   1.73928812e-04,   2.07047207e-05,
         8.40178237e-05,   7.62187669e-06,   1.57340746e-04,
         1.60314752e-05,   8.45128545e-06,   1.12822496e-04,
         2.24860497e-05,   5.79183640e-05,   1.50512729e-05,
         3.21508296e-05,   4.87685138e-06,   5.76454080e-05,
         1.17821234e-05,

In [168]:
(probabilities/class_probs).shape

(10000, 1000)