# Get output from hidden layers - latent image representation


#### used for the PCA plot to compare feature space

In [None]:
import matplotlib.pyplot as plt
%reload_ext autoreload
%autoreload 2
%matplotlib inline
from fastai.imports import *
from fastai.torch_imports import *
from fastai.transforms import *
from fastai.conv_learner import *
from fastai.model import *
from fastai.dataset import *
from fastai.sgdr import *
from fastai.plots import *


In [None]:
# Select CUDA device 0 for everything that follows.
torch.cuda.set_device(0)
# The relative paths used below (PATH, image paths) are resolved against this directory.
os.chdir('/home/jgmeyer2/fastai/courses/dl1/')
PATH = "data/data_rmsalt_rmol/pics/"  # dataset root: holds the label/validation CSVs and the 'train' image folder
sz = 500  # image size handed to tfms_from_model through get_data
arch = resnext101_64  # architecture handed to tfms_from_model and ConvLearner.pretrained
bs = 50  # batch size handed to ImageClassifierData through get_data

## functions


In [None]:

def get_val_cv_byclass(label_csv, val_frac=0.2):
    """Draw a random, class-stratified set of validation row indices.

    For every distinct value in the ``class`` column of ``label_csv``,
    ``val_frac`` of the rows carrying that value (rounded to the nearest
    integer) are sampled without replacement.

    Args:
        label_csv: path of a CSV file that has a ``class`` column.
        val_frac: fraction of each class to hold out (default 0.2).

    Returns:
        A flat list of DataFrame index labels, grouped by class in order of
        first appearance of each class.
    """
    label_df = pd.read_csv(label_csv)
    val_idxs = []
    # NOTE (original author): should be class but reversed column labels
    for cls in label_df['class'].unique():
        # Sample from the indices that really belong to this class.  The
        # previous range(first, first + count - 1) could never pick the last
        # row of a class, under-counted the class by one, and was only valid
        # when all rows of a class were contiguous in the file.
        cls_idxs = label_df.index[label_df['class'] == cls].tolist()
        n_sample = int(round(len(cls_idxs) * val_frac, 0))
        val_idxs.extend(random.sample(cls_idxs, n_sample))
    return val_idxs

def get_val_idx_fromfile(validx_csv):
    """Load previously saved validation indices.

    Reads ``validx_csv`` as a header-less CSV and returns the values of its
    first column as a plain Python list.
    """
    first_column = pd.read_csv(validx_csv, header=None).iloc[:, 0]
    return first_column.tolist()
    
    
def get_data(sz, bs, val_idxs, label_csv): # sz: image size, bs: batch size
    """Build the image-classifier data object for the 'train' folder under PATH.

    Args:
        sz: image size passed to ``tfms_from_model``.
        bs: batch size.
        val_idxs: row indices used as the validation set.
        label_csv: path of the CSV with the labels.

    Returns:
        The data object as built when ``sz`` is above 300; otherwise the
        result of ``data.resize(340, 'tmp')``.
    """
    data = ImageClassifierData.from_csv(
        PATH, 'train', label_csv,
        val_idxs=val_idxs, suffix='.png',
        tfms=tfms_from_model(arch, sz), bs=bs, num_workers=4,
    )
    if sz > 300:
        return data
    # Original author's rationale: reading and resizing big images is slow,
    # so for small target sizes everything is resized to 340 once up front.
    return data.resize(340, 'tmp')

## Make a convolutional learner object; don't train it, just save it

In [None]:
label_csv = f'{PATH}12cls_rmsaltol.csv'
vacc = []
rep, start = 1, 1
valididx_base = '12cls_val_ids'

# The validation split lives in <PATH><base><rep+start>.csv; build that path
# once so the file that is loaded and the file that is printed cannot differ.
validx_csv = f'{PATH}{valididx_base}{rep + start}.csv'
val_idxs = get_val_idx_fromfile(validx_csv)

# Pretrained learner on the chosen split; it is not fitted here.
data = get_data(sz, bs, val_idxs, label_csv)
learn = ConvLearner.pretrained(arch, data, precompute=False, ps=0)
#learn.fit(1e-2, 100)
print(validx_csv)

In [None]:
### save the learner's current weights as '12cls_MTfinal.model', then load that file back
# NOTE(review): save() runs before load() and the fit call above is commented out,
# so what is loaded is exactly what was just saved. If a checkpoint with this name
# already exists it is presumably overwritten here - confirm that this is intended.
learn.save('12cls_MTfinal.model')
learn.load('12cls_MTfinal.model')
model = learn.model
# Put the network into evaluation mode before using it for feature extraction.
model = model.eval()

In [None]:
### get nn.module minus 'n' last layers, in this case minus 3 layers
class nnBottom(nn.Module):
    """The bottom of a network: ``original_model`` minus its last top-level children.

    Args:
        original_model: module whose top-level children are re-used, in order.
        n_remove: how many trailing children to drop (default 3). ``0`` keeps
            every child; a value larger than the number of children keeps none.

    Raises:
        ValueError: if ``n_remove`` is negative.
    """
    def __init__(self, original_model, n_remove=3):
        super(nnBottom, self).__init__()
        if n_remove < 0:
            raise ValueError("n_remove must be >= 0, got {}".format(n_remove))
        layers = list(original_model.children())
        # Slice by an explicit end position: a plain [:-n_remove] would keep
        # nothing at all when n_remove is 0.
        n_keep = max(len(layers) - n_remove, 0)
        self.features = nn.Sequential(*layers[:n_keep])

    def forward(self, x):
        """Run ``x`` through the retained layers and return their output."""
        return self.features(x)
# Truncated version of the loaded model, built from its leading top-level children.
nnbot = nnBottom(model)


In [None]:
# Display the full model for comparison with the truncated one.
model

In [None]:
# Display the truncated model.
nnbot

In [None]:
# Put the truncated model into evaluation mode as well.
nnbot = nnbot.eval()

#### Make a mini image loader so we can get the latent vector for any image

In [None]:
# Relative image paths below are resolved against the course directory.
os.chdir('/home/jgmeyer2/fastai/courses/dl1/')
from torchvision import transforms
imsize = 500  # NOTE(review): not referenced by the loader below - no resize is applied in the code shown
# Per-image preprocessing: convert to a tensor, then normalise each of the three
# channels with mean 0.5 and std 0.5.
# NOTE(review): the learner's own transforms come from tfms_from_model(arch, sz);
# presumably those use different normalisation statistics - confirm the latents are
# computed with the normalisation the network expects.
loader = transforms.Compose([transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

def image_loader(image_name):
    """Load one image file and return it as a single-item batch on the GPU.

    The file is decoded as 3-channel RGB, passed through the module-level
    ``loader`` transform and given a leading batch dimension.

    Args:
        image_name: path of the image file to open.

    Returns:
        The transformed image with a leading batch dimension of 1, moved to
        the current CUDA device (a GPU is required).
    """
    # The context manager closes the file handle as soon as the pixels have
    # been read, which matters when this is called once per file in a loop.
    with Image.open(image_name) as img:
        # `loader` normalises exactly three channels, so palette, grayscale
        # or RGBA PNGs have to be converted to RGB first.
        image = loader(img.convert('RGB')).float()
    # Only forward passes are run on the result; no gradient w.r.t. the
    # input is needed.
    image = Variable(image, requires_grad=False)
    image = image.unsqueeze(0)  # add the leading batch dimension
    return image.cuda()  # assumes that you're using GPU

# Smoke test on a single picture: load it and push it through the truncated model.
image = image_loader('data/drugs_no_overlap/pics/train/antineoplastic/0.png')

# Detach the output from the graph, move it to the CPU and convert it to a NumPy array.
latent = nnbot(image).data.cpu().numpy()
#pic

In [None]:
# Length of the first (and only) item of the batch output.
len(latent.tolist()[0])

In [None]:
# The values themselves for that item.
latent.tolist()[0]

# Loop through all the pictures, record their names, class, and the 512 array

In [None]:
import os
# Work from inside the training image folder so that `find .` yields paths relative to it.
os.chdir('/home/jgmeyer2/fastai/courses/dl1/data/data_rmsalt_rmol/pics/train/')

In [None]:
# IPython shell capture: every path below the current directory whose name ends in "png".
files = !find . -name "*png"

In [None]:
# Number of image paths found.
len(files)

In [None]:

### try the layer before that one

#### write the latent values to file with their classes

In [None]:
latents = []
for file in files:
    # One forward pass per image; the batch output is converted to nested
    # Python lists and its rows are appended to the running collection.
    batch_latents = nnbot(image_loader(file)).data.cpu().numpy()
    latents.extend(batch_latents.tolist())

In [None]:
# One line per image: the file path followed by its latent values.
# No header row is written.
with open("latent_12cls_minus3.csv", "w") as f:
    for i, fname in enumerate(files):
        # str(list) renders as "[a, b, ...]"; stripping the brackets leaves
        # the comma-separated values.
        values = str(latents[i]).strip('[]')
        f.write(fname + ',' + values + '\n')

#### read the file you wrote

In [None]:
import pandas as pd
# The file written by the cell above has no header row. With the read_csv default
# (header=0) the first sample would be consumed as column names and lost, so read
# every line as data. Column 0 is the file path, the remaining columns the latents.
df = pd.read_csv("latent_12cls_minus3.csv", header=None)

In [None]:
# Preview the first rows of the loaded table.
df.head()

In [None]:
# Every column except the first one: the latent values.
df_latent =df[df.columns[1:]]

In [None]:
# First column: the file path written for each row.
fls = df[df.columns[0]]

In [None]:
# Second '/'-separated component of each path.
# Assumes paths look like './<class>/<file>.png' as produced by `find .` - TODO confirm.
sample_labels = [label.split('/')[1] for label in fls]

In [None]:
fls

In [None]:
sample_labels

#### do PCA then plot with bokeh

In [None]:
from sklearn.decomposition import PCA

# Project the latent vectors onto their first 10 principal components.
pca = PCA(n_components = 10)
# Keep the projected scores under their own name: binding them to `PCA` would
# replace the imported class and make this cell fail when it is run again.
pca_scores = pca.fit_transform(df_latent)

print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.sum())

# First two components of every sample, used as the plot coordinates.
PC1s = [scores[0] for scores in pca_scores]
PC2s = [scores[1] for scores in pca_scores]

    ### make color dict later


#colors = [colormap[x] for x in sample_labels]

In [None]:
from bokeh.palettes import Category20
from bokeh.transform import factor_cmap

# One class label per sample; this is the column the colour map is keyed on.
condition = sample_labels

# Class names, in the order in which colours are assigned to them.
non_redundant_category_list = [
    'antiinfective',
    'cns',
    'lipidregulating',
    'urological',
    'hematologic',
    'reproductivecontrol',
    'antineoplastic',
    'dermatologic',
    'gastrointestinal',
    'respiratorysystem',
    'antiinflammatory',
    'cardio',
]

# Category20 is indexed by the number of colours wanted: one per class, at most 20.
n_colors = min(len(non_redundant_category_list), 20)
palette = Category20[n_colors]
fill = factor_cmap(
    'condition',
    palette=palette,
    factors=non_redundant_category_list,
)

print(fill)
#plot.circle(x='PC1',y='PC2', source=source,
#    alpha=0.5, fill_color=fill, size=20)

In [None]:
# Show the colours next to the class names they are paired with (same order).
print(palette)
print(non_redundant_category_list)

In [None]:


from bokeh.io import output_notebook, save
from bokeh.io import save as sv
from bokeh.plotting import ColumnDataSource, figure, output_file, show

# Columns shared by the glyphs, the colour map and the hover tool.
source = ColumnDataSource(data={
    'x': PC1s,
    'y': PC2s,
    'condition': sample_labels,
    'desc': sample_labels,
})

# Hover tool fields: point index, cursor position and class label.
TOOLTIPS = [("index", "$index"), ("(x,y)", "($x, $y)"), ("desc", "@desc")]

output_notebook()
#('latents_12cls_v3trained2ep.html')

# Axis labels carry the share of variance explained by each component.
pc1_ratio, pc2_ratio = pca.explained_variance_ratio_[:2]

p = figure(
    title="PCA latent",
    x_range=(-4, 9),
    y_range=(-3.5, 7),
    tooltips=TOOLTIPS,
    plot_width=800,
    plot_height=500,
)
p.xaxis.axis_label = 'PC1 ({})'.format(round(pc1_ratio, 2))
p.yaxis.axis_label = 'PC2 ({})'.format(round(pc2_ratio, 2))
#p.output_backend = "png"

# Colour every point by its class and list the classes in the legend.
class_colors = factor_cmap('condition', palette=palette,
                           factors=non_redundant_category_list)
p.circle('x', 'y', fill_alpha=1, size=5, source=source, legend='desc',
         fill_color=class_colors)
p.output_backend = "svg"
show(p)


In [None]:
from bokeh.plotting import figure
from bokeh.resources import CDN
from bokeh.embed import file_html

# Minimal two-point figure used to try out HTML export.
plot = figure()
plot.circle([1,2], [3,4])

# Render the figure to an HTML string titled "my plot", passing the CDN resources object.
# The string is only kept in `html`; nothing in the code shown writes it to disk.
html = file_html(plot, CDN, "my plot")