In [3]:
import os
import torch
import matplotlib.pyplot as plt
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import numpy as np
import torch.nn as nn
import torchvision.models as models
from torch.autograd import Variable
import pandas as pd
# Root folder of the image dataset, given relative to the current working
# directory. It is handed to datasets.ImageFolder in the extraction cell.
data_dir = os.path.join("./", "breast")

In [4]:
# Build a frozen feature extractor from the pretrained ResNet-18.
pretrained_resnet = models.resnet18(pretrained=True)

# Keep every child module except the final one and chain the rest back
# together, so a forward pass stops before the network's last layer.
modules = list(pretrained_resnet.children())
modules = modules[:-1]
resnet18 = nn.Sequential(*modules)

# The extractor is only used for inference; switch off gradient tracking for
# all of its weights.
for p in resnet18.parameters():
    p.requires_grad = False

In [6]:
### Extract features from dataset including all resolutions

In [5]:
# Preprocessing applied to every image: convert to a tensor, then normalise
# each channel with the given mean / std.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# One image per batch, in folder order (no shuffling), loaded in the main process.
dataset = datasets.ImageFolder(data_dir, transform=transform)
dataloader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)

# Each image contributes one (513, 1) column: 512 encoder outputs followed by
# its label. Columns are collected in a list and concatenated once after the
# loop; appending to the array inside the loop would re-copy the whole matrix
# for every image. The leading all-zero column is kept so `image_features`
# has exactly the same layout as before (placeholder first, then one column
# per image).
feature_columns = [np.zeros((513, 1))]

resnet18.eval()

with torch.no_grad():
    for image, label in dataloader:
        encoding = resnet18(image)

        encoding_np = encoding.numpy()
        label = label.numpy()

        # Flatten the encoder output to 512 values, tack the label on as the
        # 513th value, and shape the result as a single column.
        encoding_np = encoding_np.reshape(1, 512)
        encoding_np = np.append(encoding_np, label)
        encoding_np = encoding_np.reshape(513, 1)

        feature_columns.append(encoding_np)

image_features = np.concatenate(feature_columns, axis=1)

# Drop the first column of zeros (the placeholder); the remaining columns keep
# their labels 1..N, one per image.
image_features_df = pd.DataFrame(image_features)
image_features_df = image_features_df.iloc[:, 1:]
print(image_features_df)

         1         2         3         4         5         6         7     \
0    0.531274  0.594747  0.590899  0.722041  0.465432  0.322455  0.520599   
1    0.139732  0.348665  0.318642  0.289941  0.255532  0.188617  0.339818   
2    0.337814  0.550642  0.594561  0.482355  0.506674  0.407469  0.211697   
3    1.492059  1.213679  1.026551  1.200261  1.401562  1.081025  1.293290   
4    0.270016  0.288044  0.612079  0.660533  0.628363  0.218116  0.069697   
..        ...       ...       ...       ...       ...       ...       ...   
508  0.105080  0.219287  0.172791  0.211580  0.149814  0.238355  0.406067   
509  1.018662  1.056771  0.957266  1.212001  1.137472  1.204388  0.621209   
510  0.318730  0.158945  0.156660  0.212991  0.118137  0.092524  0.370001   
511  0.126240  0.048812  0.265091  0.366576  0.206605  0.364724  0.293995   
512  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   

         8         9         10    ...      3739      3740      3741  \
0  

In [9]:
# Persist the extracted table without the row index. At this point the frame
# is still in its untransposed layout: one column per image, 513 rows
# (512 features followed by the label row).
image_features_df.to_csv('features_all.csv', index=False)

In [10]:
# Flip the table so that each row is one image and each column one feature;
# the label, stored as the last of the 513 values, ends up in column 512.
image_features_df_T = image_features_df.transpose()
labels = image_features_df_T.iloc[:, 512]

# Sanity check: one label per image, and 513 columns per image.
for checked in (labels, image_features_df_T):
    print(checked.shape)


(3748,)
(3748, 513)


In [11]:
# Feature matrix: the first 512 columns of every row. Column 512, which holds
# the label, is left out.
feature_set_x = image_features_df_T.iloc[:, :512]

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV

# Hold out 30% of the images for evaluation. The split is stratified on the
# label, and seeded so that re-running the notebook reproduces the same
# partition and therefore the same score.
X_train, X_test, y_train, y_test = train_test_split(
    feature_set_x, labels, stratify=labels, test_size=0.3, random_state=42
)

# Logistic regression with built-in cross-validation; max_iter is set to 5000,
# well above scikit-learn's default of 100.
model_all = LogisticRegressionCV(max_iter=5000)
model_all.fit(X_train, y_train)

# Mean accuracy on the held-out split.
s = model_all.score(X_test, y_test)
print("The score of the model is: ", s)

The score of the model is:  0.9057777777777778


In [19]:
# Load a pre-computed feature table, using the first CSV column as the row
# index. Judging by the file name these are presumably the 40x-magnification
# features.
# NOTE(review): no cell shown in this notebook writes this file (only
# 'features_all.csv' is written here) -- confirm where it comes from.
data = pd.read_csv("logistic_regression_40x_features.csv", index_col=0)
data


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,503,504,505,506,507,508,509,510,511,512
1,0.271028,0.208780,0.152388,1.255196,0.453387,0.204274,0.396641,0.662290,2.381847,0.763311,...,0.031166,0.476478,1.420786,0.130925,0.158760,0.308012,1.168912,0.035456,0.235910,0.0
2,0.431392,0.269095,0.121084,1.369295,0.165390,0.200180,0.755715,1.141574,2.679910,0.191846,...,0.001182,0.298568,1.560464,0.437376,0.089650,0.352928,1.118457,0.044374,0.135839,0.0
3,0.189157,0.248151,0.175657,1.353558,0.167985,0.283405,0.603197,0.640468,1.740507,0.496436,...,0.028693,0.393584,2.182479,0.250761,0.181518,0.213455,0.870525,0.115242,0.084740,0.0
4,0.181322,0.431159,0.183613,1.435808,0.244700,0.258571,0.376346,1.099299,1.455429,0.377059,...,0.042131,0.269249,1.557632,0.267773,0.207152,0.158261,1.115834,0.108024,0.225783,0.0
5,0.557896,0.148965,0.297226,1.440755,0.265388,0.100576,0.482407,1.267099,3.177758,0.391448,...,0.015477,0.503995,1.973315,0.350742,0.277134,0.234877,1.566571,0.074112,0.155174,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1951,0.409747,0.057417,0.243835,1.170132,0.145170,0.518152,1.096409,0.612581,2.396959,0.879318,...,0.014551,0.868811,1.065686,0.072270,0.250466,0.306638,0.781481,0.102368,0.035240,1.0
1952,0.487585,0.064823,0.133818,0.907975,0.518479,0.533047,0.954128,0.767510,1.636675,0.891331,...,0.010044,0.961497,1.074510,0.163653,0.259980,0.249659,0.396892,0.080202,0.217580,1.0
1953,0.432066,0.216163,0.218408,1.073746,0.319600,0.340214,0.694728,0.311092,1.669877,1.160774,...,0.045588,0.781521,0.898151,0.232340,0.660326,0.627356,0.317466,0.116915,0.460110,1.0
1954,0.078024,0.146581,0.063674,0.879625,0.198507,0.515858,0.855469,0.895579,1.506367,0.461575,...,0.016633,0.383521,1.302146,0.322049,0.366671,0.466623,0.363642,0.070426,0.396443,1.0


In [23]:
# Split the loaded table the same way as the all-resolution one: the first
# 512 columns are the features, column 512 is the label.
# NOTE(review): assumes the CSV follows that 512-features-plus-label layout.
feature_set_x_40 = data.iloc[:, :512]
labels_40 = data.iloc[:, 512]

In [26]:
# Stratified 70/30 split of the CSV-loaded features. The seed makes the
# partition, and every number derived from it, reproducible across re-runs.
X_train_40, X_test_40, y_train_40, y_test_40 = train_test_split(
    feature_set_x_40, labels_40, stratify=labels_40, test_size=0.3, random_state=42
)

# Same estimator configuration as the all-resolution model. The fit call is
# left as the cell's last expression so the fitted estimator is displayed.
model_40 = LogisticRegressionCV(max_iter=5000)
model_40.fit(X_train_40, y_train_40)

LogisticRegressionCV(max_iter=5000)

In [27]:
# Mean accuracy of the model fitted on the CSV-loaded features, evaluated on
# the full feature set extracted from the image folder.
# NOTE(review): if the images behind the CSV are also part of `feature_set_x`,
# this figure includes samples the model was trained on -- confirm, and if so
# exclude them before quoting the number.
# NOTE(review): `feature_set_x` has integer column labels, while the training
# frame read from CSV presumably has string labels; newer scikit-learn
# versions may warn about mismatched feature names -- verify.
model_40.score(feature_set_x, labels)



0.8025613660618997