### Preprocessing
* http://cs231n.github.io/neural-networks-2/#datapre
* http://www.robots.ox.ac.uk/~vgg/practicals/cnn/#getting-started
* https://www.safaribooksonline.com/library/view/programming-computer-vision/9781449341916/ch01.html
* https://stackoverflow.com/a/10169025/5151861
* augmentation tf+keras http://machinelearningmastery.com/image-augmentation-deep-learning-keras/

#### Augmentation
* rotation: random with angle between 0° and 360° (uniform)
* translation: random with shift between -10 and 10 pixels (uniform)
* rescaling: random with scale factor between 1/1.6 and 1.6 (log-uniform)
* flipping: yes or no (bernoulli)
* shearing: random with angle between -20° and 20° (uniform)
* stretching: random with stretch factor between 1/1.3 and 1.3 (log-uniform)
* whitening
* https://www.tensorflow.org/api_docs/python/tf/image
* https://github.com/aleju/imgaug
* http://augmentor.readthedocs.io/en/master/
* https://github.com/analysiscenter/dataset

* uint8 0 to 255
* uint16 0 to 65535
* uint32 0 to 232
* float -1 to 1 or 0 to 1
* int8 -128 to 127
* int16 -32768 to 32767
* int32-231 to 231 - 1

In [20]:

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import misc
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

% matplotlib inline

In [3]:
train_df = pd.read_csv('data/data_analys.csv', index_col=0)

### Load train transformed

In [4]:
% % time

dir_train_transformed = 'data/train_transformed'
# img parameters
img_shape = (64, 64)
channels = 3
img_shape_flattened = img_shape[0] * img_shape[1] * channels
img_qty = train_df.shape[0]

# initialize X,y
X = np.empty(shape=(img_qty, img_shape_flattened), dtype=np.int8)
y = np.empty(shape=(img_qty,), dtype=np.uint16)

# read images
for i, f_name in enumerate(os.listdir(dir_train_transformed)):
    if i % 5000 == 0:
        print('{:6d}/{:6d} images loaded'.format(i, img_qty))

    img_path = os.path.join(dir_train_transformed, f_name)
    X[i, :] = misc.imread(img_path).flatten('C')  # since img is np.ndarray, flatten in row-style
    y[i] = train_df.loc[train_df['image_name'] == f_name, 'target'].iloc[0]

     0/ 22897 images loaded
  5000/ 22897 images loaded
 10000/ 22897 images loaded
 15000/ 22897 images loaded
 20000/ 22897 images loaded
CPU times: user 1min 17s, sys: 4.05 s, total: 1min 21s
Wall time: 1min 27s


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

### Find optimal PCA components

In [None]:
% % time
pca = PCA(n_components=5000).fit(X_train)
print('{}'.format(np.cumsum(pca.explained_variance_ratio_)))

In [None]:
round(np.cumsum(pca.explained_variance_ratio_)[-1400], 4)

In [None]:
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of components')
plt.ylabel('Total explained variance')
plt.yticks(np.arange(0, 1.1, 0.1))
plt.xticks(np.arange(0, 5000, 500))
plt.axhline(0.9, c='r');

10-48

In [6]:
% % time
pca = PCA(n_components=3600)

X_train_sc = X_train / 255
X_train_pca = pca.fit_transform(X_train_sc)

CPU times: user 21min 32s, sys: 1min 56s, total: 23min 28s
Wall time: 13min 51s


In [7]:
% % time
X_test_sc = X_test / 255
X_test_pca = pca.transform(X_test_sc)

CPU times: user 36.2 s, sys: 7.49 s, total: 43.7 s
Wall time: 37.8 s


In [8]:
reg = LogisticRegression(random_state=42,
                         n_jobs=-1,
                         class_weight='balanced')

In [21]:
param_dist = {'C': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
              'penalty': ['l1', 'l2']}

n_iter_search = 20
random_search = RandomizedSearchCV(reg, param_distributions=param_dist, verbose=2, n_jobs=-1)

In [None]:
from time import time

start = time()
random_search.fit(X_train_pca, y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] penalty=l1, C=0.7 ...............................................
[CV] penalty=l1, C=0.7 ...............................................
[CV] penalty=l1, C=0.7 ...............................................
[CV] penalty=l1, C=0.4 ...............................................


In [9]:
% % time
reg.fit(X_train_pca, y_train)

[LibLinear]CPU times: user 1h 55min 11s, sys: 1min 21s, total: 1h 56min 32s
Wall time: 2h 1min 36s


LogisticRegression(C=0.1, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=50,
          multi_class='ovr', n_jobs=-1, penalty='l2', random_state=42,
          solver='liblinear', tol=0.0001, verbose=2, warm_start=False)

In [11]:
accuracy(y_train, reg.predict(X_train_pca))
accuracy(y_test, reg.predict(X_test_pca))

accuracy: 0.9669931989767268
accuracy: 0.08471615720524017


In [1]:
def accuracy(y, y_pred):
    print('accuracy: {}'.format(accuracy_score(y, y_pred)))

### Making pipeline

In [None]:
pca = PCA(n_components=50)
svc = LogisticRegression(random_state=42, C=0.1, max_iter=50, verbose=2, n_jobs=-1, class_weight='balanced')
scaler = StandardScaler()

model = Pipeline((('scaler', scaler), ('dim_reduction', pca), ('classifier', svc)))

In [None]:
% % time
model.fit(X_train, y_train)

### Load validation transformed set

In [12]:
% % time

train_df = pd.read_csv('data/data_analys.csv', index_col=0)
dir_test_transformed = 'data/test_transformed'

X_val = np.empty(shape=(img_qty, img_shape_flattened), dtype=np.int8)

# read images
for i, f_name in enumerate(os.listdir(dir_test_transformed)):
    if i % 2000 == 0:
        print('{} images loaded'.format(i))

    img_path = os.path.join(dir_test_transformed, f_name)
    X_val[i, :] = misc.imread(img_path).flatten('C')  # since img is np.ndarray, flatten in row-style

0 images loaded
2000 images loaded
4000 images loaded
6000 images loaded
CPU times: user 3.6 s, sys: 1.33 s, total: 4.94 s
Wall time: 5.87 s


### Normalization and dimensionality reduction for validation set

In [None]:
X_val_sc = X_val / 255
X_val_pca = pca.transform(X_val_sc)

In [16]:
% % time

y_pred = reg.predict(X_val_pca)

sub = pd.DataFrame(
    data=[row for row in zip(os.listdir(dir_test_transformed), y_pred.astype(int).tolist())],
    columns=['image', 'class']
)
sub.to_csv('lr_baseline.csv', encoding='utf-8', index=False)

CPU times: user 2min 8s, sys: 1min 4s, total: 3min 13s
Wall time: 2min 39s
