# Extract features, retrain Sherlock and generate predictions.

The script below first downloads the data (roughly 700K samples), then extracts features from the raw data values. <br>
If you want to skip this step, you can follow the steps below the feature extraction to load the preprocessed data,
retrain Sherlock, and generate predictions.

In [1]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import tensorflow as tf

from sherlock import helpers
from sherlock.features.preprocessing import extract_features, convert_string_lists_to_lists, prepare_feature_extraction
from sherlock.deploy.train_sherlock import train_sherlock
from sherlock.deploy.predict_sherlock import predict_sherlock

## Download data
This will download the raw values and preprocessed files, the corresponding labels as well as a few other supporting files:
- `download_data()` will download 3.6GB of data into the `data/` directory.
- `prepare_feature_extraction()` will download roughly 800 MB of data into the `sherlock/features/` directory.

In [2]:
# Download the raw and preprocessed data (about 3.6GB) into the data directory.
helpers.download_data()
# Download the word- and paragraph-embedding files that feature extraction needs.
prepare_feature_extraction()

Downloading the raw and preprocessed data into ../data/data.zip.
Data was downloaded.
Preparing feature extraction by downloading 2 files:
        
 ../sherlock/features/glove.6B.50d.txt and 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy.
        
All files for extracting word and paragraph embeddings are present.


## Read in raw data
You can skip this step if you want to use a preprocessed data file.

In [3]:
# Load the raw training values together with their labels.
train_samples, train_labels = (
    pd.read_parquet('../data/data/raw/train_values.parquet'),
    pd.read_parquet('../data/data/raw/train_labels.parquet'),
)

In [4]:
# Load the raw validation values together with their labels.
validation_samples, validation_labels = (
    pd.read_parquet('../data/data/raw/val_values.parquet'),
    pd.read_parquet('../data/data/raw/val_labels.parquet'),
)

In [5]:
# Load the raw test values together with their labels.
test_samples, test_labels = (
    pd.read_parquet('../data/data/raw/test_values.parquet'),
    pd.read_parquet('../data/data/raw/test_labels.parquet'),
)

In [6]:
# Preview the first five raw validation columns (stringified value lists).
validation_samples.head(n=5)

Unnamed: 0,values
217858,"['Shelby', 'Knox', 'Washington', 'Davidson', '..."
166650,"['CDs, Adult Musical', 'CDs, Adult Musical', '..."
27611,"[20.42, 23.17, 24.5, 24.0, 21.92, 28.08, 25.5,..."
370305,"['Nick Scholfield', 'R Johnson', 'P J Brennan'..."
45377,"['', 'Tres Hombres (Expanded & Remastered)', '..."


In [7]:
# Preview the semantic type assigned to each of those validation columns.
validation_labels.head(n=5)

Unnamed: 0,type
217858,county
166650,collection
27611,age
370305,jockey
45377,album


## Extract features
It is important that the string-representations of lists are first converted into lists of strings.
The labels should be a list of semantic types.

In [17]:
# Turn the string representations in the "values" column into real lists and
# pull the matching semantic types out of the "type" column, once per split.
train_samples_converted, y_train = convert_string_lists_to_lists(
    train_samples, train_labels, "values", "type"
)
val_samples_converted, y_val = convert_string_lists_to_lists(
    validation_samples, validation_labels, "values", "type"
)
test_samples_converted, y_test = convert_string_lists_to_lists(
    test_samples, test_labels, "values", "type"
)

100%|█████████████████████████████████████████████████████████████████████████| 412059/412059 [15:22<00:00, 446.47it/s]
100%|█████████████████████████████████████████████████████████████████████████| 137353/137353 [06:47<00:00, 336.97it/s]
100%|█████████████████████████████████████████████████████████████████████████| 137353/137353 [08:39<00:00, 264.41it/s]


In [30]:
# Number of values in every converted column, per split.
train_samples_len = list(map(len, train_samples_converted))
val_samples_len = list(map(len, val_samples_converted))
test_samples_len = list(map(len, test_samples_converted))

In [31]:
# Position and length of the longest test column. The position is computed once
# and reused, rather than pasting the printed value back in as a literal, so the
# cell stays correct if the data changes.
longest_test_idx = int(np.argmax(test_samples_len))
print(longest_test_idx)
print(test_samples_len[longest_test_idx])

7732
14641819


In [34]:
# Position and length of the longest validation column. The position is computed
# once and reused, rather than pasting the printed value back in as a literal.
longest_val_idx = int(np.argmax(val_samples_len))
print(longest_val_idx)
print(val_samples_len[longest_val_idx])

18152
6550726


In [35]:
# Position and length of the longest training column. The position is computed
# once and reused, rather than pasting the printed value back in as a literal.
longest_train_idx = int(np.argmax(train_samples_len))
print(longest_train_idx)
print(train_samples_len[longest_train_idx])

341318
13401235


In [59]:
# Cross-check the length of the longest test column straight from the Series.
# Positional .iloc access avoids copying every column into a Python list just
# to read one element, and the index is derived instead of hard-coded.
longest_test_idx = int(np.argmax(test_samples_len))
len(test_samples_converted.iloc[longest_test_idx])

14641819

In [60]:
# Show the longest test column as a one-element Series (keeps its original index label).
# The slice is explicitly positional and the index is derived instead of hard-coded.
longest_test_idx = int(np.argmax(test_samples_len))
test_samples_converted.iloc[longest_test_idx:longest_test_idx + 1]

682460    [2008, 2009, 2010, 2011, 2012, 2013, 2014, 200...
Name: values, dtype: object

In [63]:
# Smoke-test feature extraction on the single longest training column.
# The position is derived from the lengths computed earlier instead of a pasted literal.
# NOTE(review): the result is stored in X_test although it comes from the training
# split; the name is kept because the next cell displays X_test and a later cell
# reassigns it with real test features.
longest_train_idx = int(np.argmax(train_samples_len))
X_test = extract_features(train_samples_converted[longest_train_idx:longest_train_idx + 1])

Preparing feature extraction by downloading 2 files:
        
 ../sherlock/features/glove.6B.50d.txt and 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy.
        
All files for extracting word and paragraph embeddings are present.


In [64]:
# Display the feature frame produced by the extraction call above.
X_test

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,False,False,0.0,0.0,0,0,0.0,0,-3.0,0.0,...,-6.5e-05,-0.000912,0.00048,-0.000834,0.001091,0.000473,-0.000122,-0.001036,0.000357,-0.000818


### Given that feature extraction can take a long time, we only use the first 100 samples.

In [11]:
# Keep only the first 100 test labels, matching the 100 columns featurised next.
y_test_subset = y_test[:100]

In [12]:
# Featurise only the first 100 converted test columns to keep the runtime manageable.
first_100_test_columns = test_samples_converted.head(n=100)
X_test = extract_features(first_100_test_columns)

Preparing feature extraction by downloading 2 files:
        
 ../sherlock/features/glove.6B.50d.txt and 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy.
        
All files for extracting word and paragraph embeddings are present.
Extracting features for data column: 100


In [13]:
# Preview the first five rows of the extracted test features.
X_test.head(n=5)

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,False,False,0.0,0.0,0,0,0.0,0,-3.0,0.0,...,-0.000405,-4.7e-05,-0.000982,-0.000275,-0.000606,-0.000695,0.000364,0.000866,0.000858,-0.000961
1,True,False,0.285714,0.204082,0,1,0.0,2,-1.1,0.948683,...,0.001197,0.000342,-0.000429,-0.000671,-0.000899,0.000895,0.000675,-0.000735,-1.3e-05,0.000956
2,False,False,0.0,0.0,0,0,0.0,0,-3.0,0.0,...,0.000756,-0.000835,-0.000991,-0.000187,-0.001149,-0.000839,9.8e-05,0.000365,-0.000567,0.00071
3,False,False,0.0,0.0,0,0,0.0,0,-3.0,0.0,...,-0.000206,-0.000713,4.2e-05,0.000897,-0.001172,2.4e-05,-0.000105,0.000822,4e-05,-0.000936
4,False,False,0.0,0.0,0,0,0.0,0,-3.0,0.0,...,-0.000446,-0.000987,-0.000234,0.000303,0.000408,-0.000387,-0.000341,0.001123,-0.000144,0.000734


## Impute NaN values with feature means

In [135]:
# train_columns_means = pd.DataFrame(X_train.mean()).transpose()

In [145]:
# Disabled imputation step: would fill NaNs in each split with train_columns_means.
# NOTE(review): train_columns_means, X_train and X_validation are not assigned by any
# live cell visible in this notebook, and the training cell uses the name X_val rather
# than X_validation. Reconcile the names before re-enabling.
# X_train.fillna(train_columns_means.iloc[0], inplace=True)
# X_validation.fillna(train_columns_means.iloc[0], inplace=True)
# X_test.fillna(train_columns_means.iloc[0], inplace=True)

## Retrain sherlock
The model can be retrained using the code below. The model is currently restricted to being trained on 78 classes; the code of the model architecture will be added soon so that this can be adjusted.

In [148]:
# X_train and X_val are not created by any cell above (only X_test is extracted),
# so on a fresh kernel this cell used to fail with a NameError. Build both feature
# frames here from the converted training and validation columns.
# NOTE(review): this featurises every training and validation column and will take
# a long time. The NaN-imputation step is commented out; confirm that
# train_sherlock tolerates missing feature values.
X_train = extract_features(train_samples_converted)
X_val = extract_features(val_samples_converted)

# Retrain and store the model under the identifier 'retrained_sherlock'.
train_sherlock(X_train, y_train, X_val, y_val, nn_id='retrained_sherlock');
print('Trained and saved new model.')

## Generate predictions with a model
If you want to use the pretrained Sherlock model, set `nn_id` to "sherlock".

If you want to use another model, you can use the identifier corresponding to that model.

**Note**: There is a bug somewhere in the refactored code that affects the model predictions; this should be fixed soon.

In [14]:
# Predict semantic types for the extracted test features with the model identified by 'sherlock'.
predicted_labels = predict_sherlock(X_test, nn_id='sherlock')

W1102 22:57:27.405504 4439883200 deprecation.py:506] From /Users/madelon/miniconda3/envs/sherlock-project/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W1102 22:57:27.407250 4439883200 deprecation.py:506] From /Users/madelon/miniconda3/envs/sherlock-project/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:97: calling Ones.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W1102 22:57:27.413321 4439883200 deprecation.py:506] From /Users/madelon/miniconda3/envs/sherlock-project/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py

In [15]:
# Weighted F1 of the predictions against the first 100 test labels.
# Should be fully deterministic too.
f1_score(y_test_subset, predicted_labels, average="weighted")

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


0.566047619047619

In [19]:
# Count the distinct semantic types the model predicted.
pd.Series(predicted_labels).nunique(dropna=True)

43

In [18]:
# Count the distinct semantic types present in the true labels of the subset.
pd.Series(y_test_subset).nunique(dropna=True)

45

## Generate predictions with preprocessed data using Sherlock

Requires the data to be downloaded from Google Drive (see first step in notebook).

In [36]:
# Load the already-extracted test features and their labels; the label frame's
# original index is discarded in favour of a fresh 0..n-1 index.
X_test_preprocessed = pd.read_parquet("../data/data/processed/X_test.parquet")
y_test_preprocessed = (
    pd.read_parquet("../data/data/processed/y_test.parquet")
    .reset_index(drop=True)
)

In [25]:
# Preview the first five rows of the preprocessed test features.
X_test_preprocessed.head(n=5)

Unnamed: 0,col_entropy,frac_unique,frac_numcells,frac_textcells,avg_num_cells,std_num_cells,avg_text_cells,std_text_cells,avg_spec_cells,std_spec_cells,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,2.122181,0.005,0.0,1.0,0.0,0.0,12.29,5.077194,0.0,0.0,...,0.023563,-0.029472,0.002835,0.090851,-0.125505,-0.027747,0.028412,-0.078901,0.054292,-0.049115
1,3.817487,0.015,1.0,0.0,2.058,0.233743,0.0,0.0,0.0,0.0,...,0.244085,-0.055574,0.0176,0.079978,-0.014825,0.006086,0.121871,-0.078689,-0.069111,-0.11255
2,3.166061,0.009,0.12,1.0,0.12,0.324962,11.527,2.68873,0.0,0.0,...,0.018266,-0.088117,-0.048036,-0.011286,-0.109643,-0.070223,-0.009666,-0.081991,-0.041528,-0.094458
3,2.316887,0.005,0.0,1.0,0.0,0.0,9.053,1.960151,0.0,0.0,...,-0.063415,-0.000197,0.01202,-0.033859,0.063092,0.075499,-0.009511,-0.070606,0.061907,0.065065
4,6.955528,0.163,0.018,1.0,0.072,0.531804,20.268,9.593132,0.0,0.0,...,0.015399,-0.213604,0.0291,-0.009626,-0.154028,-0.09047,-0.01395,0.036592,-0.139673,-0.11543


In [193]:
# Preview the first five preprocessed test labels.
y_test_preprocessed.head(n=5)

Unnamed: 0_level_0,label
index,Unnamed: 1_level_1
511600,affiliation
146358,weight
665579,jockey
148486,religion
3546,company


In [194]:
# Predict on the first 25 preprocessed test columns; the model identifier is
# passed by keyword, as in the other predict/train calls in this notebook.
first_25_preprocessed = X_test_preprocessed.head(n=25)
predicted_labels = predict_sherlock(first_25_preprocessed, nn_id='sherlock')

In [195]:
# Weighted F1 against the labels of the same first 25 preprocessed columns.
y_true_first_25 = y_test_preprocessed.head(n=25)
f1_score(y_true_first_25, predicted_labels, average='weighted')

0.8906666666666667