# Extract features, retrain Sherlock and generate predictions.

The script below first downloads the data (roughly 700K samples), then extracts features from the raw data values. <br>
If you want to skip this step, you can follow the steps below the feature extraction to load the preprocessed data, 
retrain Sherlock and generate predictions.

In [1]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import tensorflow as tf

from sherlock import helpers
from sherlock.features.preprocessing import extract_features, convert_string_lists_to_lists, prepare_feature_extraction
from sherlock.deploy.train_sherlock import train_sherlock
from sherlock.deploy.predict_sherlock import predict_sherlock

## Download data
This will download the raw values and preprocessed files, the corresponding labels as well as a few other supporting files:
- `download_data()` will download 3.6GB of data into the `data/` directory.
- `prepare_feature_extraction()` will download +/- 800 MB of data into the `features/` directory.

In [2]:
# Fetch the raw and preprocessed data (about 3.6GB, into the data/ directory).
helpers.download_data()
# Fetch the supporting files needed for word and paragraph embedding features (roughly 800 MB).
prepare_feature_extraction()

Downloading the raw and preprocessed data into ../data/data.zip.
Data was downloaded.
Preparing feature extraction by downloading 2 files:
        
 ../sherlock/features/glove.6B.50d.txt and 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy.
        
All files for extracting word and paragraph embeddings are present.


In [3]:
# NOTE(review): error log kept for reference. Presumably raised by the download step when
# SSL certificate verification fails (e.g. behind an intercepting proxy) -- confirm.
#SSLError: HTTPSConnectionPool(host='docs.google.com', port=443): 
#Max retries exceeded with url: /uc?export=download&id=1kayd5oNRQm8-NCvA8pIrtezbQ-B1_Vmk 
#(Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1045)')))

## Read in raw data
You can skip this step if you want to use a preprocessed data file.

In [4]:
# Load the raw training columns together with their semantic-type labels.
train_samples, train_labels = (
    pd.read_parquet('../data/data/raw/train_values.parquet'),
    pd.read_parquet('../data/data/raw/train_labels.parquet'),
)
# Report how many labelled training columns were loaded.
print(train_labels.shape[0])

412059


In [5]:
# Load the raw validation columns together with their semantic-type labels.
validation_samples, validation_labels = (
    pd.read_parquet('../data/data/raw/val_values.parquet'),
    pd.read_parquet('../data/data/raw/val_labels.parquet'),
)
# Report how many labelled validation columns were loaded.
print(validation_labels.shape[0])

137353


In [6]:
# Load the raw test columns together with their semantic-type labels.
test_samples, test_labels = (
    pd.read_parquet('../data/data/raw/test_values.parquet'),
    pd.read_parquet('../data/data/raw/test_labels.parquet'),
)
# Report how many labelled test columns were loaded.
print(test_labels.shape[0])

137353


In [7]:
# Peek at the first raw test rows; at this point each entry is still a string-encoded list.
test_samples.head()

Unnamed: 0,values
20368,"['Central Missouri', 'unattached', 'unattached..."
664102,"[95, 100, 95, 89, 84, 91, 88, 94, 75, 78, 90, ..."
366813,"['Katie Crews', 'Christian Hiraldo', 'Alex Est..."
530567,"['Christian', 'Non-Christian', 'Unreported', '..."
176253,"['AAF-McQuay Canada Inc.', 'AAF-McQuay Canada ..."


In [8]:
# Peek at the semantic-type labels that belong to the test rows.
test_labels.head()

Unnamed: 0,type
20368,affiliation
664102,weight
366813,jockey
530567,religion
176253,company


In [9]:
# Selecting a single row of the DataFrame yields a pandas Series, not the raw cell value.
type(test_samples.iloc[0])

pandas.core.series.Series

## Extract features
It is important that the string-representations of lists are first converted into lists of strings.
The labels should be a list of semantic types.

In [10]:
# Takes roughly 3 minutes 38 seconds on the full test set.
# Parses the "values" column into real lists and returns the "type" column as y_test.
test_samples_converted, y_test = convert_string_lists_to_lists(test_samples, test_labels, "values", "type")

100%|█████████████████████████████████████████████████████████████████████████| 137353/137353 [03:39<00:00, 625.55it/s]


In [11]:
# After conversion the entries are lists instead of their string representation.
test_samples_converted.head()

20368     [Central Missouri, unattached, unattached, Kan...
664102    [95, 100, 95, 89, 84, 91, 88, 94, 75, 78, 90, ...
366813    [Katie Crews, Christian Hiraldo, Alex Estrada,...
530567    [Christian, Non-Christian, Unreported, Jewish,...
176253    [AAF-McQuay Canada Inc., AAF-McQuay Canada Inc...
Name: values, dtype: object

In [12]:
# Sanity check: the number of converted samples should match the number of test labels.
len(test_samples_converted)

137353

In [13]:
# Each sample is a list of strings.
type(test_samples_converted.iloc[0])

list

In [None]:
# Number of values in every converted test sample. Iterating the Series directly
# avoids materialising an intermediate list copy first.
test_samples_len = [len(sample) for sample in test_samples_converted]
# Report the longest and the shortest sample (output labels were misspelled "smaple").
print(f"max len sample:{np.max(test_samples_len)}")
print(f"min len sample:{np.min(test_samples_len)}")

### Given that feature extraction can take a long time, we only work with a small subset of the samples (the labels of the first 25 are kept here).

In [15]:
# Keep the labels of the first 25 test samples; they are compared against predictions later on.
y_test_subset = y_test[:25]

In [16]:
# Show the first ten labels of the subset.
print(y_test_subset[:10])

['affiliation', 'weight', 'jockey', 'religion', 'company', 'grades', 'area', 'component', 'company', 'manufacturer']


### What is Doc2Vec?

In [17]:
# Load the downloaded paragraph-embedding document vectors to look at the raw array.
np.load("../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy")

array([[-0.00870995, -0.00090878, -0.01173229, ..., -0.01383935,
         0.01876198, -0.08971333],
       [-0.05415047, -0.02325091, -0.05282537, ...,  0.00680359,
        -0.05299571, -0.08228445],
       [-0.17294699,  0.08445425, -0.13954757, ..., -0.14642408,
         0.01981621,  0.00363814],
       ...,
       [ 0.159118  , -0.120624  , -0.01007248, ..., -0.03977996,
         0.06222615, -0.20168892],
       [-0.2347014 ,  0.05774752, -0.06889017, ..., -0.05309976,
        -0.13383879, -0.07816506],
       [-0.01524658, -0.06477965, -0.00332214, ..., -0.03715399,
         0.04627442, -0.10461713]], dtype=float32)

## Extract features for a subset of the test set

In [21]:
# Extract features for the first 1500 converted test samples only.
X_test = extract_features(test_samples_converted.head(n=1500))
# If the automatic download fails, go ahead and download the files manually from the website.
# TODO: how to work around the gensim model having no `neg_label` attribute?

Preparing feature extraction by downloading 2 files:
        
 ../sherlock/features/glove.6B.50d.txt and 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy.
        
All files for extracting word and paragraph embeddings are present.
Extracting features for data column: 100
Extracting features for data column: 200


KeyboardInterrupt: 

In [32]:
# Inspect the first rows of the extracted feature matrix.
X_test.head()

Unnamed: 0,n_[0]-agg-any,n_[0]-agg-all,n_[0]-agg-mean,n_[0]-agg-var,n_[0]-agg-min,n_[0]-agg-max,n_[0]-agg-median,n_[0]-agg-sum,n_[0]-agg-kurtosis,n_[0]-agg-skewness,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,False,False,0.0,0.0,0,0,0.0,0,-3.0,0.0,...,0.000948,-0.000673,0.001054,-0.001001,0.000216,-0.0008,-0.000337,-7.1e-05,-0.001136,0.000982
1,True,False,0.285714,0.204082,0,1,0.0,2,-1.1,0.948683,...,-0.000106,0.001087,0.000957,-0.000543,-0.000262,-0.000594,-0.000299,0.000577,0.000869,0.00115
2,False,False,0.0,0.0,0,0,0.0,0,-3.0,0.0,...,-0.000602,-0.000142,0.000751,0.000775,-0.001135,0.000224,0.001165,5.1e-05,-0.000282,-5e-05
3,False,False,0.0,0.0,0,0,0.0,0,-3.0,0.0,...,0.000452,0.00122,0.000133,-0.000771,-0.000916,-0.000937,-0.000586,0.000505,3.2e-05,-0.000184
4,False,False,0.0,0.0,0,0,0.0,0,-3.0,0.0,...,-0.000535,1.4e-05,-0.000896,0.00042,-0.000129,-0.000553,-0.000922,-8.8e-05,-0.000957,8.2e-05


In [33]:
# (number of samples, number of feature columns) of the extracted feature matrix.
X_test.shape

(1500, 1588)

## Impute NaN values with feature means

In [34]:
# No training features were extracted in this notebook, so the column means used
# for NaN imputation are computed from the test features themselves. The result is
# a one-row DataFrame (one column per feature); the variable keeps its original name.
train_columns_means = X_test.mean().to_frame().T

In [35]:
# The train/validation fills stay disabled: those frames are only referenced in these commented-out lines.
# X_train.fillna(train_columns_means.iloc[0], inplace=True)
# X_validation.fillna(train_columns_means.iloc[0], inplace=True)
# Replace NaNs in place, column by column, with the means held in the first row of train_columns_means.
X_test.fillna(train_columns_means.iloc[0], inplace=True)

## Retrain sherlock
The model can be retrained using the code below. The model is currently restricted to being trained on 78 classes; the code of the model architecture will soon be added so that this can be adjusted.

In [36]:
# NOTE(review): the same 1500 test rows are passed as both the training and the validation set,
# so the validation metrics cannot indicate generalisation -- confirm this is only a smoke test.
# The trailing semicolon suppresses the notebook's display of the call's return value.
train_sherlock(X_test, y_test[:1500], X_test, y_test[:1500], nn_id='retrained_sherlock');
print('Trained and saved new model.')

Successfully loaded and compiled model, now fitting model on data.
Train on 1500 samples, validate on 1500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/1

## Generate predictions with a model
If you want to use the pretrained Sherlock model, set `nn_id` to "sherlock".

If you want to use another model, you can use the identifier corresponding to that model.

**Note**: There is a bug somewhere in the refactored code which affects the model predictions; this should be fixed soon.

In [37]:
# Predict semantic types for the extracted features with the pretrained model (nn_id='sherlock').
predicted_labels = predict_sherlock(X_test, nn_id='sherlock')

In [38]:
# Display the predicted semantic types (numpy truncates the repr of long arrays).
predicted_labels

array(['team Name', 'depth', 'jockey', ..., 'age', 'status', 'rank'],
      dtype=object)

In [39]:
# Ground-truth labels of the first 25 test samples, for eyeballing against the predictions.
y_test_subset

['affiliation',
 'weight',
 'jockey',
 'religion',
 'company',
 'grades',
 'area',
 'component',
 'company',
 'manufacturer',
 'weight',
 'genre',
 'album',
 'origin',
 'description',
 'status',
 'credit',
 'team Name',
 'artist',
 'address',
 'age',
 'album',
 'club',
 'description',
 'family']

In [43]:
# Weighted F1 on the labelled subset. The slice length is derived from
# y_test_subset instead of repeating the literal 25, so the two stay aligned
# if the subset size is changed.
# Should be fully deterministic too.
f1_score(y_test_subset, predicted_labels[:len(y_test_subset)], average='weighted')

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


0.5199999999999999

In [41]:
# Display f1_score's signature to check its parameters and defaults.
f1_score

<function sklearn.metrics.classification.f1_score(y_true, y_pred, labels=None, pos_label=1, average='binary', sample_weight=None)>

In [44]:
# Number of distinct semantic types among the predictions.
pd.Series(predicted_labels).nunique()

69

In [45]:
# Number of distinct semantic types among the ground-truth subset.
pd.Series(y_test_subset).nunique()

21

## Generate predictions with preprocessed data using Sherlock

Requires the data to be downloaded from Google Drive (see first step in notebook).

In [46]:
# Load the already-extracted test features and their labels from the downloaded data.
X_test_preprocessed = pd.read_parquet("../data/data/processed/X_test.parquet")
# reset_index(drop=True) discards the stored index and replaces it with 0..n-1.
y_test_preprocessed = pd.read_parquet("../data/data/processed/y_test.parquet").reset_index(drop=True)

In [47]:
# Inspect the first rows of the preprocessed feature matrix.
X_test_preprocessed.head()

Unnamed: 0,col_entropy,frac_unique,frac_numcells,frac_textcells,avg_num_cells,std_num_cells,avg_text_cells,std_text_cells,avg_spec_cells,std_spec_cells,...,par_vec_390,par_vec_391,par_vec_392,par_vec_393,par_vec_394,par_vec_395,par_vec_396,par_vec_397,par_vec_398,par_vec_399
0,2.122181,0.005,0.0,1.0,0.0,0.0,12.29,5.077194,0.0,0.0,...,0.023563,-0.029472,0.002835,0.090851,-0.125505,-0.027747,0.028412,-0.078901,0.054292,-0.049115
1,3.817487,0.015,1.0,0.0,2.058,0.233743,0.0,0.0,0.0,0.0,...,0.244085,-0.055574,0.0176,0.079978,-0.014825,0.006086,0.121871,-0.078689,-0.069111,-0.11255
2,3.166061,0.009,0.12,1.0,0.12,0.324962,11.527,2.68873,0.0,0.0,...,0.018266,-0.088117,-0.048036,-0.011286,-0.109643,-0.070223,-0.009666,-0.081991,-0.041528,-0.094458
3,2.316887,0.005,0.0,1.0,0.0,0.0,9.053,1.960151,0.0,0.0,...,-0.063415,-0.000197,0.01202,-0.033859,0.063092,0.075499,-0.009511,-0.070606,0.061907,0.065065
4,6.955528,0.163,0.018,1.0,0.072,0.531804,20.268,9.593132,0.0,0.0,...,0.015399,-0.213604,0.0291,-0.009626,-0.154028,-0.09047,-0.01395,0.036592,-0.139673,-0.11543


In [48]:
# Inspect the first preprocessed labels.
y_test_preprocessed.head()

Unnamed: 0,label
0,affiliation
1,weight
2,jockey
3,religion
4,company


In [51]:
# Run the pretrained model on the preprocessed test features; note that this
# rebinds predicted_labels, replacing the predictions made on X_test.
predicted_labels = predict_sherlock(X_test_preprocessed, nn_id='sherlock')

In [52]:
# Weighted F1 over the whole preprocessed test set.
f1_score(y_test_preprocessed, predicted_labels, average='weighted')

0.8855186356849649

In [53]:
# Number of distinct semantic types predicted on the preprocessed test set.
pd.Series(predicted_labels).nunique()

78

In [55]:
# Count the distinct ground-truth labels of the preprocessed test set. Using
# y_test_preprocessed keeps this section runnable when the raw feature-extraction
# cells (which define y_test) were skipped.
y_test_preprocessed["label"].nunique()

78

## How to prepare our data for Sherlock?