# Extract features, retrain Sherlock and generate predictions.

The script below first downloads the data (roughly 700K samples), then extracts features from the raw data values. <br>
If you want to skip this step, you can follow the steps below the feature extraction to load the preprocessed data, 
retrain Sherlock and generate predictions.

In [1]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import tensorflow as tf

from sherlock import helpers
from sherlock.features.preprocessing import extract_features, convert_string_lists_to_lists, prepare_feature_extraction 
from sherlock.features.preprocessing import extract_features_chars, extract_features_embed, extract_features_words, extract_features_paras,extract_features_multi_thread
from sherlock.deploy.train_sherlock import train_sherlock
from sherlock.deploy.predict_sherlock import predict_sherlock

## Read in raw data
You can skip this step if you want to use a preprocessed data file.

In [2]:
# Load the raw training split: the column values and their type labels.
raw_train_files = (
    '../data/data/raw/train_values.parquet',
    '../data/data/raw/train_labels.parquet',
)
train_samples, train_labels = map(pd.read_parquet, raw_train_files)
# Report how many labelled rows the training split holds.
print(train_labels.shape[0])

412059


In [3]:
# Load the raw validation split: the column values and their type labels.
raw_validation_files = (
    '../data/data/raw/val_values.parquet',
    '../data/data/raw/val_labels.parquet',
)
validation_samples, validation_labels = map(pd.read_parquet, raw_validation_files)
# Report how many labelled rows the validation split holds.
print(validation_labels.shape[0])

137353


In [4]:
# Load the raw test split: the column values and their type labels.
raw_test_files = (
    '../data/data/raw/test_values.parquet',
    '../data/data/raw/test_labels.parquet',
)
test_samples, test_labels = map(pd.read_parquet, raw_test_files)
# Report how many labelled rows the test split holds.
print(test_labels.shape[0])

137353


In [5]:
# Preview the first rows of the raw test samples; each row still holds its
# values as the string representation of a list.
test_samples.head()

Unnamed: 0,values
20368,"['Central Missouri', 'unattached', 'unattached..."
664102,"[95, 100, 95, 89, 84, 91, 88, 94, 75, 78, 90, ..."
366813,"['Katie Crews', 'Christian Hiraldo', 'Alex Est..."
530567,"['Christian', 'Non-Christian', 'Unreported', '..."
176253,"['AAF-McQuay Canada Inc.', 'AAF-McQuay Canada ..."


In [6]:
# Preview the first rows of the test labels (one `type` value per sample).
test_labels.head()

Unnamed: 0,type
20368,affiliation
664102,weight
366813,jockey
530567,religion
176253,company


## Extract features
It is important that the string-representations of lists are first converted into lists of strings.
The labels should be a list of semantic types.

In [7]:
# Took about 3 minutes 38 seconds when this cell was last run.
def _convert_first_rows(samples, labels, n_rows):
    """Run convert_string_lists_to_lists on the first `n_rows` samples and labels."""
    return convert_string_lists_to_lists(samples.head(n_rows), labels.head(n_rows), "values", "type")


# Only a leading subset of each split is converted: 80000 / 10000 / 20000 rows.
train_samples_converted, y_train = _convert_first_rows(train_samples, train_labels, 80000)
val_samples_converted, y_val = _convert_first_rows(validation_samples, validation_labels, 10000)
test_samples_converted, y_test = _convert_first_rows(test_samples, test_labels, 20000)

100%|███████████████████████████████████████████████████████████████████████████| 80000/80000 [01:36<00:00, 825.37it/s]
100%|███████████████████████████████████████████████████████████████████████████| 20000/20000 [01:03<00:00, 313.94it/s]


In [8]:
# Preview the converted test samples; the values are now lists rather than strings.
test_samples_converted.head()

20368     [Central Missouri, unattached, unattached, Kan...
664102    [95, 100, 95, 89, 84, 91, 88, 94, 75, 78, 90, ...
366813    [Katie Crews, Christian Hiraldo, Alex Estrada,...
530567    [Christian, Non-Christian, Unreported, Jewish,...
176253    [AAF-McQuay Canada Inc., AAF-McQuay Canada Inc...
Name: values, dtype: object

In [9]:
# Number of converted test samples (matches the 20000-row subset taken above).
len(test_samples_converted)

20000

In [10]:
# Each sample is a list of strings
type(test_samples_converted.iloc[0])

list

In [11]:
# Number of values held by each converted test sample.
test_samples_len = list(map(len, test_samples_converted))
# Print the longest and the shortest sample length.
longest_sample, shortest_sample = np.max(test_samples_len), np.min(test_samples_len)
print(f"max len smaple:{longest_sample}")
print(f"min len smaple:{shortest_sample}")

max len smaple:14641819
min len smaple:1


In [12]:
# Show the first two converted test samples.
test_samples_converted[:2]

20368     [Central Missouri, unattached, unattached, Kan...
664102    [95, 100, 95, 89, 84, 91, 88, 94, 75, 78, 90, ...
Name: values, dtype: object

## Extract features-2

In [None]:
%%time
X_train = extract_features(train_samples_converted.head(80000))
#X_val = extract_features(val_samples_converted.head(10000))
X_test = extract_features(test_samples_converted.head(20000))

Preparing feature extraction by downloading 2 files:
        
 ../sherlock/features/glove.6B.50d.txt and 
 ../sherlock/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy.
        
All files for extracting word and paragraph embeddings are present.


  regex = re.compile(pat, flags=flags)


Extracting features for data column: 100
Extracting features for data column: 200
Extracting features for data column: 300
Extracting features for data column: 400
Extracting features for data column: 500
Extracting features for data column: 600
Extracting features for data column: 700
Extracting features for data column: 800
Extracting features for data column: 900
Extracting features for data column: 1000
Extracting features for data column: 1100
Extracting features for data column: 1200
Extracting features for data column: 1300
Extracting features for data column: 1400
Extracting features for data column: 1500
Extracting features for data column: 1600
Extracting features for data column: 1700
Extracting features for data column: 1800
Extracting features for data column: 1900
Extracting features for data column: 2000
Extracting features for data column: 2100
Extracting features for data column: 2200
Extracting features for data column: 2300
Extracting features for data column: 2400
E

In [None]:
#%%time
#_ = extract_features_chars(test_samples_converted.head(100))

#_ = extract_features_words(test_samples_converted.head(100))

#_ = extract_features_paras(test_samples_converted.head(100))

#_ = extract_features_embed(test_samples_converted.head(100))

In [None]:
#list(X_test.keys())[:960] # character distribution(960); slice end is exclusive, so indices 0-959

In [None]:
#list(X_test.keys())[960:1161] # word embedding features(201)

In [None]:
# Names of the 27 global-statistic feature columns (positions 1161 to 1187).
X_test.columns[1161:1188].tolist()

In [None]:
#list(X_test.keys())[1188:1588] # paragraph vector(400)

In [None]:
# Preview the first rows of the extracted test features.
X_test.head()

In [None]:
# (rows, columns) of the extracted test feature matrix.
X_test.shape

## Impute NaN values with feature means

In [None]:
# One-row frame holding the mean of every training feature column; the
# training means are reused below to fill NaNs in all splits.
train_columns_means = X_train.mean().to_frame().T

In [None]:
# The single row of training-column means, used as the fill value per column.
fill_values = train_columns_means.iloc[0]

# Fill missing feature values in place, always with the *training* means.
X_train.fillna(value=fill_values, inplace=True)
X_val.fillna(value=fill_values, inplace=True)
X_test.fillna(value=fill_values, inplace=True)

## Retrain sherlock
The model can be retrained using the code below. The model is currently restricted to training on 78 classes; the code of the model architecture will soon be added so that this can be adjusted.

In [None]:
# Retrain the model; nn_id='retrained_sherlock' is the identifier given to it.
# NOTE(review): the test split (X_test, y_test) is passed as the second pair of
# arguments, while y_val from the conversion step is never used in this
# notebook. Confirm whether the validation split was intended here.
train_sherlock(X_train, y_train, X_test, y_test, nn_id='retrained_sherlock');
print('Trained and saved new model.')

## Generate predictions with a model
If you want to use the pretrained Sherlock model, set `nn_id` to "sherlock".

If you want to use another model, you can use the identifier corresponding to that model.

**Note**: There is a bug somewhere in the refactored code which affects the model predictions; this should be fixed soon.

In [None]:
# Predict a label for every row of the extracted test features, using the
# model identified by nn_id='sherlock'.
predicted_labels = predict_sherlock(X_test, nn_id='sherlock')

In [None]:
# Display the predictions.
predicted_labels

In [None]:
# Show the first 25 true test labels for a quick comparison with the
# predictions. `y_test_subset` is never defined in this notebook, so the
# labels are taken from `y_test` directly.
y_test[:25]

In [None]:
# Should be fully deterministic too.
# Weighted F1 over the whole extracted test subset: `y_test` and
# `predicted_labels` both come from the same 20000 test rows, so no slicing
# is needed (the former `y_test_subset` / `[:25]` pair was never defined here).
f1_score(y_test, predicted_labels, average='weighted')

In [None]:
# Displays the f1_score function object itself; no score is computed here.
f1_score

In [None]:
# Number of distinct labels among the predictions.
pd.Series(predicted_labels).nunique()

In [None]:
# Number of distinct labels among the true test labels. Uses `y_test`, since
# `y_test_subset` is never defined in this notebook.
pd.Series(y_test).nunique()

## Generate predictions with preprocessed data using Sherlock

Requires the data to be downloaded from Google Drive (see first step in notebook).

In [None]:
# Load the preprocessed test features.
X_test_preprocessed = pd.read_parquet("../data/data/processed/X_test.parquet")
# Load the matching labels, then discard their stored index so rows are
# numbered from 0.
y_test_preprocessed = pd.read_parquet("../data/data/processed/y_test.parquet")
y_test_preprocessed = y_test_preprocessed.reset_index(drop=True)

In [None]:
# Preview the first rows of the preprocessed test features.
X_test_preprocessed.head()

In [None]:
# Preview the first rows of the preprocessed test labels.
y_test_preprocessed.head()

In [None]:
# Predict labels for the preprocessed test features with the 'sherlock' model.
# Note: this rebinds `predicted_labels`, replacing the predictions made earlier
# on the freshly extracted features.
predicted_labels = predict_sherlock(X_test_preprocessed, 'sherlock')

In [None]:
# Weighted F1 of the predictions against the preprocessed test labels.
f1_score(y_test_preprocessed, predicted_labels, average='weighted')

In [None]:
# Number of distinct labels among the predictions on the preprocessed data.
pd.Series(predicted_labels).nunique()

In [None]:
# Number of distinct labels in the preprocessed test set, counted per column.
# Reads the labels loaded in this section instead of `y_test` from the raw
# pipeline, so the count matches the data that was scored and the cell still
# runs when the raw-data feature extraction was skipped.
y_test_preprocessed.nunique()

## How to prepare our data for Sherlock?