# Create meta-features
I suspect this might be the most practical use of these data sets.  I get the impression that the importance of these sets is roughly  numeric>timestamp>categorical.  But it would be nice to actually confirm this.  I suspect creating a meta feature of each data set and trying to combine them will give me some idea of which sets will perform better this way.

## Method
For each individual data set (categorical, date), I'll try to find a reasonably good model, create a feature from the predicted output.  Then save the output, and pickle model for later use.

### Baseline
I'm trying to be better than a guess, so uniform baseline for me.

### Basic model
Compare a few basic linear models to see which works best with minimal tuning.

### Permutation Importance
After running that basic model, use permutation to figure out which features are the most important, and to build a model with only features that positively impact my model.

### Rebuild model
Run the model with the most important features, try some gentle hyper parameter tuning.

### Predict the entire training set
predict the whole dataset with chunking, and save the output.

### Save
Pickle the model, so it can be used on test sets and other notebooks. Save the predictions and possibly the probabilities as a separate dataframe to be merged with my main dataset in other notebooks.

## Evaluation
I'm evaluating for precision, based off the confusion matrix in my main notebook. I'm also keeping an eye on MCC, as that is the metric used by Bosch to evaluate their model.

In [1]:
import pandas as pd

folder = 'bosch-production-line-performance/'

response = pd.read_csv(folder + 'train_numeric.csv', usecols = ['Response'], 
               squeeze = True)

date_iter = pd.read_csv(folder + 'train_date.csv', iterator = True, 
                            chunksize = 1000)

In [2]:
print(response.shape)
response.head()

(1183747,)


0    0
1    0
2    0
3    0
4    0
Name: Response, dtype: int64

In [3]:
#through experimentation I've found the following size to work without killing
#my kernel.  The range strictly comes from trying and failing until I found
#a happy middle ground.

chunks = []

for i in range(300):
    chunks.append(date_iter.get_chunk())
    
date = pd.concat(chunks, ignore_index = True)

print(date.shape)
date.head()

(300000, 1157)


Unnamed: 0,Id,L0_S0_D1,L0_S0_D3,L0_S0_D5,L0_S0_D7,L0_S0_D9,L0_S0_D11,L0_S0_D13,L0_S0_D15,L0_S0_D17,...,L3_S50_D4246,L3_S50_D4248,L3_S50_D4250,L3_S50_D4252,L3_S50_D4254,L3_S51_D4255,L3_S51_D4257,L3_S51_D4259,L3_S51_D4261,L3_S51_D4263
0,4,82.24,82.24,82.24,82.24,82.24,82.24,82.24,82.24,82.24,...,,,,,,,,,,
1,6,,,,,,,,,,...,,,,,,,,,,
2,7,1618.7,1618.7,1618.7,1618.7,1618.7,1618.7,1618.7,1618.7,1618.7,...,,,,,,,,,,
3,9,1149.2,1149.2,1149.2,1149.2,1149.2,1149.2,1149.2,1149.2,1149.2,...,,,,,,,,,,
4,11,602.64,602.64,602.64,602.64,602.64,602.64,602.64,602.64,602.64,...,,,,,,,,,,


In [4]:
date.describe()

Unnamed: 0,Id,L0_S0_D1,L0_S0_D3,L0_S0_D5,L0_S0_D7,L0_S0_D9,L0_S0_D11,L0_S0_D13,L0_S0_D15,L0_S0_D17,...,L3_S50_D4246,L3_S50_D4248,L3_S50_D4250,L3_S50_D4252,L3_S50_D4254,L3_S51_D4255,L3_S51_D4257,L3_S51_D4259,L3_S51_D4261,L3_S51_D4263
count,300000.0,173237.0,173237.0,173237.0,173237.0,173237.0,173237.0,173237.0,173237.0,173237.0,...,7712.0,7712.0,7712.0,7712.0,7712.0,15232.0,15232.0,15232.0,15232.0,15232.0
mean,300028.38943,877.045789,877.045789,877.045789,877.045789,877.045789,877.045789,877.045789,877.045789,877.045789,...,1028.82354,1028.82354,1028.82354,1028.82354,1028.82354,1032.643642,1032.643642,1032.643642,1032.643642,1032.643642
std,173184.607155,504.865347,504.865347,504.865347,504.865347,504.865347,504.865347,504.865347,504.865347,504.865347,...,427.24779,427.24779,427.24779,427.24779,427.24779,428.756326,428.756326,428.756326,428.756326,428.756326
min,4.0,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,...,1.32,1.32,1.32,1.32,1.32,1.38,1.38,1.38,1.38,1.38
25%,150115.75,392.48,392.48,392.48,392.48,392.48,392.48,392.48,392.48,392.48,...,558.15,558.15,558.15,558.15,558.15,558.16,558.16,558.16,558.16,558.16
50%,299820.0,904.83,904.83,904.83,904.83,904.83,904.83,904.83,904.83,904.83,...,1225.235,1225.235,1225.235,1225.235,1225.235,1291.26,1291.26,1291.26,1291.26,1291.26
75%,449836.5,1365.34,1365.34,1365.34,1365.34,1365.34,1365.34,1365.34,1365.34,1365.34,...,1399.3625,1399.3625,1399.3625,1399.3625,1399.3625,1408.41,1408.41,1408.41,1408.41,1408.41
max,600180.0,1713.71,1713.71,1713.71,1713.71,1713.71,1713.71,1713.71,1713.71,1713.71,...,1457.5,1457.5,1457.5,1457.5,1457.5,1457.5,1457.5,1457.5,1457.5,1457.5


In [5]:
#okay things look as I expected.  Everything is a float, so I should stick with
#linear models.
date.dtypes.value_counts()

float64    1156
int64         1
dtype: int64

In [6]:
#time to define X, y and our train test split
X = date.drop(columns = 'Id')
y = response.iloc[: X.shape[0]]

X.shape, y.shape

((300000, 1156), (300000,))

### Baseline

In [7]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import precision_score, matthews_corrcoef

base_model = DummyClassifier(strategy = 'uniform', random_state = 42)

base_model.fit(X, y)

y_pred_base = base_model.predict(X)

def metrics(y_true, y_pred):
    #We are setting zero division to 0, because a dataset this imbalance
    #is very likely generate warnings otherwise.
    print('Our Precision is: ', precision_score(y_true, y_pred, 
                                               zero_division = 0))
    print('Our MCC is: ', matthews_corrcoef(y_true, y_pred))
    
metrics(y, y_pred_base)

Our Precision is:  0.005613488353509929
Our MCC is:  -0.0004876871771416818


### Basic Model

In [8]:
#I'm going to use CV for my validation, but I still want to have
#test set for evaluating my final model.  Making sure to stratify
#because of how imbalanced my set is.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = .20,
    random_state = 42,
    shuffle = True,
    stratify = y
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((240000, 1156), (60000, 1156), (240000,), (60000,))

In [9]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

In [10]:
#Lest do a tried and true logistic regressor
log_pipe = make_pipeline(
    SimpleImputer(strategy = 'median'),
    StandardScaler(),
    LogisticRegression(n_jobs = -1, random_state = 42)
)

In [11]:
log_pipe.fit(X_train, y_train)

y_pred_log = log_pipe.predict(X_test)

metrics(y_test, y_pred_log)

Our Precision is:  0.0
Our MCC is:  0.0


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


In [12]:
#Oh wow, that is not a good sign.  My model didn't make a single correct
#prediction.  Lets take a look at these.
pd.Series(y_pred_log).value_counts()

0    60000
dtype: int64

In [13]:
y_test.value_counts()

0    59661
1      339
Name: Response, dtype: int64

In [14]:
#okay, yikes.  Lets see if we have any importance from these features.

### Permutation Importance

In [15]:
#gotta break apart my pipeline for this

import eli5
from eli5.sklearn import PermutationImportance

transformer = make_pipeline(
    SimpleImputer(strategy = 'median'),
    StandardScaler(),
)

model = LogisticRegression(n_jobs = -1, 
                       random_state = 42)

X_train_transformed = transformer.fit_transform(X_train)
X_test_transformed = transformer.transform(X_test)

model.fit(X_train_transformed, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=-1, penalty='l2', random_state=42,
                   solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

In [16]:
X_test_transformed.shape, X_test.shape, X_train_transformed.shape, X_train.shape

((60000, 1154), (60000, 1156), (240000, 1154), (240000, 1156))

In [17]:
#Now lets set up the permuter

permuter = PermutationImportance(
    model,
    scoring = 'precision',
    n_iter = 5,
    random_state = 42,
)

#can't get the warnings supressed.  Tried pass **kwargs but couldn't get them
#down.  I apologize for the scrolling.
permuter.fit(X_test_transformed, y_test)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

PermutationImportance(cv='prefit',
                      estimator=LogisticRegression(C=1.0, class_weight=None,
                                                   dual=False,
                                                   fit_intercept=True,
                                                   intercept_scaling=1,
                                                   l1_ratio=None, max_iter=100,
                                                   multi_class='auto',
                                                   n_jobs=-1, penalty='l2',
                                                   random_state=42,
                                                   solver='lbfgs', tol=0.0001,
                                                   verbose=0,
                                                   warm_start=False),
                      n_iter=5, random_state=42, refit=True,
                      scoring='precision')

In [18]:
#Simple imputer removes columns of entirely nans, so we lost some dimensionality
#that will mess up how our columns line up with the eli5.show_weights. So I 
#need to manually adjust for that
removed_features = []
for col in X_train.columns:
    if X_train[col].isnull().sum() == X_train.shape[0]:
        removed_features.append(col)
removed_features

['L1_S24_D1158', 'L3_S46_D4135']

In [19]:
#lets see if any features are predictive
feature_names = X_test.columns.tolist()

#getting the difference from total feature names and the removed ones
dif = []
for i in feature_names:
    if not i in removed_features:
        dif.append(i)
        
feature_names = dif 
                            
eli5.show_weights(
    permuter,
    top = None,
    feature_names = feature_names
)

  rel_weight = (abs(weight) / weight_range) ** 0.7


Weight,Feature
0  ± 0.0000,L3_S51_D4263
0  ± 0.0000,L1_S24_D1116
0  ± 0.0000,L1_S24_D1513
0  ± 0.0000,L1_S24_D1515
0  ± 0.0000,L1_S24_D1517
0  ± 0.0000,L1_S24_D1519
0  ± 0.0000,L1_S24_D1522
0  ± 0.0000,L1_S24_D1527
0  ± 0.0000,L1_S24_D1532
0  ± 0.0000,L1_S24_D1536


In [20]:
#well damn, my model is 100% not predictive.  Lets give it a quick show with
#XGBoost and see if I can get anyuthing out of it.

XGBC_pipe = make_pipeline(
    SimpleImputer(strategy = 'median'),
    StandardScaler(),
    XGBClassifier(objective = 'binary:logistic', booster = 'gblinear', 
                  n_estimators = 100, n_jobs = -1, random_state = 42, )
)

# fit_params = {
#     'xgbclassifier__early_stopping_rounds': 50
# }

XGBC_pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('simpleimputer',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='median',
                               verbose=0)),
                ('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('xgbclassifier',
                 XGBClassifier(base_score=0.5, booster='gblinear',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, learning_rate=0.1,
                               max_delta_step=0, max_depth=3,
                               min_child_weight=1, missing=None,
                               n_estimators=100, n_jobs=-1, nthread=None,
                               objective='binary:logistic', random_state=42,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                               

In [21]:
y_pred_XGBC = XGBC_pipe.predict(X_test)

metrics(y_test, y_pred_XGBC)

Our Precision is:  0.0
Our MCC is:  0.0


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


In [22]:
pd.Series(y_pred_XGBC).value_counts()

0    60000
dtype: int64

In [23]:
#Okay, linear models aren't really working out.  Chances are there is no signal
#in the date time data set lets throw a decision tree at it, and if we get
#nothing out of that, then call it good.

from xgboost import XGBRFClassifier

XGBRF = XGBRFClassifier(n_estimators = 300, n_jobs = -1, random_state = 42)

XGBRF.fit(X_train, y_train)

XGBRFClassifier(base_score=0.5, colsample_bylevel=1, colsample_bynode=0.8,
                colsample_bytree=1, gamma=0, learning_rate=1, max_delta_step=0,
                max_depth=3, min_child_weight=1, missing=None, n_estimators=300,
                n_jobs=-1, nthread=None, objective='binary:logistic',
                random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                seed=None, silent=None, subsample=0.8, verbosity=1)

In [24]:
y_pred_XGBRF = XGBRF.predict(X_test)

metrics(y_test, y_pred_XGBRF)

Our Precision is:  0.0
Our MCC is:  0.0


In [25]:
pd.Series(y_pred_XGBRF).value_counts()

0    60000
dtype: int64

## Conclusion
The date data set is not a predictive data set, and can't be used for building a meta feature based off that.  If I'm going to build a meta feature, I should
consider a different approach like PCA.

### PCA
Admittedly this is a long shot.  I can't have too many columns in my final dataset, so I'm really going to have to reduce this down to not overburden my
main set (the numeric set).

In [26]:
from sklearn.decomposition import PCA

pca = PCA(n_components = 40)

pca_data = pca.fit_transform(transformer.fit_transform(X))

print(pca_data.shape)

(300000, 40)


In [27]:
#I expect that I lost some info, but my goal is to get around 5%

lost = round((1 - sum(pca.explained_variance_ratio_)) * 100)

print(f'PCA results in {lost}% information loss.')

PCA results in 4.0% information loss.


### Run PCA on entire set

In [28]:
#okay so we have our pca object and our transformer objec already fit from
#early in the nb.  So lets just start over, chunk in, transform, and then
#concat the final product. Hopefully still have enough memory over head to
#pull this off.

date_iter = pd.read_csv(folder + 'train_date.csv', iterator = True, 
                            chunksize = 1000)

pca_chunks = []

for chunk in date_iter:
    
    dropped = chunk.drop(columns = 'Id')
    
    transformed = transformer.transform(dropped)
    
    reduced = pca.transform(transformed)
    
    cast = pd.DataFrame(reduced)
    
    pca_chunks.append(cast)
    
final = pd.concat(pca_chunks, ignore_index = True)

print(final.shape)

(1183747, 40)


In [29]:
#lets verify that length looks right
response.shape

(1183747,)

In [30]:
#sure does
final.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,-21.277703,-0.289303,-3.781839,2.924193,-0.247516,-2.712722,-5.437389,2.898192,-0.392224,-0.471372,...,-0.085715,0.215877,-0.067401,0.193341,-4.393336,1.733587,-1.559871,-0.218652,-0.126592,0.078238
1,15.437912,0.63699,1.034117,-3.457132,1.74432,-2.372314,-12.450526,8.762215,-1.112995,-0.43197,...,0.098552,0.099795,0.032712,-0.084232,2.459793,-1.107789,0.851111,0.117162,0.082846,-0.069254
2,24.148106,0.590774,1.884949,-4.37214,0.504915,1.386108,6.269827,-3.972378,0.591631,0.16997,...,-0.022926,0.250228,0.100459,-0.074634,4.466325,-1.761693,1.498853,0.208921,0.150736,-0.103819
3,8.464764,0.619026,0.083966,-1.807404,0.436423,0.084959,2.342196,-1.750583,0.295041,0.051257,...,-0.063144,-0.032562,-0.01702,0.055989,-1.486591,0.72694,-0.508969,-0.074943,-0.05229,0.027339
4,-9.745723,0.685201,-2.014617,1.182212,0.35839,-1.423569,-2.223115,0.822852,-0.049559,-0.093966,...,-0.04769,-0.253529,0.013794,-0.049225,1.406425,-0.462912,0.535388,0.069199,0.034055,-0.06362


In [31]:
#lets get some better column names before saving this.

new_cols = {}

for col in final.columns:
    new_cols[col] = 'pca_date_' + str(col)
    
new_cols

{0: 'pca_date_0',
 1: 'pca_date_1',
 2: 'pca_date_2',
 3: 'pca_date_3',
 4: 'pca_date_4',
 5: 'pca_date_5',
 6: 'pca_date_6',
 7: 'pca_date_7',
 8: 'pca_date_8',
 9: 'pca_date_9',
 10: 'pca_date_10',
 11: 'pca_date_11',
 12: 'pca_date_12',
 13: 'pca_date_13',
 14: 'pca_date_14',
 15: 'pca_date_15',
 16: 'pca_date_16',
 17: 'pca_date_17',
 18: 'pca_date_18',
 19: 'pca_date_19',
 20: 'pca_date_20',
 21: 'pca_date_21',
 22: 'pca_date_22',
 23: 'pca_date_23',
 24: 'pca_date_24',
 25: 'pca_date_25',
 26: 'pca_date_26',
 27: 'pca_date_27',
 28: 'pca_date_28',
 29: 'pca_date_29',
 30: 'pca_date_30',
 31: 'pca_date_31',
 32: 'pca_date_32',
 33: 'pca_date_33',
 34: 'pca_date_34',
 35: 'pca_date_35',
 36: 'pca_date_36',
 37: 'pca_date_37',
 38: 'pca_date_38',
 39: 'pca_date_39'}

In [32]:
final.rename(columns = new_cols, inplace = True)

final.head()

Unnamed: 0,pca_date_0,pca_date_1,pca_date_2,pca_date_3,pca_date_4,pca_date_5,pca_date_6,pca_date_7,pca_date_8,pca_date_9,...,pca_date_30,pca_date_31,pca_date_32,pca_date_33,pca_date_34,pca_date_35,pca_date_36,pca_date_37,pca_date_38,pca_date_39
0,-21.277703,-0.289303,-3.781839,2.924193,-0.247516,-2.712722,-5.437389,2.898192,-0.392224,-0.471372,...,-0.085715,0.215877,-0.067401,0.193341,-4.393336,1.733587,-1.559871,-0.218652,-0.126592,0.078238
1,15.437912,0.63699,1.034117,-3.457132,1.74432,-2.372314,-12.450526,8.762215,-1.112995,-0.43197,...,0.098552,0.099795,0.032712,-0.084232,2.459793,-1.107789,0.851111,0.117162,0.082846,-0.069254
2,24.148106,0.590774,1.884949,-4.37214,0.504915,1.386108,6.269827,-3.972378,0.591631,0.16997,...,-0.022926,0.250228,0.100459,-0.074634,4.466325,-1.761693,1.498853,0.208921,0.150736,-0.103819
3,8.464764,0.619026,0.083966,-1.807404,0.436423,0.084959,2.342196,-1.750583,0.295041,0.051257,...,-0.063144,-0.032562,-0.01702,0.055989,-1.486591,0.72694,-0.508969,-0.074943,-0.05229,0.027339
4,-9.745723,0.685201,-2.014617,1.182212,0.35839,-1.423569,-2.223115,0.822852,-0.049559,-0.093966,...,-0.04769,-0.253529,0.013794,-0.049225,1.406425,-0.462912,0.535388,0.069199,0.034055,-0.06362


### Save

In [33]:
#Since I couldn't find a predictive model, and the PCA reduction has already
#been processed for the whole dataset, then all I need to save is the final data
#set

final.to_csv('wrangled-sets/pca_date.csv', index = False)