----

In [1]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
from sklearn import preprocessing
from scipy import stats

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline

from sklearn.metrics import matthews_corrcoef
from sklearn import tree

In [2]:
# settings
# Report numpy divide/invalid floating-point events as warnings.
np.seterr(divide='warn', invalid='warn')
# Use seaborn's white grid theme for all figures.
sns.set_style("whitegrid")
# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

-----

# Project: Bosch Manufacturing Line

__Author:__ [Ryan Harper](https://www.kimrharper.com) <br><br>
__Data Source:__ [Bosch Dataset via Kaggle](https://www.kaggle.com/c/bosch-production-line-performance/data) <br> <br>
__Background:__ Bosch is a home appliance and industrial tools manufacturing company. In 2017, Bosch supplied Kaggle.com with manufacturing data to promote a competition. The goal of the competition was to determine factors that influence whether or not the product passes the final response stage of manufacturing and to predict which products are likely to fail based on this manufacturing process.<br> <br>
__The Data:__ Early exploration of this data will use a subset of the big data provided by Bosch. The data is provided by [Hitesh, John, and Matthew via PDX Data Science Meetup](https://www.meetup.com/Portland-Data-Science-Group/events/257370691/). The data subset is divided into 2 groups of 3 files (3 training, 3 test). Each group has one csv file each for numerical features ('numeric'), dates ('date'), and the manufacturing path ('cat'). The data subset includes a larger percentage of products that failed the response test, but not much more is known about this subsampling method.<br><br>
__Assumptions:__ Each ID # represents a specific product unit, and there is only one product type. The differences in assembly are due to customization and/or differences between lines.<br><br>
__Goal:__ Predict which products will fail the response test. <br><br>

## III. Dimensionality Reduction

### A. Import Data

In [3]:
from sklearn.linear_model import LinearRegression, LogisticRegression

In [4]:
%%time
# import numerical data
mf_num_data = pd.read_csv('bosch_small_data/train_numeric.csv',low_memory=False)

CPU times: user 9.13 s, sys: 1.19 s, total: 10.3 s
Wall time: 10.4 s


### B. Declare Functions

In [86]:
def process_data(df):
    """Replace missing values with 0 and return the result with the column names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose NaN entries should be filled.

    Returns
    -------
    tuple
        ``(imputed, names)`` where ``imputed`` is the output of
        ``SimpleImputer.fit_transform`` (NaN replaced by the constant 0) and
        ``names`` is the list of ``df``'s column labels.
    """
    zero_filler = SimpleImputer(
        missing_values=np.nan,
        strategy='constant',
        fill_value=0,
    )
    imputed = zero_filler.fit_transform(df)
    column_labels = list(df.columns)
    return imputed, column_labels

def visualize_data(pipeline, dimred, feature_names=None):
    """Plot the ten largest first-component weights and the explained-variance curve.

    Parameters
    ----------
    pipeline : object exposing ``named_steps``
        Fitted pipeline; ``pipeline.named_steps[dimred]`` must provide
        ``components_`` and ``explained_variance_ratio_``.
    dimred : str
        Name of the pipeline step holding the reducer. Also used as the
        prefix of the saved image, ``<dimred>-top10.png``.
    feature_names : sequence, optional
        Labels paired positionally with ``components_[0]``. When omitted,
        the notebook-level ``features`` variable is used, which preserves
        the original calling convention.

    Side effects
    ------------
    Saves the bar chart to ``<dimred>-top10.png`` in the working directory
    and draws two matplotlib figures.
    """
    if feature_names is None:
        # Backward-compatible fallback: the original implementation always
        # read the notebook-level `features` list.
        feature_names = features

    reducer = pipeline.named_steps[dimred]

    # Column 0 holds the feature label, column 1 its weight in the first component.
    feature_plot = pd.DataFrame(data=list(zip(feature_names, reducer.components_[0])))
    # Keep the ten largest (signed) weights, biggest first.
    feature_plot = feature_plot.sort_values(1, ascending=False).iloc[0:10]

    plt.figure(figsize=(20,5))
    plt.title('Ordered by variance')
    sns.barplot(x=0, y=1, data=feature_plot, palette=sns.color_palette("cool"))
    # Zoom the y-axis to the range spanned by the plotted weights.
    plt.ylim(feature_plot[1].min(),feature_plot[1].max())
    plt.savefig(dimred+'-top10.png')
    plt.show()

    plt.figure(figsize=(20,8))
    plt.title('Component Variance')
    plt.plot(reducer.explained_variance_ratio_)

### C. Split data to x,y

In [99]:
from sklearn.decomposition import TruncatedSVD, PCA, SparsePCA, NMF
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import make_scorer

In [209]:
# Features: every column except the first and the last; NaNs are zero-filled by process_data.
X, features = process_data(
    mf_num_data.iloc[:, 1:-1]
)
# Target: the last column of the numeric file.
y = mf_num_data.iloc[:, -1]
# Hold out 30% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.3,
    random_state=42,
)

In [None]:
# Base classifier used as the final pipeline step. The settings mirror the
# estimator repr captured in the GridSearchCV output further down; max_depth
# and n_estimators are overridden by the parameter grid during the search.
rf = RandomForestClassifier(max_depth=5, n_estimators=200, random_state=42)

In [241]:
pipe = Pipeline([
    # the reduce_dim stage is populated by the param_grid
    ('reduce_dim', None),
    ('rf', rf)
])

# Candidates = 36 (4 component counts * 3 RF depths * 3 RF estimator counts), PCA only

N_FEATURES_OPTIONS = [50,100,500,700]
RF_DEPTH = [5, 10, 15]
RF_ITER = [10, 20, 30]

REDUCER_LABELS = ['PCA', 'TSVD', 'NMF','KBest']

param_grid = [
    {
        'reduce_dim': [PCA()],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'rf__max_depth': RF_DEPTH,
        'rf__n_estimators': RF_ITER
    },

]

# cv=3 is spelled out so the search keeps using the 3 folds reported in the
# recorded output even on library versions whose default fold count differs.
# Scoring uses the Matthews correlation coefficient.
grid = GridSearchCV(pipe, param_grid=param_grid, n_jobs=4, cv=3, scoring=make_scorer(matthews_corrcoef), verbose=5)

In [242]:
# Run the cross-validated grid search on the training split.
grid.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   40.5s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:  6.4min
[Parallel(n_jobs=4)]: Done 108 out of 108 | elapsed: 19.1min finished


CPU times: user 1min 3s, sys: 4.22 s, total: 1min 7s
Wall time: 19min 51s


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('reduce_dim', None), ('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=4,
       param_grid=[{'reduce_dim': [PCA(copy=True, iterated_power='auto', n_components=700, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)], 'reduce_dim__n_components': [50, 100, 500, 700], 'rf__max_depth': [5, 10, 15], 'rf__n_estimators': [10, 20, 30]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(matthews_corrcoef), verbose=5)

In [280]:
# Arrange the mean CV scores as (n_components, max_depth, n_estimators).
# The axis sizes come from the grid constants instead of hard-coded 4,3,3,
# so the reshape follows any change to the option lists.
mean_scores = np.array(grid.cv_results_['mean_test_score']).reshape(
    len(N_FEATURES_OPTIONS), len(RF_DEPTH), len(RF_ITER)
)
grid_results = pd.DataFrame(grid.cv_results_)
# Keep only the tuned parameters plus score, rank and fit time.
columns = ['param_reduce_dim__n_components','param_rf__max_depth','param_rf__n_estimators','mean_test_score','rank_test_score','mean_fit_time']
grid_results = grid_results[columns]

In [283]:
# Best-ranked parameter combinations first.
grid_results.sort_values(by='rank_test_score')

Unnamed: 0,param_reduce_dim__n_components,param_rf__max_depth,param_rf__n_estimators,mean_test_score,rank_test_score,mean_fit_time
34,700,15,20,0.180832,1,76.435648
26,500,15,30,0.180494,2,61.436293
24,500,15,10,0.17853,3,45.387945
35,700,15,30,0.177658,4,65.885553
25,500,15,20,0.173983,5,49.364575
33,700,15,10,0.170077,6,71.040724
30,700,10,10,0.164118,7,59.490359
21,500,10,10,0.162946,8,41.090583
23,500,10,30,0.161443,9,62.034734
32,700,10,30,0.159654,10,88.757953


In [271]:
# Read the column from the full cv_results_ table: the trimmed `grid_results`
# frame does not keep 'param_reduce_dim', so indexing it raises KeyError on a
# top-to-bottom run.
# NOTE(review): taking the third whitespace-separated token relies on the
# estimator repr listing every parameter - confirm against the installed version.
pd.DataFrame(grid.cv_results_)['param_reduce_dim'].apply(lambda val: str(val).split()[2])

0     n_components=700,
1     n_components=700,
2     n_components=700,
3     n_components=700,
4     n_components=700,
5     n_components=700,
6     n_components=700,
7     n_components=700,
8     n_components=700,
9     n_components=700,
10    n_components=700,
11    n_components=700,
12    n_components=700,
13    n_components=700,
14    n_components=700,
15    n_components=700,
16    n_components=700,
17    n_components=700,
18    n_components=700,
19    n_components=700,
20    n_components=700,
21    n_components=700,
22    n_components=700,
23    n_components=700,
24    n_components=700,
25    n_components=700,
26    n_components=700,
27    n_components=700,
28    n_components=700,
29    n_components=700,
30    n_components=700,
31    n_components=700,
32    n_components=700,
33    n_components=700,
34    n_components=700,
35    n_components=700,
Name: param_reduce_dim, dtype: object

In [269]:
# Use grid.cv_results_ directly: the trimmed `grid_results` frame does not
# keep the 'param_reduce_dim' column, so indexing it raises KeyError on a
# top-to-bottom run.
str(grid.cv_results_['param_reduce_dim'][0]).split()[2]

'n_components=700,'