----

In [8]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
from sklearn import preprocessing
from scipy import stats

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import matthews_corrcoef
from sklearn import tree

from sklearn.decomposition import TruncatedSVD, PCA, SparsePCA, NMF
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import make_scorer

In [2]:
# settings
np.seterr(divide='warn', invalid='warn'); sns.set_style("whitegrid");warnings.filterwarnings('ignore')

-----

# Project: Bosch Manufacturing Line

__Author:__ [Ryan Harper](www.kimrharper.com) <br><br>
__Data Source:__ [Bosch Dataset via Kaggle](https://www.kaggle.com/c/bosch-production-line-performance/data) <br> <br>
__Background:__ Bosch is a home appliance and industrial tools manufacturing company. In 2017, Bosch supplied Kaggle.com with manufacturing data to promote a competition. The goal of the competition was to determine factors that influence whether or not the product passes the final response stage of manufacturing and to predict which products are likely to fail based on this manufacturing process.<br> <br>
__The Data:__ Early exploration of this data will use a subset of the big data provided by Bosch. The data is provided by [Hitesh, John, and Matthew via PDX Data Science Meetup](https://www.meetup.com/Portland-Data-Science-Group/events/257370691/). The data subset is divided into 2 groups of 3 files (3 training, 3 test). Each group has one csv file each for numerical features ('numeric'), dates ('date'), and the manufacturing path ('cat'). The data subset includes a larger percentage of products that failed the response test, but not much more is known about this subsampling method.<br><br>
__Assumptions:__ ID # represents a specific product and that there is only one product. The differences in assembly are due to customization and/or differences between lines.<br><br>
__Goal:__ Predict which products will fail the response test. <br><br>

## III. Dimensionality Reduction

### A. Import Data

In [3]:
%%time
# import numerical data
mf_num_data = pd.read_csv('bosch_small_data/train_numeric.csv',low_memory=False)

Wall time: 9.73 s


### B. Declare Functions

In [9]:
def process_data(df):
    names = list(df.columns)
    pt = PowerTransformer()
    df = pt.fit_transform(df)
    impute_constant = SimpleImputer(missing_values=np.nan, strategy='constant',fill_value=0)
    
    return impute_constant.fit_transform(df), names

def visualize_data(pipeline,dimred):
    feature_plot = list(zip(features, pipeline.named_steps[dimred].components_[0]))
    feature_plot = pd.DataFrame(data=feature_plot)
    feature_plot = pd.DataFrame(feature_plot.sort_values(1, ascending=False).iloc[0:10])
    plt.figure(figsize=(20,5))
    plt.title('Ordered by variance')
    sns.barplot(x=0, y=1, data=feature_plot, palette=sns.color_palette("cool"))
    plt.ylim(feature_plot[1].min(),feature_plot[1].max())
    plt.savefig(dimred+'-top10.png')
    plt.show()
     
    plt.figure(figsize=(20,8))
    plt.title('Component Variance')
    plt.plot(pipeline.named_steps[dimred].explained_variance_ratio_)

### C. Split data to x,y

In [10]:
X, features = process_data(mf_num_data.iloc[:,1:-1])
y = mf_num_data.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0, random_state=42)

In [11]:
logr = LogisticRegression(random_state=42, solver='liblinear', multi_class='ovr')
rf = RandomForestClassifier(random_state=42, n_estimators=200, max_depth=10)

In [18]:
pipe = Pipeline([
    # the reduce_dim stage is populated by the param_grid
    ('reduce_dim', None),
    ('model', None)
])

# Candidates = 40 (5 Components * 2 RF Depth * 4 Feature Selection Methods)

N_FEATURES_OPTIONS = [500, 700, len(X[0,:])]
RF_DEPTH = [10, 50, 100]
RF_ESTIM =  [10, 50, 100]
LOGR_SOLVER = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

REDUCER_LABELS = ['PCA', 'TSVD', 'NMF','KBest']

param_grid = [
    {
        'reduce_dim': [PCA()],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'model': [RandomForestClassifier()],
        'model__max_depth': RF_DEPTH,
        'model__n_estimators': RF_ESTIM,
    },
    {
        'reduce_dim': [PCA()],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'model': [LogisticRegression(solver='liblinear', multi_class='ovr')],
        'model__solver': LOGR_SOLVER
    }

]

grid = GridSearchCV(pipe, 
                    param_grid=param_grid, 
                    cv=5, 
                    n_jobs=4, 
                    scoring=make_scorer(matthews_corrcoef), 
                    verbose=10)

In [19]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 42 candidates, totalling 210 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:  1.8min
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  2.8min
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:  4.9min
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  8.0min
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed: 11.9min
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed: 17.7min
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed: 22.6min
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed: 29.0min
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed: 42.9min
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed: 65.5min
[Parallel(n_jobs=4)]: Done 105 tasks      | elapsed: 70.7min
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed: 86.4min
[Parallel(n_jobs=4)]: Done 137 tasks      | elapsed: 115.5min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed: 126.8min
[Parallel(n_jobs=4)]: Done 173 tasks      | elapsed: 135.3min
[Parall

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None, steps=[('reduce_dim', None), ('model', None)]),
       fit_params=None, iid='warn', n_jobs=4,
       param_grid=[{'reduce_dim': [PCA(copy=True, iterated_power='auto', n_components=700, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)], 'reduce_dim__n_components': [500, 700, 968], 'model': [RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_dept...verbose=0, warm_start=False)], 'model__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(matthews_corrcoef), verbose=10)

In [20]:
grid_results = pd.DataFrame(grid.cv_results_)
columns = ['param_model',
           'param_reduce_dim__n_components',
           'param_model__max_depth',
           'param_model__n_estimators',
           'param_model__solver',
           'mean_test_score',
           'rank_test_score',
           'mean_fit_time']
grid_results = grid_results[columns]
grid_results['param_model']=grid_results['param_model'].apply(lambda val: str(val).split('(')[0])
grid_results = grid_results.sort_values('rank_test_score')
grid_results.to_html('pca_models_evaluation_with_transform.html')

In [21]:
grid_results

Unnamed: 0,param_model,param_reduce_dim__n_components,param_model__max_depth,param_model__n_estimators,param_model__solver,mean_test_score,rank_test_score,mean_fit_time
22,RandomForestClassifier,700,100.0,50.0,,0.282738,1,262.765939
25,RandomForestClassifier,700,100.0,100.0,,0.281311,2,457.207442
26,RandomForestClassifier,968,100.0,100.0,,0.279937,3,464.82819
24,RandomForestClassifier,500,100.0,100.0,,0.27816,4,372.544712
23,RandomForestClassifier,968,100.0,50.0,,0.27733,5,244.219769
21,RandomForestClassifier,500,100.0,50.0,,0.276821,6,184.26377
14,RandomForestClassifier,968,50.0,50.0,,0.268774,7,216.491611
16,RandomForestClassifier,700,50.0,100.0,,0.266387,8,435.313497
17,RandomForestClassifier,968,50.0,100.0,,0.266177,9,413.59191
13,RandomForestClassifier,700,50.0,50.0,,0.265098,10,244.62159
