----

In [1]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
from sklearn import preprocessing
from scipy import stats

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import matthews_corrcoef
from sklearn import tree

from sklearn.decomposition import TruncatedSVD, PCA, SparsePCA, NMF
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import make_scorer

In [2]:
# settings
np.seterr(divide='warn', invalid='warn'); sns.set_style("whitegrid");warnings.filterwarnings('ignore')

-----

# Project: Bosch Manufacturing Line

__Author:__ [Ryan Harper](www.kimrharper.com) <br><br>
__Data Source:__ [Bosch Dataset via Kaggle](https://www.kaggle.com/c/bosch-production-line-performance/data) <br> <br>
__Background:__ Bosch is a home appliance and industrial tools manufacturing company. In 2017, Bosch supplied Kaggle.com with manufacturing data to promote a competition. The goal of the competition was to determine factors that influence whether or not the product passes the final response stage of manufacturing and to predict which products are likely to fail based on this manufacturing process.<br> <br>
__The Data:__ Early exploration of this data will use a subset of the big data provided by Bosch. The data is provided by [Hitesh, John, and Matthew via PDX Data Science Meetup](https://www.meetup.com/Portland-Data-Science-Group/events/257370691/). The data subset is divided into 2 groups of 3 files (3 training, 3 test). Each group has one csv file each for numerical features ('numeric'), dates ('date'), and the manufacturing path ('cat'). The data subset includes a larger percentage of products that failed the response test, but not much more is known about this subsampling method.<br><br>
__Assumptions:__ ID # represents a specific product and that there is only one product. The differences in assembly are due to customization and/or differences between lines.<br><br>
__Goal:__ Predict which products will fail the response test. <br><br>

## III. Dimensionality Reduction

### A. Import Data

In [3]:
%%time
# import numerical data
mf_num_data = pd.read_csv('bosch_small_data/train_numeric.csv',low_memory=False)

CPU times: user 9.59 s, sys: 1.21 s, total: 10.8 s
Wall time: 10.9 s


### B. Declare Functions

In [4]:
def process_data(df):
    names = list(df.columns)
    impute_constant = SimpleImputer(missing_values=np.nan, strategy='constant',fill_value=0)
    return impute_constant.fit_transform(df), names

def visualize_data(pipeline,dimred):
    feature_plot = list(zip(features, pipeline.named_steps[dimred].components_[0]))
    feature_plot = pd.DataFrame(data=feature_plot)
    feature_plot = pd.DataFrame(feature_plot.sort_values(1, ascending=False).iloc[0:10])
    plt.figure(figsize=(20,5))
    plt.title('Ordered by variance')
    sns.barplot(x=0, y=1, data=feature_plot, palette=sns.color_palette("cool"))
    plt.ylim(feature_plot[1].min(),feature_plot[1].max())
    plt.savefig(dimred+'-top10.png')
    plt.show()
     
    plt.figure(figsize=(20,8))
    plt.title('Component Variance')
    plt.plot(pipeline.named_steps[dimred].explained_variance_ratio_)

### C. Split data to x,y

In [28]:
X, features = process_data(mf_num_data.iloc[:,1:-1])
y = mf_num_data.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0, random_state=42)

In [26]:
logr = LogisticRegression(random_state=42, solver='liblinear', multi_class='ovr')
rf = RandomForestClassifier(random_state=42, n_estimators=200, max_depth=10)

In [38]:
pipe = Pipeline([
    # the reduce_dim stage is populated by the param_grid
    ('reduce_dim', None),
    ('model', None)
])

# Candidates = 40 (5 Components * 2 RF Depth * 4 Feature Selection Methods)

N_FEATURES_OPTIONS = [500, 700, len(X[0,:])]
RF_DEPTH = [10, 50, 100]
RF_ESTIM =  [10, 50, 100]
LOGR_SOLVER = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

REDUCER_LABELS = ['PCA', 'TSVD', 'NMF','KBest']

param_grid = [
    {
        'reduce_dim': [PCA()],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'model': [RandomForestClassifier()],
        'model__max_depth': RF_DEPTH,
        'model__n_estimators': RF_ESTIM,
    },
    {
        'reduce_dim': [PCA()],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'model': [LogisticRegression(solver='liblinear', multi_class='ovr')],
        'model__solver': LOGR_SOLVER
    }

]

grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=4, scoring=make_scorer(matthews_corrcoef), verbose=10)

In [None]:
grid.fit(X_train, y_train)

Fitting 3 folds for each of 42 candidates, totalling 126 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:  2.7min
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  5.3min
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:  8.6min
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed: 14.7min


In [36]:
# mean_scores = np.array(grid.cv_results_['mean_test_score']).reshape(4,3,3)
grid_results = pd.DataFrame(grid.cv_results_)

In [37]:
grid_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model,param_model__max_depth,param_model__n_estimators,param_reduce_dim,param_reduce_dim__n_components,params,...,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,54.470687,0.072315,2.561092,0.087748,"RandomForestClassifier(bootstrap=True, class_w...",10.0,10.0,"PCA(copy=True, iterated_power='auto', n_compon...",968,{'model': RandomForestClassifier(bootstrap=Tru...,...,0.172146,0.159997,0.173855,0.012073,2,0.230619,0.276971,0.253364,0.253651,0.018924
1,50.734237,19.919648,1.520189,0.12229,"LogisticRegression(C=1.0, class_weight=None, d...",,,"PCA(copy=True, iterated_power='auto', n_compon...",968,"{'model': LogisticRegression(C=1.0, class_weig...",...,0.191504,0.192631,0.201561,0.013434,1,0.198606,0.219803,0.21502,0.211143,0.009078


In [None]:
print('hi')