----

In [116]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
from sklearn import preprocessing
from scipy import stats

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import matthews_corrcoef
from sklearn import tree

from sklearn.decomposition import TruncatedSVD, PCA, SparsePCA, NMF
from sklearn.linear_model import SGDClassifier
from sklearn.feature_selection import RFE
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVR
from sklearn.metrics import make_scorer

In [117]:
# settings
np.seterr(divide='warn', invalid='warn'); sns.set_style("whitegrid");warnings.filterwarnings('ignore')

-----

# Project: Bosch Manufacturing Line

__Author:__ [Ryan Harper](www.kimrharper.com) <br><br>
__Data Source:__ [Bosch Dataset via Kaggle](https://www.kaggle.com/c/bosch-production-line-performance/data) <br> <br>
__Background:__ Bosch is a home appliance and industrial tools manufacturing company. In 2017, Bosch supplied Kaggle.com with manufacturing data to promote a competition. The goal of the competition was to determine factors that influence whether or not the product passes the final response stage of manufacturing and to predict which products are likely to fail based on this manufacturing process.<br> <br>
__The Data:__ Early exploration of this data will use a subset of the big data provided by Bosch. The data is provided by [Hitesh, John, and Matthew via PDX Data Science Meetup](https://www.meetup.com/Portland-Data-Science-Group/events/257370691/). The data subset is divided into 2 groups of 3 files (3 training, 3 test). Each group has one csv file each for numerical features ('numeric'), dates ('date'), and the manufacturing path ('cat'). The data subset includes a larger percentage of products that failed the response test, but not much more is known about this subsampling method.<br><br>
__Assumptions:__ ID # represents a specific product and that there is only one product. The differences in assembly are due to customization and/or differences between lines.<br><br>
__Goal:__ Predict which products will fail the response test. <br><br>

## III. Dimensionality Reduction

### A. Import Data

In [118]:
%%time
# import numerical data
mf_num_data = pd.read_csv('bosch_small_data/train_numeric.csv',low_memory=False)

# import date data
mf_date_data = pd.read_csv('bosch_small_data/train_date.csv',low_memory=False)

Wall time: 21.1 s


### B. Functions and Declarations

In [119]:
# process X values with transformation and imputation if need
def process_data(df, transform):
    names = list(df.columns)
    if transform:
        pt = PowerTransformer()
        df = pt.fit_transform(df)
    impute_constant = SimpleImputer(missing_values=np.nan, strategy='mean')
    
    return impute_constant.fit_transform(df), names

# visualize data by components
def visualize_data(pipeline,dimred):
    feature_plot = list(zip(features, pipeline.named_steps[dimred].components_[0]))
    feature_plot = pd.DataFrame(data=feature_plot)
    feature_plot = pd.DataFrame(feature_plot.sort_values(1, ascending=False).iloc[0:10])
    plt.figure(figsize=(20,5))
    plt.title('Ordered by variance')
    sns.barplot(x=0, y=1, data=feature_plot, palette=sns.color_palette("cool"))
    plt.ylim(feature_plot[1].min(),feature_plot[1].max())
    plt.savefig(dimred+'-top10.png')
    plt.show()
     
    plt.figure(figsize=(20,8))
    plt.title('Component Variance')
    plt.plot(pipeline.named_steps[dimred].explained_variance_ratio_)

# Get the last recorded time
def final_time(df,row_ind):
    time = df.iloc[row_ind,1:].dropna().iloc[-2]
    response = df.iloc[row_ind,-1]
    return time,response

ms_mcc = make_scorer(matthews_corrcoef)
test_per = .3

In [120]:
%%time
last_time =[]
for i in range(len(mf_date_data.iloc[:,1:-1])):
    try:
        lt, sc = final_time(mf_date_data,i)
        last_time.append(lt)
    except:
        last_time.append(0)
last_time = np.array(last_time)
mn, mx = last_time.min(), last_time.max()
last_time = ((last_time - mn) / (mx - mn)) * 2 - 1
mf_num_data.insert(1, 'end_time', last_time)

Wall time: 55.3 s


### C. Dimensionality Reduction

#### 1. Decomposition

In [40]:
X, features = process_data(mf_num_data.iloc[:,1:-1], False)
y = mf_num_data.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=test_per, random_state=42)

In [57]:
N_COMPONENTS = [700, len(X[0,:])]
RF_DEPTH = [150]
RF_ESTIM =  [150]

pipe = Pipeline([('reduce_dim', None), ('model', None)])
             
params = [
    {
        'reduce_dim': [PCA()],
        'reduce_dim__n_components': N_COMPONENTS,
        'model': [RandomForestClassifier()],
        'model__max_depth': RF_DEPTH,
        'model__n_estimators': RF_ESTIM,
    },
    {
        'reduce_dim': [PCA()],
        'reduce_dim__n_components': N_COMPONENTS,
        'model': [LinearSVC(), SGDClassifier(), BernoulliNB()],
    }]

grid = GridSearchCV(pipe, param_grid=params, cv=3, n_jobs=4, scoring=ms_mcc, verbose=10)

In [None]:
grid.fit(X_train, y_train)

In [47]:
grid_results = pd.DataFrame(grid.cv_results_)
columns = ['param_model',
           'param_reduce_dim__n_components',
           'param_model__max_depth',
           'param_model__n_estimators',
           'mean_test_score',
           'rank_test_score',
           'mean_fit_time']
grid_results = grid_results[columns]
grid_results['param_model']=grid_results['param_model'].apply(lambda val: str(val).split('(')[0])
grid_results = grid_results.sort_values('rank_test_score')
grid_results.to_html('pca_models_evaluation_maxed_transform_model_selection.html')

In [48]:
grid_results

Unnamed: 0,param_model,param_reduce_dim__n_components,param_model__max_depth,param_model__n_estimators,mean_test_score,rank_test_score,mean_fit_time
0,RandomForestClassifier,700,150.0,150.0,0.2841,1,548.349306
1,RandomForestClassifier,968,150.0,150.0,0.279433,2,369.165934
2,LinearSVC,700,,,0.226055,3,255.989763
3,LinearSVC,968,,,0.226055,3,66.087134
5,SGDClassifier,968,,,0.178393,5,26.150956
4,SGDClassifier,700,,,0.175834,6,37.088741
6,BernoulliNB,700,,,0.102116,7,43.646047
7,BernoulliNB,968,,,0.089966,8,22.052156


#### 2. Feature Selection

In [98]:
X, features = process_data(mf_num_data.iloc[:,1:-1], False)
y = mf_num_data.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=test_per, random_state=42)

In [99]:
%%time
estimator = RandomForestClassifier()
selector = RFE(estimator, step=.05)
selector = selector.fit(X_train, y_train)
RFE_selection = pd.DataFrame(list(zip(mf_num_data.columns[1:-1],selector.ranking_)))

Wall time: 1min 17s


In [112]:
top_features = RFE_selection[0][RFE_selection[1] <= 1].values
X, features = process_data(mf_num_data[top_features], True)
y = mf_num_data.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=test_per, random_state=42)

In [121]:
rf = RandomForestClassifier(n_estimators=200, class_weight='balanced')
rf.fit(X_train, y_train)
y_predict = rf.predict(X_test)
print(matthews_corrcoef(y_test, y_predict))

0.4078209370113893


In [124]:
rf = RandomForestClassifier(n_estimators=100, criterion='entropy', n_jobs=4,class_weight='balanced', verbose=1)
rf.fit(X_train, y_train)
y_predict = rf.predict(X_test)
print(matthews_corrcoef(y_test, y_predict))

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.6s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    8.4s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s


0.41037390395473183


[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished
