In [1]:
from utils.mf_dimensionality_reduction import *

# __<font color='blue'>Bosch Manufacturing</font>__

# __<font color='darkblue'> Part 4: Dimensionality Reduction</font>__

__Author:__ [Ryan Harper](www.kimrharper.com) <br><br>
__Data Source:__ [Bosch Dataset via Kaggle](https://www.kaggle.com/c/bosch-production-line-performance/data) <br> <br>
__Background:__ Bosch is a home appliance and industrial tools manufacturing company. In 2017, Bosch supplied Kaggle.com with manufacturing data to promote a competition. The goal of the competition was to determine factors that influence whether or not the product passes the final response stage of manufacturing and to predict which products are likely to fail based on this manufacturing process.<br> <br>
__The Data:__ Early exploration of this data will use a subset of the big data provided by Bosch. The data is provided by [Hitesh, John, and Matthew via PDX Data Science Meetup](https://www.meetup.com/Portland-Data-Science-Group/events/257370691/). The data subset is divided into 2 groups of 3 files (3 training, 3 test). Each group has one csv file each for numerical features ('numeric'), dates ('date'), and the manufacturing path ('cat'). The data subset includes a larger percentage of products that failed the response test, but not much more is known about this subsampling method.<br><br>
__Assumptions:__ ID # represents a specific product and that there is only one product. The differences in assembly are due to customization and/or differences between lines.<br><br>
__Goal:__ Predict which products will fail the response test. <br><br>

----

# 1. Dimensionality Reduction

### A. Import Data

In [2]:
merged_df = mf_num_data.append(mf_date_data)

In [3]:
%store -r skewed_features

### B. Functions and Declarations

In [5]:
mf_num_data.head()

Unnamed: 0,Id,L0_S0_F0,L0_S0_F2,L0_S0_F4,L0_S0_F6,L0_S0_F8,L0_S0_F10,L0_S0_F12,L0_S0_F14,L0_S0_F16,...,L3_S50_F4245,L3_S50_F4247,L3_S50_F4249,L3_S50_F4251,L3_S50_F4253,L3_S51_F4256,L3_S51_F4258,L3_S51_F4260,L3_S51_F4262,Response
0,23,,,,,,,,,,...,,,,,,,,,,0.0
1,71,-0.167,-0.168,0.276,0.33,0.074,0.161,0.052,0.248,0.163,...,,,,,,,,,,0.0
2,76,,,,,,,,,,...,,,,,,,,,,0.0
3,86,-0.003,0.041,-0.033,-0.016,0.074,0.161,0.0,-0.072,0.025,...,,,,,,,,,,0.0
4,97,,,,,,,,,,...,,,,,,,,,,0.0


### C. Dimensionality Reduction

#### __PCA__

In [139]:
column_names = ['param_model','param_reduce_dim__n_components','param_model__max_depth','param_model__n_estimators','mean_test_score','rank_test_score','mean_fit_time']
display_results(grid, columns, 'visuals/pca_models_fixed_imbalance.html')

Unnamed: 0,param_model,param_reduce_dim__n_components,param_model__max_depth,param_model__n_estimators,mean_test_score,rank_test_score,mean_fit_time
11,RandomForestClassifier,,,,0.25519,1,54.974408
1,RandomForestClassifier,200.0,150.0,150.0,0.241637,2,187.838954
0,RandomForestClassifier,100.0,150.0,150.0,0.22365,3,148.713018
8,LinearSVC,,,,0.20932,4,21.45937
3,LinearSVC,200.0,,,0.18307,5,32.930122
9,SGDClassifier,,,,0.159808,6,0.655708
5,SGDClassifier,200.0,,,0.136866,7,13.815488
2,LinearSVC,100.0,,,0.131329,8,24.683609
6,GaussianNB,100.0,,,0.05638,9,7.358641
7,GaussianNB,200.0,,,0.052989,10,6.562519


# 2. Feature Selection

#### __RFE__

__Recreate Train/Test Data:__

In [127]:
X_train, X_test, y_train, y_test = process_data(mf_num_data.iloc[:,sig_diff_list], False)

In [99]:
%%time
estimator = RandomForestClassifier()
selector = RFE(estimator, step=.05)
selector = selector.fit(X_train, y_train)
RFE_selection = pd.DataFrame(list(zip(mf_num_data.columns[1:-1],selector.ranking_)))

Wall time: 1min 17s


In [112]:
top_features = RFE_selection[0][RFE_selection[1] <= 1].values
X, features = process_data(mf_num_data[top_features], True)
y = mf_num_data.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=test_per, random_state=42)

__RF Run 1:__

In [121]:
rf = RandomForestClassifier(n_estimators=200, class_weight='balanced')
rf.fit(X_train, y_train)
y_predict = rf.predict(X_test)
print(matthews_corrcoef(y_test, y_predict))

0.4078209370113893


__RF Run 2:__

In [124]:
rf = RandomForestClassifier(n_estimators=100, criterion='entropy', n_jobs=4,class_weight='balanced', verbose=1)
rf.fit(X_train, y_train)
y_predict = rf.predict(X_test)
print(matthews_corrcoef(y_test, y_predict))

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.6s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    8.4s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s


0.41037390395473183


[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished


__RF Run 3:__

In [124]:
rf = RandomForestClassifier(n_estimators=100, criterion='entropy', n_jobs=4,class_weight='balanced', verbose=1)
rf.fit(X_train, y_train)
y_predict = rf.predict(X_test)
print(matthews_corrcoef(y_test, y_predict))

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.6s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    8.4s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s


0.41037390395473183


[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished


# 3. Feature Addition

In [6]:
merged_df = mf_num_data.append(mf_date_data)
# merged_df.isna().any()[lambda x: x]
merged_df = merged_df.fillna(merged_df.mean())
drop_nans = list(merged_df.isna().any()[lambda x: x].index)
merged_df.drop(drop_nans, axis=1, inplace=True)

In [None]:
X_train, X_test, y_train, y_test = process_data(merged_df, True)

In [149]:
estimator = RandomForestClassifier(criterion='entropy', n_jobs=4,class_weight='balanced')
selector = RFE(estimator, n_features_to_select=100, step=20)
selector = selector.fit(X_train, y_train)
RFE_selection = pd.DataFrame(list(zip(mf_num_data.columns[1:-1],selector.ranking_)))

In [182]:
get_probs = []
for i in range(1,20):
    top_features = RFE_selection[0][RFE_selection[1] == i].values
    top_features2 = list(top_features)
    top_features2.append('Response')
    X_train, X_test, y_train, y_test = process_data(merged_df[top_features2], True)
    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)
    y_predict = rf.predict(X_test)
    print(matthews_corrcoef(y_test, y_predict))
    get_probs.append(rf.predict_proba(X_test))

0.10043317273403868
0.0056352459675123225
0.04274652775491485
0.046288084158344525
0.05516374148211885
0.059436897085631143
0.028117240709158435
0.0838616050763581
0.027989863995858334
0.055090995256066796
0.06186097549568627
0.03586270849175873
0.047463389098834075
0.03356427920727333
0.0305565362706007
0.025231454159691104
0.045266019416444085
0.05235087446168564
0.05517925734363526


In [213]:
newDF = pd.DataFrame()
for r in range(len(get_probs)):
    r_prob = get_probs[r]
    newDF[r] = r_prob[:,0]
    success_prob = newDF.mean(axis=1)
    
    r_prob = get_probs[r]
    newDF[r] = r_prob[:,1]
    fail_prob = newDF.mean(axis=1)

make_predictions = pd.DataFrame([success_prob, fail_prob]).T

In [233]:
def compare(a,b):
    return 0 if a > b else 1

In [235]:
results = [compare(make_predictions.iloc[i,:][0],make_predictions.iloc[i,:][1]) for i in range(len(make_predictions))]

In [236]:
print(matthews_corrcoef(y_test, results))

0.05212786485008331


------

__Best Prediction:__ <br>
0.41037390395473183<br>RandomForestClassifier(n_estimators=100, criterion='entropy', n_jobs=4,class_weight='balanced', verbose=1) <br>RFE(estimator, step=.05)