# This project examines patterns in manufacturing defects

Data is provided by Bosch

## Import and examine the data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time

tic = time.process_time()

training set has 1183747 records

test set has 1023747 records

In [2]:
folder = 'Bosch-Production-Line-Performance/'
batch_size = 20000
chunksize = 20000
train_num_records = 1183747

skiprows = list(range(1,train_num_records))
np.random.shuffle(skiprows)
skiprows = np.array(skiprows)[batch_size:]

# Let's make this dataset somewhat balanced:

### First, let's get the FAIL rows:

INITIALLY: Because there's so few, I'll have to scour the entire dataset and get all of them for there to be enough of them

THEN:  I saved the fails as a new CSV so I can load them more quickly in the future, rather than having to scour entire datasets again

In [3]:
train_numeric_fail = pd.read_csv(folder+'train_numeric_fail.csv', low_memory=False)
train_categorical_fail = pd.read_csv(folder+'train_categorical_fail.csv', low_memory=False)
train_date_fail = pd.read_csv(folder+'train_date_fail.csv', low_memory=False)

print(train_numeric_fail.shape)
print(train_categorical_fail.shape)
print(train_date_fail.shape)

(6879, 970)
(6879, 2141)
(6879, 1157)


### Second, let's get the PASS rows:

This is much easier, I'll just grab some random rows and remove the fails

In [4]:
%%time

train_numeric_pass = pd.read_csv(folder+'train_numeric.csv', skiprows=skiprows)
train_categorical_pass = pd.read_csv(folder+'train_categorical.csv', skiprows=skiprows, low_memory=False)
train_date_pass = pd.read_csv(folder+'train_date.csv', skiprows=skiprows)


train_numeric_pass.drop(train_numeric_pass[train_numeric_pass['Response']==1].index, inplace=True)
train_categorical_pass = train_categorical_pass[train_categorical_pass['Id'].isin(train_numeric_pass['Id'])]
train_date_pass = train_date_pass[train_date_pass['Id'].isin(train_numeric_pass['Id'])]


print(train_numeric_pass.shape)
print(train_categorical_pass.shape)
print(train_date_pass.shape)

(19883, 970)
(19883, 2141)
(19883, 1157)
CPU times: user 42 s, sys: 4.1 s, total: 46.1 s
Wall time: 47.5 s


### Third, let's merge the PASS and FAIL into a simple SAMPLE dataframe

Recall that we did Pass and Fail separately because Fails were so hard to find

In [5]:
train_categorical_sample = pd.concat([train_categorical_pass, train_categorical_fail])
train_numeric_sample = pd.concat([train_numeric_pass, train_numeric_fail])
train_date_sample = pd.concat([train_date_pass, train_date_fail])

print(train_categorical_sample.shape)
print(train_numeric_sample.shape)
print(train_date_sample.shape)

(26762, 2141)
(26762, 970)
(26762, 1157)


### Fourth, let's create the FEATURE dataframe by merging NUMERIC and CATEGORICAL

Could be useful to simplify

In [6]:
# merge categorical and numeric together for total train_features_sample DataFrame
train_feature_sample = pd.merge(train_categorical_sample, train_numeric_sample,on='Id')

print(train_feature_sample.shape)

(26762, 3110)


### And finally, let's delete the PASS and FAIL dataframes, since they're no more use to us

Only useful for balancing the dataset, which we now have with the SAMPLE dataframes

In [7]:
del train_categorical_fail, train_numeric_fail, train_date_fail
del train_categorical_pass, train_numeric_pass, train_date_pass

#del chunk

## Let's now start to get combinations of stations

Another notebook gets just the adjacent stations/features/dates.   This will get ALL OUT combinations.  So it'll be massive.

Thus, I'll start with stations first, because there are only 51 stations

### First, create some functions to rename the columns

In [8]:
def extractLSF(string, isDate=False):
    '''
    Given a string in the Bosch Line/Station/Feature format (eg: L0_S0_F26), extract the Line number, Station number, and Feature number
    
    Input: String in Lx_Sy_Fz format
    
    Output:  Tuple of (x, y, z)
    
    Sidenote:  isDate will extract based on Lx_Sy_Dd format instead, returns tuple of (x, y, d)
    '''
    # first assert the format is correct:
    splitStr = string.split('_')
    if isDate:
        prefixList = ['L', 'S', 'D']
    else:
        prefixList = ['L', 'S', 'F']
        
    assert len(splitStr) == 3, 'Not 3 substrings split by "_"s!'
    for i in range(0, len(splitStr)):
        assert splitStr[i][1:].isnumeric(), 'Not numeric following the single-letter prefix!'
        assert splitStr[i][:1] == prefixList[i], 'Not the proper single-letter prefixes!  Did you use a Date format and forget to set isDate=True?'
    
    #now extract numbers:
    tempList = []
    
    for substr in splitStr:
        number = float(substr[1:])
        if number%1 == 0.0:
            tempList.append(int(number))
        else:
            tempList.append(number)
        
    return tuple(tempList)

# ----------------------------------------------------------------------------------------------------

def getListLSF(inputSeries, isDate=False):
    '''
    Given a series with fields of Bosch style Lx_Sy_Fz: Get the lines, stations, and features (or dates)
    Input:  series (row) in a Bosch-style QC matrix
    Output:  pandas dataframe of lines, stations, and features/dates
    '''
    
    for field in inputSeries.index:
        try:
            myL, myS, myF = extractLSF(field, isDate=isDate)
        except:
            continue
        listL.append(myL)
        listS.append(myS)
        listF.append(myF)
    
    return {'Lines': listL, 'Stations': listS, 'Features': listF}

# ----------------------------------------------------------------------------------------------------

def getListLSD(inputSeries):
    return getListLSF(inputSeries, isDate=True)


# All L/S/Fs------------------------------------------------------------------------------------------

def get_lsf(lsf_df, isDate=False):
    '''From a Bosch DF, extract all the Line/Station/Feature combinations in the fields
    Outputs a Dataframe of columns Line, Station, Feature'''
    if isDate:
        lsf = pd.DataFrame(columns=['Line', 'Station', 'Date'])
    else:
        lsf = pd.DataFrame(columns=['Line', 'Station', 'Feature'])
    
    for field in lsf_df.columns:
        try:
            lsf = lsf.append(pd.DataFrame([list(extractLSF(field, isDate))], columns=lsf.columns))
        except:
            continue
    
    return lsf

# ----------------------------------------------------

def dataframeSpecificColumns(data, columnname, isDate=False):
    '''Returns a dataframe of Bosch format, but reformat the columns to show ONLY feature, or ONLY station, etc
    Input: data (dataframe), columnname
    Output: dataframe with extracted feature/station/whatever'''
        
    cols = ['Id']
    cols.extend(list(get_lsf(data, isDate)[columnname]))

    if 'Response' in data.columns:
        cols.append('Response')
        
    df = data.copy()
    df.columns = cols
    
    return df

### Second, let's create the dataframe with the renamed columns

With Stations as columns... although I can change it

In [9]:
column = 'Station'

train = dataframeSpecificColumns(train_feature_sample, column)
train_numeric = dataframeSpecificColumns(train_numeric_sample, column)


#del train_date_sample
#del train_feature_sample
#del train_numeric_sample
#del train_categorical_sample

### Third, in the interest of reducing the combinations even further, let's remove the columns with low correlations with Response

I'm checking correlations (vs. Response) in 2 ways:

1) "value" actual values (for categorical, assigning values to numbers)

2) "boolean" NaN vs non-NaN

3) "norm" normalized values (for numeric)

#### First, get correlations with numeric columns

In [10]:
# 1) create "value" for numeric... scales for the numbers are based on only the columns themselves

train_numeric_value = train_numeric_sample.copy()
train_numeric_value.fillna(value=0, inplace=True)

In [11]:
# 2) creating "boolean" for numeric

train_numeric_boolean = train_numeric_sample.copy()
train_numeric_boolean[train_numeric_boolean.drop(['Id','Response'], axis=1).notnull()]=1
train_numeric_boolean.fillna(value=0, inplace=True)

In [12]:
# 3) create "norm" for numeric... so scales for the numbers are based on all columns

train_numeric_norm = train_numeric_value.copy()

from sklearn.preprocessing import normalize

train_numeric_norm = normalize(train_numeric_value)
train_numeric_norm = pd.DataFrame(train_numeric_norm, columns=train_numeric_value.columns, index=train_numeric_value.index)
train_numeric_norm['Id'] = train_numeric_value['Id']
train_numeric_norm['Response'] = train_numeric_value['Response']

In [13]:
numericValueCorr = train_numeric_value.corrwith(train_numeric_value['Response'], axis=0)
numericValueCorr = numericValueCorr[(numericValueCorr>0.1) | (numericValueCorr<-0.1)]

numericBooleanCorr = train_numeric_boolean.corrwith(train_numeric_boolean['Response'], axis=0)
numericBooleanCorr = numericBooleanCorr[(numericBooleanCorr>0.1) | (numericBooleanCorr<-0.1)]

numericNormCorr = train_numeric_norm.corrwith(train_numeric_norm['Response'], axis=0)
numericNormCorr = numericNormCorr[(numericNormCorr>0.1) | (numericNormCorr<-0.1)]

print(numericValueCorr)
print(numericBooleanCorr)
del numericNormCorr #don't print numericNormCorr, there's no good correlations at all...

L1_S24_F1604   -0.101818
L1_S24_F1632   -0.101441
L1_S24_F1695   -0.114737
L1_S24_F1723   -0.130767
L1_S24_F1846   -0.115725
L3_S29_F3351   -0.124732
L3_S29_F3458   -0.124732
L3_S29_F3464    0.107424
L3_S29_F3470    0.107424
Response        1.000000
dtype: float64
L3_S32_F3850    0.271615
L3_S33_F3855   -0.211041
L3_S33_F3857   -0.211041
L3_S33_F3859   -0.211041
L3_S33_F3861   -0.211041
L3_S33_F3863   -0.211041
L3_S33_F3865   -0.211041
L3_S33_F3867   -0.211041
L3_S33_F3869   -0.211041
L3_S33_F3871   -0.211041
L3_S33_F3873   -0.211041
L3_S34_F3876   -0.176664
L3_S34_F3878   -0.176664
L3_S34_F3880   -0.176664
L3_S34_F3882   -0.176664
Response        1.000000
dtype: float64


#### Second, get correlations with categorical columns

I'll have to factorize these values

In [14]:
%%time
# 1) create "value" for categorical... I'll have to factorize these numbers (column-encoded or whole-dataframe-encoded)

train_categorical_factorize = train_categorical_sample.drop('Id',axis=1).apply(pd.factorize, axis=1)

train_categorical_value = pd.DataFrame([list(item[0]) for item in train_categorical_factorize])
train_categorical_value.insert(loc=0, value=np.array(train_categorical_sample['Id']), column='Id')
train_categorical_value.columns = train_categorical_sample.columns
train_categorical_value['Response'] = np.array(train_numeric_value['Response'])

train_categorical_value.shape

CPU times: user 3min 8s, sys: 4.6 s, total: 3min 12s
Wall time: 3min 34s


In [15]:
# 2) creating "boolean" for categorical

train_categorical_boolean = train_categorical_value.copy()
train_categorical_boolean[train_categorical_value.drop('Id', axis=1)>-1]=1
train_categorical_boolean[train_categorical_value.drop('Id', axis=1)==-1]=0
train_categorical_boolean['Response'] = np.array(train_numeric_value['Response'])

In [16]:
categoricalValueCorr = train_categorical_value.corrwith(train_categorical_value['Response'], axis=0)
categoricalValueCorr = categoricalValueCorr[(categoricalValueCorr>0.1) | (categoricalValueCorr<-0.1)]

categoricalBooleanCorr = train_categorical_boolean.corrwith(train_categorical_boolean['Response'], axis=0)
categoricalBooleanCorr = categoricalBooleanCorr[(categoricalBooleanCorr>0.1) | (categoricalBooleanCorr<-0.1)]

print(categoricalValueCorr)
print(categoricalBooleanCorr)

L3_S32_F3851    0.268391
L3_S32_F3854    0.265568
Response        1.000000
dtype: float64
L3_S32_F3851    0.276364
L3_S32_F3854    0.276364
Response        1.000000
dtype: float64


#### Third, get correlations with date columns

In [17]:
# 1) create "value" for date... scales for the numbers are based on only the timestamps so it SHOULD be consistent throughout whole dataframe

train_date_value = train_date_sample.copy()
train_date_value.fillna(value=0, inplace=True)
train_date_value['Response'] = train_numeric_sample['Response']

In [18]:
# 2) creating "boolean" for date

train_date_boolean = train_date_sample.copy()
train_date_boolean[train_date_boolean.drop('Id', axis=1).notnull()]=1
train_date_boolean.fillna(value=0, inplace=True)
train_date_boolean['Response'] = train_numeric_sample['Response']

In [19]:
dateValueCorr = train_date_value.corrwith(train_date_value['Response'], axis=0)
dateValueCorr = dateValueCorr[(dateValueCorr>0.1) | (dateValueCorr<-0.1)]

dateBooleanCorr = train_date_boolean.corrwith(train_date_boolean['Response'], axis=0)
dateBooleanCorr = dateBooleanCorr[(dateBooleanCorr>0.1) | (dateBooleanCorr<-0.1)]

print(dateValueCorr)
print(dateBooleanCorr)

L3_S32_D3852    0.225223
L3_S33_D3856   -0.162181
L3_S33_D3858   -0.162181
L3_S33_D3860   -0.162181
L3_S33_D3862   -0.162181
L3_S33_D3864   -0.162181
L3_S33_D3866   -0.162181
L3_S33_D3868   -0.162181
L3_S33_D3870   -0.162181
L3_S33_D3872   -0.162181
L3_S33_D3874   -0.162181
L3_S34_D3875   -0.145119
L3_S34_D3877   -0.145119
L3_S34_D3879   -0.145119
L3_S34_D3881   -0.145119
L3_S34_D3883   -0.145119
Response        1.000000
dtype: float64
L3_S32_D3852    0.271615
L3_S33_D3856   -0.211041
L3_S33_D3858   -0.211041
L3_S33_D3860   -0.211041
L3_S33_D3862   -0.211041
L3_S33_D3864   -0.211041
L3_S33_D3866   -0.211041
L3_S33_D3868   -0.211041
L3_S33_D3870   -0.211041
L3_S33_D3872   -0.211041
L3_S33_D3874   -0.211041
L3_S34_D3875   -0.176664
L3_S34_D3877   -0.176664
L3_S34_D3879   -0.176664
L3_S34_D3881   -0.176664
L3_S34_D3883   -0.176664
Response        1.000000
dtype: float64


### Now I have narrowed down the columns to high correlation columns:

And I can start working with combinations now!  FEATURE ENGINEERING!!!

In [22]:
boolCorr = pd.concat([numericBooleanCorr, categoricalBooleanCorr, dateBooleanCorr]).drop('Response').abs().sort_values(ascending=False)

In [23]:
boolCorr

L3_S32_F3854    0.276364
L3_S32_F3851    0.276364
L3_S32_D3852    0.271615
L3_S32_F3850    0.271615
L3_S33_D3856    0.211041
L3_S33_F3855    0.211041
L3_S33_F3871    0.211041
L3_S33_F3873    0.211041
L3_S33_F3865    0.211041
L3_S33_F3863    0.211041
L3_S33_F3861    0.211041
L3_S33_D3874    0.211041
L3_S33_F3859    0.211041
L3_S33_F3857    0.211041
L3_S33_F3869    0.211041
L3_S33_F3867    0.211041
L3_S33_D3858    0.211041
L3_S33_D3860    0.211041
L3_S33_D3862    0.211041
L3_S33_D3864    0.211041
L3_S33_D3866    0.211041
L3_S33_D3868    0.211041
L3_S33_D3870    0.211041
L3_S33_D3872    0.211041
L3_S34_D3875    0.176664
L3_S34_F3882    0.176664
L3_S34_F3876    0.176664
L3_S34_F3878    0.176664
L3_S34_F3880    0.176664
L3_S34_D3879    0.176664
L3_S34_D3881    0.176664
L3_S34_D3877    0.176664
L3_S34_D3883    0.176664
dtype: float64

It looks like from BoolCorr that I can use S34, S33, S32_F3850, and S32_F3851 as columns to test combinations of.

So let's get every combination of these 4 columns

In [25]:
from itertools import combinations

In [37]:
combosize = 2

boolCols = ['L3_S34_F3882', 'L3_S33_F3855', 'L3_S32_F3850', 'L3_S32_F3851']

In [32]:
list(combinations(boolCols, combosize))

[('S34_F3882', 'S33_F3855'),
 ('S34_F3882', 'S32_F3850'),
 ('S34_F3882', 'S32_F3851'),
 ('S33_F3855', 'S32_F3850'),
 ('S33_F3855', 'S32_F3851'),
 ('S32_F3850', 'S32_F3851')]

In [51]:
def getComboColBooleanDf(df, colList, combosize):
    '''Returns a version of DF that has columns that are Combinations of the original DF columns
    colList = list of columns to be in combinations
    comboSize = combination size
    '''
    from itertools import combinations
        
    df = df[colList]
    combos = list(combinations(colList, combosize))
        
    comboDf = pd.DataFrame(np.zeros((df.shape[0], len(combos))))
    comboDf.columns = [s

In [54]:
test = train_feature_sample.drop(['Id', 'Response'], axis=1)


getComboColBooleanDf(test, boolCols, 3)

MemoryError: 

# --------------------------------Testing Combos Codes-----------------------------------

In [None]:
def getListFrequency(item, listOfItems):
    count = 0
    for i in listOfItems:
        if i == item:
            count += 1
    return count

def getConnectionFrequency(inputList, allConnections):
    
    frequencies = np.zeros(len(allConnections))

    for i in range(0, len(allConnections)):
        item = allConnections[i]
        frequencies[i] = getListFrequency(item, inputList)#number of allConnections[i] in inputList
    
    return frequencies.reshape(1,len(allConnections))

In [None]:
%%time
# This is when I can enter the Connection Combo Size that I want to test

comboSize = 1
wantUniqueConnections = True
#-------------------------------------

def getConnections_Custom_Uniques(inputSeries):
    return getConnections(inputSeries, combosize=comboSize, unique=wantUniqueConnections, stringOutput=True)

seriesConnections = train_feature_sample_stationcols.apply(getConnections_Custom_Uniques, axis=1)

In [None]:
%%time
#TESTING
# Creating a SET of all connections for the columns for my Connections DF... then converting to a list

allConnections = set(())

for product in seriesConnections:
    allConnections.update(product)

allConnections = list(allConnections)

In [None]:
%%time
connectionFrequency = pd.DataFrame(np.zeros((seriesConnections.size, len(allConnections))), columns=pd.Series(allConnections), index=seriesConnections.index)

for row in seriesConnections.index:
    connectionFrequency.loc[row] = getConnectionFrequency(seriesConnections[row], allConnections)

results = train_feature_sample_stationcols['Response']

print(connectionFrequency.head())

# Dimensionality Reduction #1

### Principle Component Analysis

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()
scaledData = scaler.fit_transform(connectionFrequency)

In [None]:
from sklearn.decomposition import PCA

In [None]:
n_components = 2

pca = PCA(n_components=n_components)
pca.fit(scaledData)

scaledData.shape

In [None]:
x_pca = pca.transform(scaledData)

x_pca.shape #I just reduced MANY dimensions to just 2!

In [None]:
plt.figure(figsize=(8,6))
plt.scatter(x_pca[:,0],x_pca[:,1],c=results,cmap='cool', s=15, alpha=0.5)
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')

# Dimensionality Reduction #2

### Independent Component Analysis

In [None]:
from sklearn.decomposition import FastICA

In [None]:
n_components = 2

ica = FastICA(n_components=n_components)
ica.fit(scaledData)

scaledData.shape

In [None]:
x_ica = ica.transform(scaledData)

x_ica.shape

In [None]:
plt.figure(figsize=(8,6))
plt.scatter(x_ica[:,0],x_ica[:,1],c=results,cmap='cool', s=15, alpha=0.5)
plt.xlabel('First Independent Component')
plt.ylabel('Second Independent Component')

# Dimensionality Reduction #3

### t-Distributed Stochastic Neighbor Embedding (t-SNE)

https://www.analyticsvidhya.com/blog/2018/08/dimensionality-reduction-techniques-python/

In [None]:
from sklearn.manifold import TSNE

In [None]:
n_iter = 300

tsne = TSNE(n_components=2, n_iter=n_iter).fit(scaledData)

In [None]:
tsne = TSNE(n_components=2, n_iter=n_iter).fit_transform(scaledData)

In [None]:
plt.figure(figsize=(12,8))
plt.title('t-SNE components')
plt.scatter(tsne[:,0], tsne[:,1])
plt.scatter(tsne[:,1], tsne[:,0])

# Dimensionality Reduction #4

### Uniform Manifold Approximation and Projection (UMAP)

# ------------------------------Random Forest --------------------------------------------

In [None]:
from sklearn.model_selection import train_test_split

(X_train, X_test, y_train, y_test) = train_test_split(x_pca, results, test_size=0.20)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
tree = RandomForestClassifier(n_estimators=60)
tree.fit(X_train, y_train)

In [None]:
importances = tree.feature_importances_

indices = np.argsort(importances[-9:]) #top 10 features

In [None]:
features = connectionFrequency.columns

plt.title('Feature Importances')

plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')

In [None]:
y_predict_tree = tree.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
print(confusion_matrix(y_test, y_predict_tree))
print(classification_report(y_test, y_predict_tree))

# ----------------------------------Support Vector Machines-----------------------------

In [None]:
from sklearn.svm import SVC

In [None]:
svm = SVC()
svm.fit(X_train, y_train)

In [None]:
y_predict_svm = svm.predict(X_test)

In [None]:
print(confusion_matrix(y_test, y_predict_svm))
print(classification_report(y_test, y_predict_svm))