In [None]:
# Notes
# Imbalanced dataset, None > Focus Spot > Scratch
# Clear differences between Focus Spot and Scratch
# Scratch has smaller area and larger length, while Focus Spot has the opposite
# Area and length are related to number of clusters

# Imbalanced dataset: for a nearest-neighbors model, might want neighbors = 1
# Weights = distance: the closer the neighbor, the more its vote counts
# Weights = uniform: the top x neighbors all have the same voting power!

#<-------20---------><-------20---------><-------20---------><-------18------->


In [22]:
import joblib
import pickle

import hdbscan
import numpy as np
import pandas as pd

from scipy.spatial import ConvexHull, QhullError
from sklearn.cluster import DBSCAN
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# 1. Train best classifier (Random Forest)

In [6]:
# Prepare data for modeling
model_df = pd.read_csv('model_dataset/labeled_processed_data.csv')

# Collapse the one-hot label columns into a single target column. Rows with
# neither flag set keep the default 'none'; 'focus_spot' is assigned last, so
# it wins if a row has both flags set.
model_df['class'] = 'none'
model_df.loc[model_df['scratch'] == 1, 'class'] = 'scratch'
model_df.loc[model_df['focus_spot'] == 1, 'class'] = 'focus_spot'

# Drop identifiers and the original label columns so that only features and
# the target remain.
model_df = model_df.drop(columns=['LOT_ID', 'WAFER_ID', 'WAFER_SCRIBE_ID',
                                  'c_id', 'none', 'scratch', 'focus_spot'])
X = model_df.drop(columns='class')
y = model_df['class']

# Stratify so the class proportions are the same in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y,
                                                    test_size=0.15,
                                                    random_state=11)

# Train model
# max_features='sqrt' is what the removed 'auto' alias meant for classifiers.
# random_state makes the fitted forest reproducible across kernel restarts.
clf = RandomForestClassifier(n_estimators=700,
                             class_weight='balanced_subsample',
                             min_samples_split=5,
                             min_samples_leaf=1,
                             max_features='sqrt',
                             max_depth=None,
                             bootstrap=False,
                             random_state=11)
clf.fit(X_train, y_train)

# Save the model to disk (context manager guarantees the handle is closed)
filename = 'gross_failure_classifier.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)

In [7]:
# Load the model from disk
# The file is written with pickle.dump, so read it back with the matching
# pickle.load. Only unpickle files you created yourself: unpickling untrusted
# data can execute arbitrary code.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [8]:
# Input csv from folder, cluster, calculate features, predict!!!

In [23]:
data_path = 'labeled_dataset/QAFQN156KOG7.csv'

df = pd.read_csv(data_path)
df = df.rename(columns={'REAL_WAFER_X': 'X', 'REAL_WAFER_Y': 'Y'})
df[['X', 'Y']] = df[['X', 'Y']]/1000

# Cluster with DBSCAN
# DBSCAN labels noise as -1, so adding 1 makes 0 the "no cluster" id.
# NOTE(review): eps is expressed in the units of X/Y *after* the /1000 scaling
# above. The saved outputs of this notebook show a single cluster id for the
# whole wafer and a cluster length below 1, which is what an eps much larger
# than the data extent would produce -- confirm the coordinate units and eps
# agree.
clusterer = DBSCAN(eps=10, min_samples=4).fit(df[['X', 'Y']])
df['c_id'] = 1 + clusterer.labels_

# Re-label DBSCAN noise points with HDBSCAN
# NOTE(review): HDBSCAN is fit on *all* points, not only the DBSCAN outliers;
# its labels are then used only for rows that DBSCAN marked as noise. Confirm
# this matches how the training data was produced.
df_sub = df.copy()
clusterer = hdbscan.HDBSCAN(min_samples=4)
clusterer.fit(df_sub[['X', 'Y']])
df_sub['c_id'] = 1 + clusterer.labels_

# df_sub shares df's index, so the labels are combined by index alignment.
# (An inner merge on X/Y would multiply rows whenever two defects share the
# same coordinates.)
relabel = (df['c_id'] == 0) & (df_sub['c_id'] != 0)
# Offset by the largest DBSCAN id so HDBSCAN ids cannot collide with it.
df.loc[relabel, 'c_id'] = df_sub.loc[relabel, 'c_id'] + df['c_id'].max()

# Any clusters with 4 or fewer samples are not considered issues
cluster_size = df['c_id'].value_counts()
df.loc[df['c_id'].map(cluster_size) <= 4, 'c_id'] = 0

def PolygonArea(xy):
    """Compute the area enclosed by an ordered ring of xy vertices.

    Uses the shoelace formula on coordinates shifted by their column means.

    Arg:
        xy (array): 2D array with x coordinates as the first column and
            y coordinates as the second column, listed in order around
            the polygon
    Return:
        polygon_area (float): Area of the shape created by xy coordinates
    """
    # Shift each coordinate by its column mean before forming the products.
    xs = xy[:, 0] - xy[:, 0].mean()
    ys = xy[:, 1] - xy[:, 1].mean()

    # Cross terms for every consecutive vertex pair (i, i + 1).
    forward = np.dot(xs[:-1], ys[1:])
    backward = np.dot(ys[:-1], xs[1:])

    # Cross term for the closing edge from the last vertex back to the first.
    closing = xs[-1]*ys[0] - ys[-1]*xs[0]

    return 0.5*np.abs((forward - backward) + closing)

def area(cluster_df):
    """Calculate the area of the convex hull of a cluster's xy coordinates.

        Arg:
            cluster_df (dataframe): Cluster dataframe containing 'X' and 'Y'
                coordinate columns
        Return:
            cluster_area (float): Area of the cluster created by outer xy
                coordinates; 0 when the cluster has fewer than three points
                or its points do not span a 2D region (e.g. all on one line)
    """
    cluster_xy = cluster_df[['X', 'Y']].values

    # Fewer than three points cannot enclose any area.
    if cluster_xy.shape[0] < 3:
        return 0.0

    # Cheap guard for perfectly vertical / horizontal lines of points.
    if np.ptp(cluster_xy[:, 0]) == 0 or np.ptp(cluster_xy[:, 1]) == 0:
        return 0.0

    try:
        hull = ConvexHull(cluster_xy)
    except QhullError:
        # Qhull cannot build an initial simplex from degenerate input, such
        # as points lying on a diagonal line; such a cluster has zero area.
        return 0.0

    # hull.vertices indexes the outer points of the cluster.
    hull_pts = cluster_xy[hull.vertices, :]
    return PolygonArea(hull_pts)

def cluster_length(cluster_df):
    """Estimate the length (extent) of a cluster.

    The estimate is anchored on the point farthest from the coordinate
    origin and measures the distance from that anchor to the point farthest
    from it. This is not guaranteed to be the largest distance between any
    two points of the cluster.

    Arg:
        cluster_df (dataframe): Cluster dataframe containing xy coordinate
            columns
    Return:
        cluster_len (float): Distance between the anchor point and the
            point farthest from it; 0 when the cluster has fewer than two
            points
    """
    pts = cluster_df[['X', 'Y']].values

    # A single point (or none) has no extent.
    if pts.shape[0] < 2:
        return 0

    # Anchor: the point with the largest distance from (0, 0).
    radii = np.sqrt(np.sum(pts**2, axis=1))
    anchor = pts[np.argmax(radii), :]

    # Opposite end: the point with the largest distance from the anchor.
    offsets = np.sqrt(np.sum((pts - anchor)**2, axis=1))
    far_end = pts[np.argmax(offsets), :]

    return np.linalg.norm(anchor - far_end)

# Calculate features by wafer
wafer_keys = ['LOT_ID', 'WAFER_ID', 'WAFER_SCRIBE_ID']

# Number of points on each wafer
num_wafer_pt_df = (df.groupby(wafer_keys)
                     .size()
                     .reset_index(name='NUM_WAFER_PT'))

# Mean X/Y of all points on each wafer
cent_wafer_xy_df = (df.groupby(wafer_keys)
                      .agg({'X': 'mean', 'Y': 'mean'})
                      .reset_index()
                      .rename(columns={'X': 'WAFER_CENTX',
                                       'Y': 'WAFER_CENTY'}))

# Number of distinct clusters on each wafer (c_id 0 means "no cluster")
num_cluster_df = (df.loc[df['c_id'] != 0, :]
                    .groupby(wafer_keys)
                    .agg({'c_id': 'nunique'})
                    .reset_index()
                    .rename(columns={'c_id': 'NUM_CLUSTER'}))

# Combine all wafer features
wafer_feat_df = num_wafer_pt_df.merge(cent_wafer_xy_df, on=wafer_keys)
# Left join: wafers without any cluster are kept and get NUM_CLUSTER = 0.
wafer_feat_df = wafer_feat_df.merge(num_cluster_df, on=wafer_keys,
                                    how='left')
# Assign the result back; an in-place fillna on a selected column is not
# guaranteed to update the parent frame.
wafer_feat_df['NUM_CLUSTER'] = wafer_feat_df['NUM_CLUSTER'].fillna(0)

# Calculate features by clusters
cluster_keys = ['LOT_ID', 'WAFER_ID', 'WAFER_SCRIBE_ID', 'c_id']
label_cols = ['none', 'scratch', 'focus_spot']

# Point counts. The label columns are part of the grouping, so a cluster
# whose points carry different labels yields one row per label combination.
num_cluster_pt_df = (df.groupby(cluster_keys + label_cols)
                       .size()
                       .reset_index(name='NUM_PT'))

# Cluster centroid and its distance from the coordinate origin
cent_cluster_xy_df = (df.groupby(cluster_keys)
                        .agg({'X': 'mean', 'Y': 'mean'})
                        .reset_index()
                        .rename(columns={'X': 'CENTX', 'Y': 'CENTY'}))
cent_cluster_xy_df['DIST_FROM_CENTER'] = np.sqrt(
    cent_cluster_xy_df['CENTX']**2
    + cent_cluster_xy_df['CENTY']**2)

# Area and length per cluster; only the X/Y columns are handed to the helper
# functions, which read nothing else.
area_cluster_df = (df.groupby(cluster_keys)[['X', 'Y']]
                     .apply(area)
                     .reset_index()
                     .rename(columns={0: 'AREA'}))
len_cluster_df = (df.groupby(cluster_keys)[['X', 'Y']]
                    .apply(cluster_length)
                    .reset_index()
                    .rename(columns={0: 'LENGTH'}))

# Combine all cluster features
# (cent_cluster_xy_df is merged once; a repeated merge of it added nothing.)
# NOTE(review): wafer_feat_df is computed above but never joined into
# cluster_feat_df -- confirm whether the classifier expects wafer-level
# features as well.
cluster_feat_df = num_cluster_pt_df.merge(cent_cluster_xy_df, on=cluster_keys)
cluster_feat_df = cluster_feat_df.merge(area_cluster_df, on=cluster_keys)
cluster_feat_df = cluster_feat_df.merge(len_cluster_df, on=cluster_keys)

In [26]:
cluster_feat_df
# FIXME: needs correcting -- c_id is identical on every row below, but 4
# were expected (presumably 4 distinct cluster ids for this wafer).

Unnamed: 0,LOT_ID,WAFER_ID,WAFER_SCRIBE_ID,c_id,none,scratch,focus_spot,NUM_PT,CENTX,CENTY,DIST_FROM_CENTER,AREA,LENGTH
0,L9304630,156,QAFQN156KOG7,1,0,0,0,95,-0.008448,0.109805,0.11013,0.063826,0.29233
1,L9304630,156,QAFQN156KOG7,1,0,0,1,330,-0.008448,0.109805,0.11013,0.063826,0.29233
2,L9304630,156,QAFQN156KOG7,1,0,1,0,815,-0.008448,0.109805,0.11013,0.063826,0.29233
3,L9304630,156,QAFQN156KOG7,1,1,0,0,93,-0.008448,0.109805,0.11013,0.063826,0.29233


In [28]:
# Distinct cluster ids present in df after clustering and size filtering
df['c_id'].unique()

array([1], dtype=int64)