In [None]:
import matplotlib.pyplot as plt
import numpy as np
import nil
import pandas as pd
import seaborn as sns
import time


from os import listdir
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [None]:
def forest_test(X, Y, test_size=0.30, random_state=51, n_estimators=700):
    """Train a random forest on a hold-out split and print its evaluation.

    The data is split into train and test partitions, a
    ``RandomForestClassifier`` is fitted on the training part, and the
    CPU time spent fitting, the confusion matrix and the classification
    report for the test part are printed.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix, one row per sample.
    Y : array-like or Series
        Class label for every row of ``X``.
    test_size : float, default 0.30
        Fraction of the samples held out for evaluation.
    random_state : int, default 51
        Seed used for both the train/test split and the forest, so that
        repeated runs produce the same report.
    n_estimators : int, default 700
        Number of trees in the forest.

    Returns
    -------
    None
        All results are printed.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )

    # process_time() counts CPU time of this process, not wall-clock time.
    start = time.process_time()
    # Seeding the forest as well as the split: bootstrap sampling and
    # feature sub-sampling are random, so an unseeded forest gives a
    # different report on every run.
    trained_forest = RandomForestClassifier(
        n_estimators=n_estimators, random_state=random_state
    ).fit(x_train, y_train)
    print(time.process_time() - start)

    prediction_forest = trained_forest.predict(x_test)
    print(confusion_matrix(y_test, prediction_forest))
    print(classification_report(y_test, prediction_forest))

In [None]:
def read_data_from_directory(wafer_class, machine_step):
    """Load every CSV of one wafer class / machine step and summarise it.

    All files found in ``../Wafer_Data/<wafer_class>/<machine_step>/`` are
    read with ``pd.read_csv`` (in sorted file-name order) and stacked. The
    stacked rows are grouped by ``WaferID`` and ``STEP ID`` and summarised
    with ``describe(percentiles=[])``, which yields one row per group and
    a two-level column index of (original column, statistic).

    Parameters
    ----------
    wafer_class : str
        Name of the class sub-directory. If it contains ``'good'`` a
        ``CLASS`` column with value ``'Good'`` is appended; otherwise, if
        it contains ``'bad'``, the value is ``'Bad'``. If it contains
        neither, no ``CLASS`` column is added.
    machine_step : str
        Name of the machine-step sub-directory inside ``wafer_class``.

    Returns
    -------
    pandas.DataFrame
        Per-(WaferID, STEP ID) summary statistics, with the ``CLASS``
        label as the last column when one applies.

    Raises
    ------
    ValueError
        If the directory contains no files.
    """
    directory = "../Wafer_Data/" + wafer_class + "/" + machine_step + "/"

    # Read everything first and concatenate once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame inside the loop copies
    # all previously read rows on every iteration.
    frames = []
    for file in sorted(listdir(directory)):
        print("Reading File {0}".format(file))
        frames.append(pd.read_csv(directory + file))

    if not frames:
        raise ValueError("No data files found in {0}".format(directory))

    df = pd.concat(frames)
    df = df.groupby(['WaferID', "STEP ID"]).describe(percentiles=[])

    # The label goes last so callers can split features from the target
    # by position.
    if 'good' in wafer_class:
        df.insert(loc=len(df.columns), column="CLASS", value='Good')
    elif 'bad' in wafer_class:
        df.insert(loc=len(df.columns), column="CLASS", value='Bad')
    return df

In [None]:
def _drop_stat_columns(frame, stat, message):
    """Return ``frame`` without the columns whose second-level label is ``stat``.

    ``frame`` must have two-level columns of (feature, statistic). The
    position of every removed column is printed after ``message``, from
    the right-most column to the left-most.
    """
    stat_labels = frame.columns.get_level_values(1)
    positions = [pos for pos, label in enumerate(stat_labels) if label == stat]
    for pos in reversed(positions):
        print(message, pos)
    # Selecting by label keeps this correct no matter how many statistics
    # each feature contributes; a fixed positional stride would not be.
    return frame.loc[:, stat_labels != stat]


def run_z1():
    """Run the Z1_100 good/bad wafer classification experiment.

    Steps:
      1. Load the summarised ``good_wafer`` and ``bad_wafer`` data for
         machine step ``Z1_100``.
      2. Stack both into one frame, restricted to the columns of the good
         wafer frame.
      3. Remove the ``count`` and ``50%`` statistic of every feature.
      4. Show a correlation heatmap of the remaining numeric columns.
      5. Use every column but the last as features (missing values set to
         0) and the last column as the label, and pass them to
         ``forest_test``.

    Returns
    -------
    None
    """
    # reading all Z1 Data
    good_z1_df = read_data_from_directory("good_wafer", "Z1_100")
    bad_z1_df = read_data_from_directory("bad_wafer", "Z1_100")

    # Creating combined dataset of both good and bad.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
    # reindex keeps exactly the columns of the good-wafer frame.
    df = pd.concat([good_z1_df, bad_z1_df]).reindex(columns=good_z1_df.columns)

    # Removing "COUNT" Column
    df = _drop_stat_columns(df, "count", 'Removing Column Number: ')

    # Removing 50% Columns
    df = _drop_stat_columns(df, "50%", 'Removing Columns Number: ')

    # Configuring the Heatmap to make it easier to see.
    # Only numeric columns are correlated: the string label in the last
    # column makes DataFrame.corr() raise on pandas >= 2.0.
    plt.figure(figsize=(30, 30))
    sns.heatmap(df.select_dtypes(include="number").corr(), annot=True)
    plt.show()

    # Dividing into Inputs and Outputs and run Random Forest Classification
    x_df = df.iloc[:, :-1].fillna(0)
    y_df = df.iloc[:, -1]
    forest_test(x_df, y_df)

In [None]:
# Run the Z1_100 experiment: load the wafer data, plot the correlation heatmap and evaluate the random forest.
run_z1()