## Data Processing

In [50]:
%pip install imblearn
import pandas as pd
import numpy as np
from numpy import nan
import matplotlib.pyplot as plt
import os
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.feature_selection import f_classif
from imblearn.over_sampling import SMOTE
from collections import Counter

Note: you may need to restart the kernel to use updated packages.


In [51]:
# Load the compiled table and keep only the rows whose "Good/Bad" label is -1 or 1;
# rows with any other label value (including a missing one) are discarded.
COMPILED_CSV = r"C:\Users\karsa\My Drive\Extracurriculars\Research\Audax Labs\notebooks\data\compiled.csv"
df = pd.read_csv(COMPILED_CSV, low_memory=False)
df = df[df["Good/Bad"].isin([-1, 1])]
df.shape

(1537, 629)

In [52]:
# Keep only the columns that have at least MIN_NON_NULL non-missing entries.
MIN_NON_NULL = 1500
df = df.dropna(axis="columns", thresh=MIN_NON_NULL)
df.shape

(1537, 532)

In [53]:
# Names of the numeric columns to impute, as a plain list.
# The target is excluded by label rather than by slicing off the last entry:
# the positional form silently drops a real feature whenever "Good/Bad" is
# not the last numeric column.
numeric_cols = [
    name
    for name in df.select_dtypes(include=np.number).columns
    if name != "Good/Bad"
]

In [54]:
# Replace missing values in the columns listed in numeric_cols with the
# per-column mean computed by SimpleImputer.
imp = SimpleImputer(strategy="mean")
imputed_values = imp.fit_transform(df[numeric_cols])
df[numeric_cols] = imputed_values

In [55]:
# Collect every column whose entries all compare equal to 0, drop them from
# df in place, and print how many were removed (nothing is printed if none).
cols = [name for name in df.columns if df[name].eq(0).all()]

if len(cols) > 0:
    df.drop(columns=cols, inplace=True)
    print(len(cols))

112


In [56]:
# Partition the column labels by dtype: anything stored as 'object' is
# reported as categorical, everything else as numerical.
is_object_dtype = df.dtypes == 'object'
numerical_columns = df.columns[~is_object_dtype]
categorical_columns = df.columns[is_object_dtype]
print("Numerical columns:", numerical_columns)
print('Categorical Columns:', categorical_columns)

Numerical columns: Index(['Sensor-1', 'Sensor-2', 'Sensor-3', 'Sensor-4', 'Sensor-5', 'Sensor-6',
       'Sensor-7', 'Sensor-8', 'Sensor-9', 'Sensor-10',
       ...
       'Sensor-577', 'Sensor-578', 'Sensor-583', 'Sensor-584', 'Sensor-585',
       'Sensor-587', 'Sensor-588', 'Sensor-589', 'Sensor-590', 'Good/Bad'],
      dtype='object', length=416)
Categorical Columns: Index(['Unnamed: 0', 'Sensor-332', 'Sensor-576', 'Sensor-586'], dtype='object')


In [57]:
# Summary statistics (count / unique / top / freq) for the object-dtype columns.
df.loc[:, categorical_columns].describe()

Unnamed: 0.1,Unnamed: 0,Sensor-332,Sensor-576,Sensor-586
count,1537,1533.0,1537.0,1536.0
unique,1537,309.0,584.0,1473.0
top,Wafer-501,0.1017,0.0862,2.7619
freq,1,26.0,18.0,3.0


In [58]:
# Count how often each distinct value occurs in the last column of df.
df.iloc[:, [-1]].value_counts()

Good/Bad
-1.0        1448
 1.0          89
dtype: int64

In [59]:
# Restrict df to the columns listed in numerical_columns; all other columns are dropped.
df = df.loc[:, numerical_columns]

In [60]:
# Separate the label column from the feature columns.
y = df['Good/Bad']
X = df.drop(columns='Good/Bad')

In [61]:
# Oversample with SMOTE and print the class counts before and after.
# random_state is fixed so the synthetic samples, and everything derived from
# them later in the notebook, are identical on every Restart & Run All.
print('Before', y.value_counts())
smt = SMOTE(random_state=42)
X_resampled, y_resampled = smt.fit_resample(X, y)
print('After', y_resampled.value_counts())

Before -1.0    1448
 1.0      89
Name: Good/Bad, dtype: int64
After -1.0    1448
 1.0    1448
Name: Good/Bad, dtype: int64


In [62]:
# Reassemble a single frame from the resampled features and labels, with the
# label placed as the final column under the name 'Good/Bad'.
resampled_features = pd.DataFrame(X_resampled, columns=X.columns)
resampled_target = pd.Series(y_resampled, name='Good/Bad')
df = pd.concat([resampled_features, resampled_target], axis=1)

In [63]:
# Re-derive the feature matrix and label vector from the current df.
X = df.drop(columns=['Good/Bad'])
y = df['Good/Bad']

In [64]:
def get_unique_numbers(numbers):
    """Return the distinct values of ``numbers`` as a list.

    The values are de-duplicated through a ``set``, so the elements must be
    hashable and the order of the returned list is the set's iteration order,
    not the order of the input.
    """
    return list(set(numbers))

In [65]:
# Show the dimensions of the feature matrix and the label vector, one per line.
print(X.shape, y.shape, sep="\n")

(2896, 415)
(2896,)


In [94]:
# Rank every column by its correlation with the target and keep the
# ten highest-ranked feature names.
correlations = df.corr()
target_correlation = correlations['Good/Bad'].sort_values(ascending=False)
# Remove the target's own entry by label instead of assuming it sorts first:
# a feature tied with it at 1.0 could otherwise take the first slot and the
# target itself would be selected as a "feature".
high_corr_columns = target_correlation.drop('Good/Bad').iloc[:10].index
# NOTE(review): the ranking is on the signed correlation, so strongly
# negatively correlated sensors are never picked - confirm this is intended.
high_corr_columns

Index(['Sensor-57', 'Sensor-134', 'Sensor-76', 'Sensor-28', 'Sensor-164',
       'Sensor-369', 'Sensor-108', 'Sensor-81', 'Sensor-449', 'Sensor-319'],
      dtype='object')

In [95]:
# Build the reduced frame: the selected feature columns plus the label column.
# .copy() makes the selection an independent frame, so adding the label below
# is a plain column assignment and not a write onto a slice of the previous df
# (the pattern that can trigger pandas' SettingWithCopyWarning).
df = df[high_corr_columns].copy()
df["Good/Bad"] = y
df.shape

(2896, 11)

In [96]:
# Class counts of the label column in the reduced frame.
df.loc[:, "Good/Bad"].value_counts()

-1.0    1448
 1.0    1448
Name: Good/Bad, dtype: int64

In [97]:
# Write the reduced frame to disk as UTF-8 CSV, without the row index.
PREPROCESSED_CSV = r"C:\Users\karsa\My Drive\Extracurriculars\Research\Audax Labs\notebooks\data\preprocessed.csv"
df.to_csv(PREPROCESSED_CSV, index=False, encoding='utf-8')