# Data Cleaning
Initial data cleaning and preprocessing for the project. This notebook is used to create the preprocessing pipeline in `pipeline.ipynb` and to clean the data for the exploratory data analysis and machine learning notebooks. It outputs a cleaned dataset to `CEIP_csv/cleaned.csv`.

#### Setup

In [1]:
# CORE
import pandas as pd
import os
import json
import numpy as np  # Numpy for numerical computations and array operations
import pandas as pd  # Pandas for data manipulation and analysis

# MACHINE LEARNING & STATISTICS 
import scipy.stats as stats  # SciPy for scientific computing and technical computing, including statistics
import sklearn as sk # Scikit-learn for machine learning and predictive modeling

# VISUALIZATION
import matplotlib.pyplot as plt  # Matplotlib for creating static, animated, and interactive visualizations
import seaborn as sns  # Seaborn for statistical data visualization built on top of Matplotlib
import plotly.express as px  # Plotly Express for creating interactive plots and charts
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
# Path of the CSV file analysed in this notebook.
training_csv = "../CEIP_csv/training_data.csv"

# The remaining project tables are not loaded here. Their DataFrame names and
# file paths are kept for reference (load each with pd.read_csv if needed):
#   autonest_df          <- "CEIP_csv/AutoNest.csv"
#   autonest_strategy_df <- "CEIP_csv/AutoNestStrategy.csv"
#   material_df          <- "CEIP_csv/Material.csv"
#   nest_df              <- "CEIP_csv/Nest.csv"
#   part_df              <- "CEIP_csv/Part.csv"
#   performance_df       <- "CEIP_csv/Performance.csv"

# Read the training table into a pandas DataFrame (takes about 1 min to read).
training_df = pd.read_csv(filepath_or_buffer=training_csv)

## Data Cleaning

#### Data Previewing

In [3]:
# count the number of null values 
# training_df.isnull().sum() # --> RESULT: no null values in any column  

# Names of the columns in the training table, hard-coded one per line.
# Dynamic alternative, kept disabled for reference:
# column_list = list(training_df.columns)
column_list = [
    'ixJobSummary',
    'ixNest',
    'ixPart',
    'dPartTrueArea',
    'cRequired',
    'cNested',
    'ixMaterial',
    'fExtShape',
    'dExtArea',
    'dExtBoundaryDist',
    'dExtContainedDist',
    'dLgIntArea',
    'dLgIntBoundaryDist',
    'dLgIntContainedDist',
    'dLgExtConArea',
    'dLgExtConBoundaryDist',
    'dLgExtConContainedDist',
    'cTimesCut',
    'dNestingTime',
    'fStrategies',
    'dSheetLength',
    'dSheetWidth',
    'dSheetArea',
    'dLengthUsed',
    'dWidthUsed',
    'dPartArea',
    'calcUtil',
    'ixAutoNestStrategy',
    'fAllPartsNested',
]

# count the values for the ixMaterial column in the training_df 
# training_df.ixMaterial.value_counts()

# plot the distribution of the ixMaterial column in the training_df
# only plot the 10 most common values
# training_df.ixMaterial.value_counts().nlargest(10).plot(kind='bar', figsize=(10,5))

# count the number of unique values
# training_df.nunique(axis=0)

# ixJobSummary               224892
# ixNest                     224892
# ixPart                    4200357
# dPartTrueArea              984974
# cRequired                    1534
# cNested                      1654
# ixMaterial                   7316
# fExtShape                      52
# dExtArea                   825580
# dExtBoundaryDist           159907
# dExtContainedDist          526206
# dLgIntArea                  74924
# dLgIntBoundaryDist          35721
# dLgIntContainedDist         83215
# dLgExtConArea               71156
# dLgExtConBoundaryDist       24334
# dLgExtConContainedDist      68328
# cTimesCut                     176
# dNestingTime                23997
# fStrategies                   292
# dSheetLength                 2917
# dSheetWidth                  1937
# dSheetArea                  13217
# dLengthUsed                136950
# dWidthUsed                 112688
# dPartArea                  168268
# calcUtil                   172012
# ixAutoNestStrategy             13
# fAllPartsNested                 2
# dtype: int64

In [4]:
# The unique-value counts recorded above list only ~4.2 million distinct
# ixPart values, which suggests that some rows may be duplicated.

# Drop rows that are exact duplicates, keeping the first occurrence of each.
training_df = training_df.drop_duplicates(keep='first')

# Shape after de-duplication --> (5,762,622 rows, 29 columns)
training_df.shape

(5762622, 29)

#### Re-encoding Variables
* One-hot encode all variables that are numerical but represent categories
* Replace ixMaterial and ixAutoNestStrategy with their appropriate values from the JSON file 
* Limit to only rows whose Material is mild steel (`ms`)

In [5]:
# Swap the ixMaterial codes for the names stored in MaterialTypes.json,
# then rename the column to 'Material'.

# Load the JSON file as a DataFrame
material_types_df = pd.read_json('../CEIP_csv/MaterialTypes.json')

# Nested dictionary form of that table: {column label: {row label: value}}
material_dict = material_types_df.to_dict()

# Flatten it to {column label: that column's 'sName' entry}
sName_dict = dict(
    (material_key, material_fields['sName'])
    for material_key, material_fields in material_dict.items()
)

# Look up every ixMaterial value in sName_dict; values without a matching key
# become NaN.
training_df['ixMaterial'] = training_df['ixMaterial'].map(sName_dict)

# The column now holds names rather than ids, so rename it accordingly
training_df = training_df.rename(columns={'ixMaterial': 'Material'})

In [6]:
# Number of most frequent materials shown in the bar chart. The same value
# drives both the row selection and the chart title so they cannot drift apart.
TOP_N_MATERIALS = 100

# Count the rows for each material type
material_counts = training_df['Material'].value_counts()
# results: 4,072,515 rows are MS, the next most is SS with 191,961

# count the number of unique materials
nunique_materials = training_df.Material.nunique()
print(f'There are {nunique_materials} unique materials in the dataset.')

# Convert the Series to a DataFrame
material_counts_df = material_counts.reset_index()

# Rename the columns for clarity
material_counts_df.columns = ['Material', 'Count']

# Sort by 'Count' in descending order and keep the TOP_N_MATERIALS most common materials
top_material_counts_df = material_counts_df.sort_values('Count', ascending=False).head(TOP_N_MATERIALS)

# Bar chart of the selected materials
fig = px.bar(
    top_material_counts_df,
    x='Material',
    y='Count',
    title=f'Distribution of Top {TOP_N_MATERIALS} Material Types',
)
fig.show()

There are 2117 unique materials in the dataset.


In [7]:
# Keep only the rows whose Material is exactly 'ms' (mild steel).
# This is an equality test, not a substring match. It evaluates to False for
# missing values, so rows with a NaN Material are removed by the same filter
# and no separate dropna pass over the full frame is needed.
training_df = training_df[training_df['Material'] == 'ms']

# Every remaining row has the same Material, so the column carries no
# information; drop it from the frame that goes on to be encoded.
encoded_df = training_df.drop(columns=['Material'])

In [14]:
# re-encode the fStrategies column 

# fStrategies = int 
# This is a bitmask comprised of the following nesting strategies:

# 0x00000000 = None
# 0x00000001 = Strategy1
# 0x00000002 = Strategy2	
# 0x00000004 = Strategy3	
# 0x00000008 = Strategy4	
# 0x00000010 = Strategy5	
# 0x00000020 = Strategy6	
# 0x00000040 = Strategy7	
# 0x00000080 = Strategy8	
# 0x00000100 = Strategy9	
# 0x00000200 = Strategy10	
# 0x00000400 = Block nesting	
# 0x00000800 = Block optimization 
# 0x00001000 = IntelliNest	
# 0x00002000 = IC Profile nesting	
# 0x00004000 = IC Pattern and fill
# 0x80000000 = Manual nesting	

# convert the bitmask values in the fStrategies to the string -> one-hot-encoding 
# result: 
# or is it good to use a bitmask? what problems might that lead to? is it good? 

# ?? what does it mean when it has a strategy in fStrategies AND one in ixAutoNestStrategy? 

# !! ask Mark - can we safely assume that if fStrategies uses IntelliNest, ixAutoNestStrategy is IntelliNest? 
# What additional info does ixAutoNestStrategy give us? 

# ?? Can a given Job use both manual nesting (chosen strategies in fStrategies) AND IntelliNest? 

# can also do a classifier that tries to predict the strategy used given the other columns (part characteristics)

# next step: can you make a multi-class classifier + regression model? 

# classify the strategy -> classify the strategy that yields the optimal util, GIVEN the part characteristics

In [8]:
# Read the AutoNest strategy lookup dictionary from its JSON file
with open('../CEIP_csv/AutoNestStrategy.json', 'r') as strategy_file:
    autoneststrategy_dict = json.load(strategy_file)

# Cast ixAutoNestStrategy to str, then replace each value with its entry in
# the lookup dictionary (presumably the dictionary is keyed by id strings,
# as JSON object keys are always strings)
encoded_df['ixAutoNestStrategy'] = (
    encoded_df['ixAutoNestStrategy']
    .astype(str)
    .map(autoneststrategy_dict)
)

# Number of rows per strategy
strategy_counts = encoded_df['ixAutoNestStrategy'].value_counts()

# Pie chart built directly from the counts Series
fig = px.pie(
    strategy_counts,
    values=strategy_counts.values,
    names=strategy_counts.index,
    title='Distribution of Strategies Used in AutoNest',
)
fig.show()

In [9]:
# One-hot encode the categorical ixAutoNestStrategy column.
# With prefix='' and the default '_' separator, each indicator column is named
# '_<column value>' (e.g. '_Strategy_4' in the recorded output).
# dtype is pinned to uint8 so the indicators are 0/1 integers on every pandas
# version; from pandas 2.0 the default is bool, which would be written to the
# cleaned CSV as True/False instead of 0/1.
encoded_df = pd.get_dummies(
    encoded_df,
    columns=['ixAutoNestStrategy'],
    prefix='',
    dtype='uint8',
)

# Rename indicator columns of the form '_<dictionary key>' to the dictionary value.
# NOTE(review): ixAutoNestStrategy was already mapped through
# autoneststrategy_dict before encoding, so the indicator columns are named
# after the dictionary *values*; the recorded output still shows '_Strategy_N'
# columns, i.e. this rename matched nothing. Confirm which column names the
# downstream notebooks expect before changing the resulting schema.
rename_dict = {f'_{k}': v for k, v in autoneststrategy_dict.items()}
encoded_df = encoded_df.rename(columns=rename_dict)

encoded_df.head()

Unnamed: 0,ixJobSummary,ixNest,ixPart,dPartTrueArea,cRequired,cNested,fExtShape,dExtArea,dExtBoundaryDist,dExtContainedDist,...,_Strategy_1,_Strategy_10,_Strategy_2,_Strategy_3,_Strategy_4,_Strategy_5,_Strategy_6,_Strategy_7,_Strategy_8,_Strategy_9
0,304409,746165,2377314,151.1253,5,5,0,151.1253,4.7891,15.8795,...,0,0,0,0,1,0,0,0,0,0
1,304409,746165,2377315,49.5807,10,10,0,49.5807,3.1157,9.9701,...,0,0,0,0,1,0,0,0,0,0
3,304409,746165,2377316,222.0186,5,5,0,222.0186,4.7891,24.3234,...,0,0,0,0,1,0,0,0,0,0
5,304409,746165,2377317,2.5008,120,120,0,2.5008,0.6184,2.7209,...,0,0,0,0,1,0,0,0,0,0
7,304409,746165,2377318,11.5773,55,55,0,11.5773,0.906,5.9537,...,0,0,0,0,1,0,0,0,0,0


In [10]:
# summarize the encoded df with the summary statistics
# NOTE(review): in the recorded output dPartTrueArea ranges from about
# -3.7e+292 to 3.0e+157 (std = inf) and cRequired peaks at 5,555,555. These
# look like corrupt or placeholder values -- confirm, and decide whether such
# rows should be filtered before the cleaned CSV is written.
encoded_df.describe()

Unnamed: 0,ixJobSummary,ixNest,ixPart,dPartTrueArea,cRequired,cNested,fExtShape,dExtArea,dExtBoundaryDist,dExtContainedDist,...,_Strategy_1,_Strategy_10,_Strategy_2,_Strategy_3,_Strategy_4,_Strategy_5,_Strategy_6,_Strategy_7,_Strategy_8,_Strategy_9
count,4072515.0,4072515.0,4072515.0,4072515.0,4072515.0,4072515.0,4072515.0,4072515.0,4072515.0,4072515.0,...,4072515.0,4072515.0,4072515.0,4072515.0,4072515.0,4072515.0,4072515.0,4072515.0,4072515.0,4072515.0
mean,1899243.0,4511965.0,15802080.0,-7.351136e+286,13.14087,10.60778,5.439213,654.0658,5.553442,35.77339,...,0.07025364,0.08826732,0.01309068,0.02128144,0.2868547,0.03062653,0.01107473,0.0105635,0.01534973,0.0142676
std,1042945.0,2441282.0,8736020.0,inf,2759.251,133.832,10.45352,11777.17,29.78956,51.34109,...,0.255574,0.2836833,0.1136632,0.144321,0.4522932,0.1723037,0.1046522,0.1022346,0.1229395,0.1185919
min,35601.0,100763.0,304780.0,-3.742202e+292,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1014986.0,2468597.0,8436690.0,31.5375,1.0,1.0,0.0,32.6625,1.8988,9.6224,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1905288.0,4519524.0,15938770.0,105.8387,2.0,2.0,0.0,110.5994,3.3955,19.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2794505.0,6605744.0,23292710.0,420.1447,5.0,5.0,5.0,438.2475,6.75,42.1407,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
max,3688756.0,8711716.0,30970140.0,2.960661e+157,5555555.0,163231.0,54.0,12316300.0,56736.66,32390.1,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


#### Other data cleaning we might do

In [11]:
# Convert a numeric variable to categorical using custom ranges
# bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
# labels = ['0-10', '11-20', '21-30', '31-40', '41-50', '51-60', '61-70', '71-80', '81-90', '91-100']
# df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels)

In [12]:
# Write the cleaned, encoded frame to CSV; the DataFrame index is not written.
encoded_df.to_csv(path_or_buf='../CEIP_csv/cleaned.csv', index=False)

#### Downsampling Data
Downsampling data to ~1,000,000 samples for early machine learning & analysis