In [860]:
# Analysis of chip data; prepare in separate pipelines
import pandas as pd
import numpy as np
import os, sys
import matplotlib as plt
import seaborn as sns
# Data load
filename = 'chip_dataset.csv'
file = os.path.abspath(os.path.join(os.getcwd(),'..', filename))
df = pd.read_csv(file)
df.head()

Unnamed: 0.1,Unnamed: 0,Product,Type,Release Date,Process Size (nm),TDP (W),Die Size (mm^2),Transistors (million),Freq (MHz),Foundry,Vendor,FP16 GFLOPS,FP32 GFLOPS,FP64 GFLOPS
0,0,AMD Athlon 64 3500+,CPU,2007-02-20,65.0,45.0,77.0,122.0,2200.0,Unknown,AMD,,,
1,1,AMD Athlon 200GE,CPU,2018-09-06,14.0,35.0,192.0,4800.0,3200.0,Unknown,AMD,,,
2,2,Intel Core i5-1145G7,CPU,2020-09-02,10.0,28.0,,,2600.0,Intel,Intel,,,
3,3,Intel Xeon E5-2603 v2,CPU,2013-09-01,22.0,80.0,160.0,1400.0,1800.0,Intel,Intel,,,
4,4,AMD Phenom II X4 980 BE,CPU,2011-05-03,45.0,125.0,258.0,758.0,3700.0,Unknown,AMD,,,


In [861]:
mode = 'binary'
target_label = 'Type' if mode == 'binary' else 'Foundry'
df.drop(['Unnamed: 0', 'Product', 'Release Date'], axis = 1, inplace=True)
df.drop('Foundry' if mode == 'binary' else 'Type', axis = 1, inplace = True)

In [862]:
# Util to get NaNs
def get_null_pc(df1, out = False) :
    null_map = {'gt':{}, 'lt':{}, 'none' : {}, 'other':{}}
    null_pc_map = 1 - df1.count()/len(df1)
    if out == True :
        print(null_pc_map)
    for key, val in null_pc_map.items() :
        pc_type = 'other'
        if val == 0.0 :
            pc_type = 'none'
        elif val > 0.85 :
            pc_type = 'gt'
        elif val < 0.05 and val != 0 :
            pc_type = 'lt'

        if key not in null_map[pc_type] :
            null_map[pc_type][key] = [val]
        else :
            null_map[pc_type][key].append(val)
        
    return null_map
print(df.info())
null_pc = get_null_pc(df, out = True)
print(df.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4854 entries, 0 to 4853
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Type                   4854 non-null   object 
 1   Process Size (nm)      4845 non-null   float64
 2   TDP (W)                4228 non-null   float64
 3   Die Size (mm^2)        4139 non-null   float64
 4   Transistors (million)  4143 non-null   float64
 5   Freq (MHz)             4854 non-null   float64
 6   Vendor                 4854 non-null   object 
 7   FP16 GFLOPS            536 non-null    float64
 8   FP32 GFLOPS            1948 non-null   float64
 9   FP64 GFLOPS            1306 non-null   float64
dtypes: float64(8), object(2)
memory usage: 379.3+ KB
None
Type                     0.000000
Process Size (nm)        0.001854
TDP (W)                  0.128966
Die Size (mm^2)          0.147301
Transistors (million)    0.146477
Freq (MHz)               0.000000
Vendor

In [863]:
# First round of null drops
# Drop rows with < 5% is NaNs
print ("Dropping rows of ", list(null_pc['lt'].keys()))
df.dropna(subset = null_pc['lt'].keys(), inplace=True)

# Drop entire col if col > 85% NaNs
print ("Dropping cols : ", null_pc['gt'].keys())
if 'FP16 GFLOPS' in df.columns :
    df.drop(null_pc['gt'].keys(), axis = 1, inplace = True)
print (df.info())

Dropping rows of  ['Process Size (nm)']
Dropping cols :  dict_keys(['FP16 GFLOPS'])
<class 'pandas.core.frame.DataFrame'>
Index: 4845 entries, 0 to 4853
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Type                   4845 non-null   object 
 1   Process Size (nm)      4845 non-null   float64
 2   TDP (W)                4222 non-null   float64
 3   Die Size (mm^2)        4130 non-null   float64
 4   Transistors (million)  4137 non-null   float64
 5   Freq (MHz)             4845 non-null   float64
 6   Vendor                 4845 non-null   object 
 7   FP32 GFLOPS            1944 non-null   float64
 8   FP64 GFLOPS            1304 non-null   float64
dtypes: float64(7), object(2)
memory usage: 378.5+ KB
None


In [864]:
# Split X + Y
X = df.loc[:, list(set(df.columns) - set([target_label]))]
Y = df.loc[:, [target_label]]

In [865]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
# Transform target label
le = LabelEncoder()
Y[target_label] =  le.fit_transform(Y[target_label])

# One hot encode rest of categorical data 
# #TBD: Study ordinal vs binary effect on scores
enc = OneHotEncoder(sparse_output=False)
obj_vals = list(set(X.columns) - set(X._get_numeric_data().columns))
enc.fit(X.loc[:,obj_vals])
print(enc.categories_)
enc_df = pd.DataFrame(enc.transform(X.loc[:, obj_vals]))
print(X.shape, "\n", enc_df.shape)
X.reset_index(drop=True, inplace=True)
enc_df.reset_index(drop=True, inplace=True)
X.drop(obj_vals, axis=1, inplace=True)
X = pd.concat([X,enc_df], axis=1, ignore_index=False)
print(X.info())

[array(['AMD', 'ATI', 'Intel', 'NVIDIA', 'Other'], dtype=object)]
(4845, 8) 
 (4845, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4845 entries, 0 to 4844
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Die Size (mm^2)        4130 non-null   float64
 1   Transistors (million)  4137 non-null   float64
 2   Freq (MHz)             4845 non-null   float64
 3   TDP (W)                4222 non-null   float64
 4   FP64 GFLOPS            1304 non-null   float64
 5   Process Size (nm)      4845 non-null   float64
 6   FP32 GFLOPS            1944 non-null   float64
 7   0                      4845 non-null   float64
 8   1                      4845 non-null   float64
 9   2                      4845 non-null   float64
 10  3                      4845 non-null   float64
 11  4                      4845 non-null   float64
dtypes: float64(12)
memory usage: 454.3 KB
None


In [866]:
# Fill in NA values
from sklearn.impute import KNNImputer
# Target variable is 'Type'(binary classific) & Vendor, Foundary(multi)
# They have no null values, so we can impute on whole data set before splitting
# print("Cols : ", df.info())
print("Cols now : ", X.info())
print(get_null_pc(X)['other'])
knn_imputer = KNNImputer(n_neighbors = 1)
# df = knn_imputer.fit_transform(df)
# print(df.corr)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4845 entries, 0 to 4844
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Die Size (mm^2)        4130 non-null   float64
 1   Transistors (million)  4137 non-null   float64
 2   Freq (MHz)             4845 non-null   float64
 3   TDP (W)                4222 non-null   float64
 4   FP64 GFLOPS            1304 non-null   float64
 5   Process Size (nm)      4845 non-null   float64
 6   FP32 GFLOPS            1944 non-null   float64
 7   0                      4845 non-null   float64
 8   1                      4845 non-null   float64
 9   2                      4845 non-null   float64
 10  3                      4845 non-null   float64
 11  4                      4845 non-null   float64
dtypes: float64(12)
memory usage: 454.3 KB
Cols now :  None
{'Die Size (mm^2)': [0.1475748194014448], 'Transistors (million)': [0.1461300309597523], 'TDP (W)': [0.12