In [1338]:
# Analysis of chip data; prepare in separate pipelines
import pandas as pd
import numpy as np
import os, sys
import matplotlib as plt
import seaborn as sns
# Data load: the CSV is looked up one directory above the current working directory.
filename = 'chip_dataset.csv'
# abspath() anchors the relative '../<filename>' at os.getcwd() and normalises it
file = os.path.abspath(os.path.join(os.pardir, filename))
df = pd.read_csv(file)

In [1339]:
# Task selection: in 'binary' mode the target is 'Type' and 'Foundry' is discarded;
# in any other mode the roles are swapped.
mode = 'binary'
if mode == 'binary' :
    target_label, unused_label = 'Type', 'Foundry'
else :
    target_label, unused_label = 'Foundry', 'Type'

# Remove three columns that are not used as features, then the label that is
# not being predicted. Both drops mutate df in place.
df.drop(columns = ['Unnamed: 0', 'Product', 'Release Date'], inplace = True)
df.drop(columns = unused_label, inplace = True)

In [1340]:
# Util to get NaNs
def get_null_pc(df1, out = False) :
    gt_threshold = 0.58
    null_map = {'gt':{}, 'lt':{}, 'none' : {}, 'other':{}}
    null_pc_map = 1 - df1.count()/len(df1)
    if out == True :
        print(null_pc_map)
    for key, val in null_pc_map.items() :
        pc_type = 'other'
        if val == 0.0 :
            pc_type = 'none'
        elif val > gt_threshold :
            pc_type = 'gt'
        elif val < 0.05 and val != 0 :
            pc_type = 'lt'

        if key not in null_map[pc_type] :
            null_map[pc_type][key] = [val]
        else :
            null_map[pc_type][key].append(val)
        
    return null_map
# Bucket every column of df by its share of missing values; `out` is left at its
# default (False), so nothing is printed. The result feeds the row/column drops
# in the next cell.
null_pc = get_null_pc(df)

In [1341]:
# First round of null drops, driven by the buckets computed in null_pc.

# Rows: for columns in the 'lt' bucket (non-zero missing share under 5%), drop
# the rows where any of them is NaN.
df.dropna(subset = list(null_pc['lt']), inplace = True)

# Columns: drop every column in the 'gt' bucket (missing share above the
# gt_threshold of get_null_pc, 0.58 as written there).
# NOTE(review): this comment used to say 85%; confirm whether 0.58 or 0.85 is intended.
# Only columns that are still present are dropped. This replaces the earlier
# guard on one hard-coded column name ('FP16 GFLOPS'), which skipped the drop
# entirely on any frame lacking that column; re-running the cell stays a no-op.
gt_cols = [col for col in null_pc['gt'] if col in df.columns]
df.drop(columns = gt_cols, inplace = True)

In [1342]:
# Split X + Y
# X is every column except the target, in df's own column order. The earlier
# set difference yielded an order that depends on string hashing and could
# therefore change between kernel restarts.
X = df.drop(columns = [target_label])
# Y stays a one-column DataFrame (list selector), not a Series
Y = df.loc[:, [target_label]]

In [1343]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
# Transform target label into integer codes (Y remains a one-column DataFrame)
le = LabelEncoder()
Y[target_label] = le.fit_transform(Y[target_label])

# Give X and Y the same fresh 0..n-1 row index. Rows were dropped from df
# earlier, so the inherited labels have gaps; resetting only X would leave the
# two frames with different labels for the same row.
X.reset_index(drop = True, inplace = True)
Y.reset_index(drop = True, inplace = True)

# One hot encode rest of categorical data
# #TBD: Study ordinal vs binary effect on scores
enc = OneHotEncoder(sparse_output = False)
# Non-numeric columns, listed in X's column order so the encoded columns get
# the same numbering on every run (a set difference does not guarantee that).
numeric_cols = set(X.select_dtypes(include = ['number', 'bool']).columns)
obj_vals = [col for col in X.columns if col not in numeric_cols]
# enc_df keeps pandas' default integer column labels: the outlier cell further
# down separates original features from encoded ones by whether the label
# starts with a letter.
enc_df = pd.DataFrame(enc.fit_transform(X[obj_vals]), index = X.index)
X.drop(columns = obj_vals, inplace = True)
X = pd.concat([X, enc_df], axis = 1)

In [1344]:
# Fill in NA values
from sklearn.impute import KNNImputer
# Impute every column that still holds some (but not only) NaNs. Selecting by
# the actual NaN share, rather than by the 'other' bucket of get_null_pc, means
# a column whose share drifted across a bucket boundary after the earlier row
# drops is not silently left with NaNs.
# Columns that are entirely NaN are excluded; NOTE(review): KNNImputer is
# expected to omit such columns from its output by default, which would break
# the assignment below — confirm against the scikit-learn docs.
na_share = X.isna().mean()
cols_w_nans = X.columns[((na_share > 0) & (na_share < 1)).to_numpy()].tolist()
knn_imputer = KNNImputer(n_neighbors = 1)
# Nothing to fit when no column qualifies; calling the imputer on zero columns raises
if cols_w_nans :
    X[cols_w_nans] = knn_imputer.fit_transform(X[cols_w_nans])

In [1345]:
# Outlier detection : up sampling ? down sampling ? Drop ?
import re
# Columns whose label starts with a letter are treated as non-categorical. The
# one-hot columns appended by the encoding cell carry integer labels, so they
# never match the pattern.
non_categ_cols = [
    col for col in X.columns
    if re.search("^[a-zA-Z]", str(col)) is not None
]

# Drop a column's outlier rows only when they make up more than 0% and less than
# outlier_threshold percent of the current rows.
outlier_threshold = 6
threshold = 1.5  # IQR multiplier for the fences
for col in non_categ_cols :
    # Quartiles are recomputed on the current X, i.e. after any rows removed
    # for previously processed columns.
    lower_q, upper_q = X[col].quantile(0.25), X[col].quantile(0.75)
    spread = upper_q - lower_q
    below_fence = X[col] < lower_q - threshold * spread
    above_fence = X[col] > upper_q + threshold * spread
    is_outlier = below_fence | above_fence
    pc_outlier = is_outlier.sum() * 100 / len(X)
    if 0 < pc_outlier < outlier_threshold :
        # NOTE(review): only X is filtered; Y is not touched in this cell, so
        # confirm that a later step re-aligns Y with the surviving rows.
        X = X.drop(X.index[is_outlier.to_numpy()])
    # Otherwise the column is left as it is. Per the original note, no up/down
    # sampling is needed since the most frequent value has at most a 5% share.

Transistors (million)
1178.0    4.660832
1400.0    4.660832
106.0     2.407002
4800.0    2.363239
1300.0    1.991247
            ...   
8.0       0.021882
463.0     0.021882
49.0      0.021882
420.0     0.021882
198.0     0.021882
Name: count, Length: 178, dtype: float64
Freq (MHz)
2000.0    2.975930
2400.0    2.954048
300.0     2.800875
500.0     2.800875
2200.0    2.691466
            ...   
515.0     0.021882
795.0     0.021882
1177.0    0.021882
1176.0    0.021882
416.0     0.021882
Name: count, Length: 473, dtype: float64


In [1346]:
# Transform data : Standardize/Normalize