# Entry 11 notebook - Consolidate Pre-processing - Computer Hardware

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('seaborn')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import category_encoders as ce

### Custom functions

In [2]:
def split_data(df, target, train_size):
    y = df[[target]]
    X = df.drop(target, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=train_size, random_state=12)
    return X_train, X_test, y_train, y_test

def define_reserve(X_test, y_test):
    X_test, X_reserve, y_test, y_reserve = train_test_split(X_test, y_test, test_size = 0.5, random_state=12)
    return X_test, X_reserve, y_test, y_reserve

In [3]:
def feature_corr_coll(train_df, target, test_df, corr_type='spearman'):
    target_corr = pd.DataFrame(train_df.corrwith(target, axis=0, method=corr_type).reset_index()).rename(columns={0:corr_type})
    target_corr['abs'] = target_corr[corr_type].abs()
    top_features = target_corr[target_corr['abs'] >= 0.5].sort_values('abs', ascending=False)['index'].tolist()
    df_top = train_df[top_features]
    feature_corr = df_top.corr(method=corr_type)
    
    collinear_features = set()
    for i in range(len(feature_corr.columns)):
        sliced_matrix = feature_corr.iloc[i, :i]
        if sum(sliced_matrix[abs(sliced_matrix) > 0.9]):
            colname = feature_corr.columns[i]
            collinear_features.add(colname)
    collinear_features = list(collinear_features)
    df_train = df_top.drop(collinear_features, axis=1)
    
    select_features = df_train.columns.tolist()
    df_test = test_df[select_features]
    
    return df_train, df_test

In [4]:
def preprocess_data(train_df, test_df, scaler=StandardScaler(), encoder=ce.OrdinalEncoder):
    index = train_df.index.tolist()
    test_index = test_df.index.tolist()
    
    num_features = train_df.select_dtypes('number').columns.tolist()    
    num_scale = scaler.fit_transform(train_df[num_features])
    train_num_df = pd.DataFrame(num_scale, columns=num_features, index=index)
    
    test_num_scale = scaler.transform(test_df[num_features])
    test_num_df = pd.DataFrame(test_num_scale, columns=num_features, index=test_index)
    
    cat_features = train_df.select_dtypes('object').columns.tolist()
    cat_encoder = encoder(cols=cat_features)
    cat_encode = cat_encoder.fit_transform(train_df[cat_features])
    train_cat_df = pd.DataFrame(cat_encode, columns=cat_features, index=index)
    
    test_cat_encode = cat_encoder.transform(test_df[cat_features])
    test_cat_df = pd.DataFrame(test_cat_encode, columns=cat_features, index=test_index)
    
    train_df = pd.concat([train_num_df, train_cat_df], axis=1).fillna(-1)
    test_df = pd.concat([test_num_df, test_cat_df], axis=1).fillna(-1)
    
    return train_df, test_df

In [5]:
def train_and_predict(X_train, y_train, X_test, model=LinearRegression()):
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return model, preds

# Get the Data

## Data

[Computer Hardware Data Set](http://archive.ics.uci.edu/ml/datasets/Computer+Hardware) from UCI Machine Learning Repository.

Number of observations: 209

Number of Attributes (all integers): 9

1. Vendor Name: 30 
(adviser, amdahl,apollo, basf, bti, burroughs, c.r.d, cambex, cdc, dec, 
dg, formation, four-phase, gould, honeywell, hp, ibm, ipl, magnuson, 
microdata, nas, ncr, nixdorf, perkin-elmer, prime, siemens, sperry, 
sratus, wang) 
2. Model Name: many unique symbols 
3. MYCT: machine cycle time in nanoseconds (integer) 
4. MMIN: minimum main memory in kilobytes (integer) 
5. MMAX: maximum main memory in kilobytes (integer) 
6. CACH: cache memory in kilobytes (integer) 
7. CHMIN: minimum channels in units (integer) 
8. CHMAX: maximum channels in units (integer) 
9. PRP: published relative performance (integer) 
10. ERP: estimated relative performance from the original article (integer)

The data is stored as a .data file and can be pulled directly from the internet using pandas' read_csv function. The handling of this data type by read_csv is inconsistent. The [Challenger USA Space Shuttle O-Ring Data Set](http://archive.ics.uci.edu/ml/datasets/Challenger+USA+Space+Shuttle+O-Ring) data loaded each row as a single string, which then had to be split and formatted. Fortunately, this dataset loaded correctly as a dataframe.

In [6]:
columns = ['vendor', 'model', 'myct', 'mmin', 'mmax', 'cach', 'chmin', 'chmax', 'prp', 'erp']
raw_df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/cpu-performance/machine.data', header=None, names=columns)
raw_df.head()

Unnamed: 0,vendor,model,myct,mmin,mmax,cach,chmin,chmax,prp,erp
0,adviser,32/60,125,256,6000,256,16,128,198,199
1,amdahl,470v/7,29,8000,32000,32,8,32,269,253
2,amdahl,470v/7a,29,8000,32000,32,8,32,220,253
3,amdahl,470v/7b,29,8000,32000,32,8,32,172,253
4,amdahl,470v/7c,29,8000,16000,32,8,16,132,132


### Remove columns

I only want the numerical features for this series of trainings, so I'm removing vendor and model. I'm also removing erp because it looks like it's the predictions they made for the original paper.

In [7]:
df = raw_df.drop(['vendor', 'model', 'erp'], axis=1)
df.head()

Unnamed: 0,myct,mmin,mmax,cach,chmin,chmax,prp
0,125,256,6000,256,16,128,198
1,29,8000,32000,32,8,32,269
2,29,8000,32000,32,8,32,220
3,29,8000,32000,32,8,32,172
4,29,8000,16000,32,8,16,132


### Split data into train and test

In [8]:
X_train, X_test, y_train, y_test = split_data(df, 'prp', 0.6)
X_train.head()

Unnamed: 0,myct,mmin,mmax,cach,chmin,chmax
169,38,4000,16000,128,16,32
153,38,16000,32000,128,16,32
9,23,32000,64000,128,32,64
69,105,2000,4000,8,3,19
184,105,2000,6000,16,6,16


In [9]:
y_train.head()

Unnamed: 0,prp
169,212
153,510
9,1144
69,32
184,33


### Consider correlation/collinearity

There are so few features in this dataset I'm going to skip this step.

### Pre-process features

In [10]:
X_train, X_test = preprocess_data(X_train, X_test)
X_train.head()

Unnamed: 0,myct,mmin,mmax,cach,chmin,chmax
169,-0.700222,0.285124,0.482349,2.667315,2.295893,0.696589
153,-0.700222,3.045456,2.04462,2.667315,2.295893,0.696589
9,-0.753731,6.7259,5.169163,2.667315,5.314652,2.174337
69,-0.461213,-0.174932,-0.689355,-0.351258,-0.156848,0.096254
184,-0.461213,-0.174932,-0.494071,-0.150019,0.409169,-0.042285


### Split out a reserve dataset

In [11]:
X_test, X_reserve, y_test, y_reserve = define_reserve(X_test, y_test)
X_reserve.head()

Unnamed: 0,myct,mmin,mmax,cach,chmin,chmax
72,-0.211502,-0.5761,-0.884638,-0.552496,-0.156848,0.327152
196,-0.657414,-0.174932,2.04462,2.264838,9.088101,4.021522
167,-0.700222,1.205234,0.482349,1.057409,0.031824,-0.411722
77,0.23441,-0.546657,0.091781,-0.401567,-0.534193,0.327152
62,-0.389867,-0.517213,-0.98228,-0.552496,0.786514,0.142434


### Train model and predict

In [12]:
model, preds = train_and_predict(X_train, y_train, X_test)

In [13]:
preds[:5]

array([[  1.78212292],
       [499.45833348],
       [ 78.14857561],
       [315.2990307 ],
       [126.11617109]])