In [1]:
import pandas as pd
import numpy as np

In [2]:
def create_data_frame(fname):
    data = pd.read_table(fname, header=None, delim_whitespace=True,
                         names=["mpg", "cylinders", "displacement", "horsepower",
                                "weight", "acceleration", "model year", "origin", "car name"])
    return data

In [3]:
data = create_data_frame("auto-mpg.data")

## Contents of `auto-mpg.data`

Contents are listed as pairs of column names and the type of data in the column:

1. **mpg**:       continuous
2. __cylinders__:    multi-valued discrete
3. __displacement__:  continuous
4. __horsepower__:    continuous
5. __weight__:        continuous
6. __acceleration__:  continuous
7. __model year__:    multi-valued discrete
8. __origin__:        multi-valued discrete
9. __car name__:      string (unique for each instance)

There are 398 rows (instances), each with these 9 attributes. The horsepower column is also known to have 6 NaN values.

In [4]:
data.head(10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino
5,15.0,8,429.0,198.0,4341.0,10.0,70,1,ford galaxie 500
6,14.0,8,454.0,220.0,4354.0,9.0,70,1,chevrolet impala
7,14.0,8,440.0,215.0,4312.0,8.5,70,1,plymouth fury iii
8,14.0,8,455.0,225.0,4425.0,10.0,70,1,pontiac catalina
9,15.0,8,390.0,190.0,3850.0,8.5,70,1,amc ambassador dpl


To impute the NaN values in the _horsepower_ column, replacement with the average value will be used so that the data is not removed.

In [5]:
def clean_Nan(data):
    num_cols = data.shape[1]
    num_rows = data.shape[0]
    for col in range(num_cols-1):
        elem_list = []
        col_sum = 0
        num_items = 0
        for row in range(num_rows):
            if type(data.iloc[row, col]) is int or type(data.iloc[row, col]) is float:
                if np.isnan(data.iloc[row, col]):
                    elem_list.append((row, col))
                else:
                    col_sum += data.iloc[row, col]
                    num_items += 1
            elif type(data.iloc[row, col]) is str:
                try:
                    fdata = float(data.iloc[row, col])
                except ValueError:
                    fdata = np.nan
                if np.isnan(fdata):
                    elem_list.append((row, col))
                else:
                    data.iloc[row, col] = fdata
                    col_sum += data.iloc[row, col]
                    num_items += 1
        if num_items > 0:
            avg = col_sum / num_items
            for r, c in elem_list:
                data.iloc[r, c] = avg

In [6]:
clean_Nan(data)

In [7]:
def get_stats(data):
    # Makes a 8x8 array of statistics
    # Note: car names are excluded from this
    stats = np.empty([8,8])
    df = data.values[:,:-1]
    inds = np.asarray(np.where(df == '?'))
    for r, c in inds.T:
        df[r, c] = np.nan
    df = df.astype(float)
    stats[:,0] = np.mean(df, axis=0)
    stats[:,1] = np.std(df, axis=0)
    stats[:,2] = df.min(axis=0)
    stats[:,3] = df.max(axis=0)
    stats[:,4] = np.percentile(df, 25, axis=0)
    stats[:,5] = np.percentile(df, 50, axis=0)
    stats[:,6] = np.percentile(df, 75, axis=0)
    stats[:,7].fill(df.shape[0])
    stats = pd.DataFrame(stats, index=data.columns[:-1], columns=["Mean", "Std", "Min", "Max", "25 Percentile", "50 Percentile", "75 Percentile", "Num Elems"])
    return stats

In [8]:
stats = get_stats(data)

In [9]:
print(pd.DataFrame(stats))

                     Mean         Std     Min     Max  25 Percentile  \
mpg             23.514573    7.806159     9.0    46.6         17.500   
cylinders        5.454774    1.698866     3.0     8.0          4.000   
displacement   193.425879  104.138764    68.0   455.0        104.250   
horsepower     104.469388   38.151168    46.0   230.0         76.000   
weight        2970.424623  845.777234  1613.0  5140.0       2223.750   
acceleration    15.568090    2.754222     8.0    24.8         13.825   
model year      76.010050    3.692978    70.0    82.0         73.000   
origin           1.572864    0.801047     1.0     3.0          1.000   

              50 Percentile  75 Percentile  Num Elems  
mpg                    23.0         29.000      398.0  
cylinders               4.0          8.000      398.0  
displacement          148.5        262.000      398.0  
horsepower             95.0        125.000      398.0  
weight               2803.5       3608.000      398.0  
acceleration   

## Standardization

For standardization, two techniques will be used:

1. For continuous features, the standardized output will be the z-score of the input.
2. For discrete features, 

In [18]:
def standardize(data, stats):
    cont = ["displacement", "horsepower", "weight", "acceleration"]
    discr = ["cylinders", "model year", "origin"]
    for label in cont:
        data[label] = data[label].apply(lambda x: (x - stats.loc[label, "Mean"]) / stats.loc[label, "Std"])
    for label in discr:
        data[label] = data[label].apply(lambda x: int(x - stats.loc[label, "Min"]))

In [19]:
standardize(data, stats)
data

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,2,-1.846913,-2.720761,-3.511319,-6.122813,-70,-1,chevrolet chevelle malibu
1,15.0,2,-1.842948,-2.696714,-3.511055,-6.188726,-70,-1,buick skylark 320
2,18.0,2,-1.845899,-2.707020,-3.511414,-6.254639,-70,-1,plymouth satellite
3,16.0,2,-1.847190,-2.707020,-3.511418,-6.122813,-70,-1,amc rebel sst
4,17.0,2,-1.847374,-2.713890,-3.511396,-6.320553,-70,-1,ford torino
5,15.0,2,-1.835664,-2.674042,-3.510149,-6.386466,-70,-1,ford galaxie 500
6,14.0,2,-1.833359,-2.658927,-3.510131,-6.518292,-70,-1,chevrolet impala
7,14.0,2,-1.834650,-2.662362,-3.510190,-6.584205,-70,-1,plymouth fury iii
8,14.0,2,-1.833266,-2.655491,-3.510032,-6.386466,-70,-1,pontiac catalina
9,15.0,2,-1.839260,-2.679538,-3.510835,-6.584205,-70,-1,amc ambassador dpl


## Split Data into Training and Testing Sets

In [20]:
num_rows = data.shape[0]
div = num_rows // 4
train_max = 3 * div
train = data.iloc[0:train_max, :]
test = data.iloc[train_max:, :]

## Training

A standard linear regression algorithm is used. The equation for the weights is as follows:
$$
w = (X^{T}X)^{-1}X^{T}r
$$

In [21]:
X_train = train.loc[:, "cylinders":"origin"].values
r_train = train.loc[:, "mpg"].values
X_test = test.loc[:, "cylinders":"origin"].values
r_test = test.loc[:, "mpg"].values

In [22]:
def linreg_train(X, r):
    return np.matmul(np.matmul(np.linalg.inv(np.matmul(X.T, X)), X.T), r)

In [23]:
weights = linreg_train(X_train, r_train)
weights

array([  -1.06487226, -137.55953539,  -98.40906613,  143.54528888,
         -3.63787264,    0.30011931,    1.07596217])

## Testing

Least Squares will be used as the main testing algorithm.

A basic percent error will also be used.

In [16]:
scores = []
for X, r in zip(X_test, r_test):
    y = np.dot(weights, X)
    scores.append((y-r)**2)
scores = np.array(scores)
score_sum = np.sum(scores)
lsquare_error = score_sum / 2
lsquare_error

2108.10348085218

In [17]:
scores = []
for X, r in zip(X_test, r_test):
    y = np.dot(weights, X)
    curr_error = abs((y - r) / r) * 100
    scores.append(curr_error)
scores = np.array(scores)
avg_percent_error = np.average(scores)
avg_percent_error

17.631581995658756