In [2]:
import tensorflow.contrib.learn as skflow
import pandas as pd
import os
import numpy as np
from sklearn import metrics
from scipy.stats import zscore


In [3]:
# Directory holding the input data, relative to the notebook's working directory.
path = "./data/"
# Full path of the auto-mpg CSV that the later cells read.
filename_read = os.path.join(path,"auto-mpg.csv")

In [8]:
# Text encoding passed to pd.read_csv when analyze() loads a file.
ENCODING = 'utf-8'

def expand_categories(values):
    """Summarise the distinct values of a Series as a percentage breakdown.

    Args:
        values: pandas Series to summarise.

    Returns:
        A string of the form ``[value:pct%,value:pct%,...]`` listing each
        distinct value (in ``value_counts()`` order) with its share of
        ``len(values)``, rounded to two decimals.
    """
    counts = values.value_counts()
    total = float(len(values))
    parts = [
        "{}:{}%".format(label, round(100*(counts[label]/total), 2))
        for label in counts.index
    ]
    return "[" + ",".join(parts) + "]"

def analyze(filename):
    """Print a per-column summary of the CSV file at ``filename``.

    The file is read with ``pd.read_csv`` using the module-level ``ENCODING``.
    After a header and the row count, one line is printed per column:

    * more than 100 distinct values: the distinct count and that count as an
      integer percentage of the number of rows;
    * otherwise: the full value/percentage breakdown produced by
      ``expand_categories``.

    Args:
        filename: path of the CSV file to analyse.

    Returns:
        None. All results are written to stdout.
    """
    print()
    print("Analyzing: {}".format(filename))
    df = pd.read_csv(filename,encoding=ENCODING)
    total = float(len(df))

    print("{} rows".format(int(total)))

    for col in df.columns.values:
        unique_count = len(df[col].unique())
        if unique_count>100:
            # High-cardinality column: report only how many distinct values it has.
            print("** {}:{} ({}%)".format(col,unique_count,int(((unique_count)/total)*100)))
        else:
            # Low-cardinality column: the breakdown is computed once and printed
            # (the original recomputed it a second time and discarded the result).
            print("** {}:{}".format(col,expand_categories(df[col])))

# Print the column-by-column summary for the CSV at filename_read.
analyze(filename_read)


Analyzing: ./data/auto-mpg.csv
398 rows
** mpg:129 (32%)
** cylinders:[4:51.26%,8:25.88%,6:21.11%,3:1.01%,5:0.75%]
** displacement:[97.0:5.28%,98.0:4.52%,350.0:4.52%,250.0:4.27%,318.0:4.27%,140.0:4.02%,400.0:3.27%,225.0:3.27%,91.0:3.02%,232.0:2.76%,121.0:2.76%,302.0:2.76%,151.0:2.51%,120.0:2.26%,231.0:2.01%,200.0:2.01%,90.0:2.01%,85.0:2.01%,351.0:2.01%,304.0:1.76%,122.0:1.76%,105.0:1.76%,156.0:1.51%,79.0:1.51%,119.0:1.51%,108.0:1.26%,107.0:1.26%,89.0:1.26%,258.0:1.26%,135.0:1.26%,360.0:1.01%,86.0:1.01%,116.0:1.01%,112.0:1.01%,305.0:1.01%,134.0:1.01%,455.0:0.75%,307.0:0.75%,429.0:0.75%,173.0:0.75%,198.0:0.75%,168.0:0.75%,113.0:0.75%,260.0:0.75%,146.0:0.75%,70.0:0.75%,383.0:0.5%,71.0:0.5%,163.0:0.5%,262.0:0.5%,141.0:0.5%,199.0:0.5%,440.0:0.5%,104.0:0.25%,390.0:0.25%,454.0:0.25%,340.0:0.25%,110.0:0.25%,267.0:0.25%,88.0:0.25%,111.0:0.25%,144.0:0.25%,181.0:0.25%,145.0:0.25%,100.0:0.25%,81.0:0.25%,183.0:0.25%,131.0:0.25%,78.0:0.25%,80.0:0.25%,130.0:0.25%,72.0:0.25%,101.0:0.25%,115.0:0.25%,1

In [13]:
# NOTE(review): pandas is already imported in the notebook's first cell; this repeat is redundant.
import pandas as pd
# Load the CSV into df, additionally treating the literal strings 'NA' and '?' as missing values.
df = pd.read_csv(filename_read,na_values=['NA','?'])

# Replace every missing value in one column with that column's median.
def missing_median(df, name):
    """Fill missing entries of ``df[name]`` with the column median, in place.

    Args:
        df: DataFrame to modify; the column is overwritten on this object.
        name: label of the column to fill.

    Returns:
        None.
    """
    column = df[name]
    df[name] = column.fillna(column.median())
    

# create feature vector: fill missing 'horsepower' values with the column
# median, then remove the 'name' column from df.
missing_median(df, 'horsepower')
# `axis` is passed by keyword: the positional form df.drop('name', 1, ...)
# was deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
df.drop('name',axis=1,inplace=True)

df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin
0,18.0,8,307.0,130.0,3504,12.0,70,1
1,15.0,8,350.0,165.0,3693,11.5,70,1
2,18.0,8,318.0,150.0,3436,11.0,70,1
3,16.0,8,304.0,150.0,3433,12.0,70,1
4,17.0,8,302.0,140.0,3449,10.5,70,1
5,15.0,8,429.0,198.0,4341,10.0,70,1
6,14.0,8,454.0,220.0,4354,9.0,70,1
7,14.0,8,440.0,215.0,4312,8.5,70,1
8,14.0,8,455.0,225.0,4425,10.0,70,1
9,15.0,8,390.0,190.0,3850,8.5,70,1


In [18]:
# Standardise a numeric column to z-scores.
def encode_numeric_zscore(df,name,mean=None,sd=None):
    """Replace ``df[name]`` with ``(value - mean) / sd``, in place.

    Args:
        df: DataFrame to modify; the column is overwritten on this object.
        name: label of the column to standardise.
        mean: centre to subtract; when None, the column's own ``mean()`` is used.
        sd: scale to divide by; when None, the column's own ``std()`` is used.

    Returns:
        None.
    """
    column = df[name]
    center = column.mean() if mean is None else mean
    spread = column.std() if sd is None else sd
    df[name] = (column - center) / spread

# Standardise these three columns of df in place. No mean/sd is supplied, so
# each call uses the column's own mean and standard deviation.
encode_numeric_zscore(df, 'horsepower')
encode_numeric_zscore(df, 'weight')
encode_numeric_zscore(df, 'acceleration')

# Bare expression: shows the updated frame as the cell output.
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin
0,18.0,8,307.0,0.672271,0.630077,-1.293870,70,1
1,15.0,8,350.0,1.587959,0.853259,-1.475181,70,1
2,18.0,8,318.0,1.195522,0.549778,-1.656492,70,1
3,16.0,8,304.0,1.195522,0.546236,-1.293870,70,1
4,17.0,8,302.0,0.933897,0.565130,-1.837804,70,1
5,15.0,8,429.0,2.451322,1.618455,-2.019115,70,1
6,14.0,8,454.0,3.026898,1.633806,-2.381737,70,1
7,14.0,8,440.0,2.896085,1.584210,-2.563048,70,1
8,14.0,8,455.0,3.157710,1.717647,-2.019115,70,1
9,15.0,8,390.0,2.242022,1.038654,-2.563048,70,1


In [20]:
# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low =-1, normalized_high =1, 
                         data_low=None, data_high=None):
    """Linearly rescale ``df[name]`` in place.

    Values are mapped so that ``data_low`` becomes ``normalized_low`` and
    ``data_high`` becomes ``normalized_high``.

    Args:
        df: DataFrame to modify; the column is overwritten on this object.
        name: label of the column to rescale.
        normalized_low: lower bound of the target range.
        normalized_high: upper bound of the target range.
        data_low: source value mapped to ``normalized_low``; when None, the
            column minimum is used.
        data_high: source value mapped to ``normalized_high``; when None, the
            column maximum is used.

    Returns:
        None.

    Note:
        If ``data_high == data_low`` the division below is by zero; no special
        handling is applied.
    """
    # Each bound is defaulted independently. Previously both were set only when
    # data_low was None, so passing just data_low left data_high as None
    # (TypeError), and passing just data_high had it silently overwritten.
    # Series.min()/max() are used instead of the builtins so that NaN entries
    # are skipped rather than poisoning the comparison.
    if data_low is None:
        data_low = df[name].min()
    if data_high is None:
        data_high = df[name].max()

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
                * (normalized_high - normalized_low) + normalized_low

# Rescale 'cylinders' and 'displacement' in place to the range 0..1. No
# data_low/data_high is supplied, so each column's own extremes are used.
encode_numeric_range(df, 'cylinders',0,1)
encode_numeric_range(df, 'displacement',0,1)

# Bare expression: shows the updated frame as the cell output.
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin
0,18.0,1.0,0.617571,0.672271,0.630077,-1.293870,70,1
1,15.0,1.0,0.728682,1.587959,0.853259,-1.475181,70,1
2,18.0,1.0,0.645995,1.195522,0.549778,-1.656492,70,1
3,16.0,1.0,0.609819,1.195522,0.546236,-1.293870,70,1
4,17.0,1.0,0.604651,0.933897,0.565130,-1.837804,70,1
5,15.0,1.0,0.932817,2.451322,1.618455,-2.019115,70,1
6,14.0,1.0,0.997416,3.026898,1.633806,-2.381737,70,1
7,14.0,1.0,0.961240,2.896085,1.584210,-2.563048,70,1
8,14.0,1.0,1.000000,3.157710,1.717647,-2.019115,70,1
9,15.0,1.0,0.832041,2.242022,1.038654,-2.563048,70,1
