In [1]:
"""Implements feature extraction and data processing helpers.
"""


import numpy as np


def preprocess_data(dataset,
                    feature_columns=[
                        'Id', 'BldgType', 'OverallQual'
                        'GrLivArea', 'GarageArea'
                    ],
                    squared_features=False,
                    ):
    """Processes the dataset into vector representation.

    When converting the BldgType to a vector, use one-hot encoding, the order
    has been provided in the one_hot_bldg_type helper function. Otherwise,
    the values in the column can be directly used.

    If squared_features is true, then the feature values should be
    element-wise squared.

    Args:
        dataset(dict): Dataset extracted from io_tools.read_dataset
        feature_columns(list): List of feature names.
        squred_features(bool): Whether to square the features.

    Returns:
        processed_datas(list): List of numpy arrays x, y.
            x is a numpy array, of dimension (N,K), N is the number of example
            in the dataset, and K is the length of the feature vector.
            Note: BldgType when converted to one hot vector is of length 5.
            Each row of x contains an example.
            y is a numpy array, of dimension (N,1) containing the SalePrice.
    """
    columns_to_id = {'Id': 0, 'BldgType': 1, 'OverallQual': 2,
                     'GrLivArea': 3, 'GarageArea': 4, 'SalePrice': 5}
    feature_converted = [columns_to_id[feature_columns[i]] for i in range(len(feature_columns))]

    x = list()
    y = list()
    #row_len = int(np.sum([ 5 if feature_columns[i] =='BldgType' else 1 for i in range(len(feature_columns))]))
    
    for k, v in dataset.items():
        row = list()
        if 'BldgType' in feature_columns:
            row.extend(one_hot_bldg_type(v[columns_to_id['BldgType']]))
        if squared_features:
            row.extend([float(v[i])**2 for i in range(len(v)) if (i != columns_to_id['BldgType'] and i in feature_converted)])
        else:
            row.extend([float(v[i]) for i in range(len(v)) if (i != columns_to_id['BldgType'] and i in feature_converted)])
        if row_len != len(row):
            print("Sth wrong with data processing!")
        x.append(row)
        y.append([v[5]])
    x = np.array(x)
    y = np.array(y)
    processed_dataset = [x, y]
    return processed_dataset


def one_hot_bldg_type(bldg_type):
    """Builds the one-hot encoding vector.

    Args:
        bldg_type(str): String indicating the building type.

    Returns:
        ret(list): A list representing the one-hot encoding vector.
            (e.g. for 1Fam building type, the returned list should be
            [1,0,0,0,0].
    """
    type_to_id = {'1Fam': 0,
                  '2FmCon': 1,
                  'Duplx': 2,
                  'TwnhsE': 3,
                  'TwnhsI': 4,
                  }
    ret =list([0, 0, 0, 0, 0])
    for k, v in type_to_id.items():
        if bldg_type==v:
            ret[v] = 1
            
    #pass
    return ret


In [1]:
"""Implements feature extraction and data processing helpers.
"""


import numpy as np
def one_hot_bldg_type(bldg_type):
    """Builds the one-hot encoding vector.

    Args:
        bldg_type(str): String indicating the building type.

    Returns:
        ret(list): A list representing the one-hot encoding vector.
            (e.g. for 1Fam building type, the returned list should be
            [1,0,0,0,0].
    """
    type_to_id = {'1Fam': 0,
                  '2FmCon': 1,
                  'Duplx': 2,
                  'TwnhsE': 3,
                  'TwnhsI': 4,
                  }
    ret = [0, 0, 0, 0, 0]
    ret[type_to_id[bldg_type]]=1
            
    #pass
    return ret

def preprocess_data(dataset,
                    feature_columns=[
                        'Id', 'BldgType', 'OverallQual'
                        'GrLivArea', 'GarageArea'
                    ],
                    squared_features=False,
                    ):
    """Processes the dataset into vector representation.

    When converting the BldgType to a vector, use one-hot encoding, the order
    has been provided in the one_hot_bldg_type helper function. Otherwise,
    the values in the column can be directly used.

    If squared_features is true, then the feature values should be
    element-wise squared.

    Args:
        dataset(dict): Dataset extracted from io_tools.read_dataset
        feature_columns(list): List of feature names.
        squred_features(bool): Whether to square the features.

    Returns:
        processed_datas(list): List of numpy arrays x, y.
            x is a numpy array, of dimension (N,K), N is the number of example
            in the dataset, and K is the length of the feature vector.
            Note: BldgType when converted to one hot vector is of length 5.
            Each row of x contains an example.
            y is a numpy array, of dimension (N,1) containing the SalePrice.
    """
    columns_to_id = {'Id': 0, 'BldgType': 1, 'OverallQual': 2,
                     'GrLivArea': 3, 'GarageArea': 4, 'SalePrice': 5}
    feature_converted = [columns_to_id[feature_columns[i]] for i in range(len(feature_columns))]

    x = list()
    y = list()
    #row_len = int(np.sum([ 5 if feature_columns[i] =='BldgType' else 1 for i in range(len(feature_columns))]))
    
    for k, v in dataset.items():
        row = list()
        if 'BldgType' in feature_columns:
            row.extend(one_hot_bldg_type(v[int(columns_to_id['BldgType'])]))
        if squared_features:
            row.extend([float(v[i])**2 for i in range(len(v)) if (i != columns_to_id['BldgType'] and i in feature_converted)])
        else:
            row.extend([float(v[i]) for i in range(len(v)) if (i != columns_to_id['BldgType'] and i in feature_converted)])
        if row_len != len(row):
            print("Sth wrong with data processing!")
        x.append(row)
        y.append([float(v[5])])
    x = np.array(x)
    y = np.array(y)
    processed_dataset = [x, y]
    return processed_dataset




SyntaxError: invalid syntax (<ipython-input-1-d0c223e72a49>, line 70)