In [206]:
import pandas
import math

In [207]:
df = pandas.read_csv('house-prices.csv')
data = df.to_dict(orient='list')

## Extrac columns with missing values

### Create a `check_nan()` function

In [None]:
def is_nan(value) -> bool:
    return (value != value)

### Print the column names with missing values

In [None]:
def extract_column(data) -> list:
    result = []
    for feature in data.keys():
        for value in data[feature]:
            if is_nan(value):
                result.append(feature)
                break
    return result

print(extract_column(data))

## Count the number of lines with missing values

In [None]:
def count_line_with_missing_values(data) -> int:
    index_empty_row = set()
    for feature in data.keys():
        for index,value in enumerate(data[feature]):
            if is_nan(value):
                index_empty_row.add(index)
    return len(index_empty_row)

print(f"There are {count_line_with_missing_values(data)} lines with missing values")

## Fill in the missing value using mean, median (for numeric properties) and mode (for categorical properties)

### Divide features into numeric and categorical

In [None]:
def categorizing_feature(data) -> list:
    missing_feature = extract_column(data)
    cat_feature = []
    for feature in missing_feature:
        for value in data[feature]:
            if type(value) == str:
                cat_feature.append(feature)
                break
    num_feature = [feature for feature in missing_feature if feature not in cat_feature]
    return num_feature, cat_feature

categorizing_feature(data)

### Find the rows with missing values for each column

In [None]:
def index_empty_row(data, column) -> list:
    result = []
    for index in range(len(data[column])):
        if is_nan(data[column][index]):
            result.append(index)
    return result

### Fill in the missing values 

In [None]:
def fill_in(data, method, column) -> None:
  num_attr, cate_attr = categorizing_feature(data)
  empty_row = index_empty_row(data, column)
  if method == 'mean':
    if column not in num_attr:
      raise ValueError('Column must be numerical')
    else:
      for index in empty_row:
        data[column][index] = 0
      missing_value = sum(data[column]) / len(data[column])    
  elif method == 'median':
      if column not in num_attr:
        raise ValueError('Column must be numerical')
      else:
        not_empty_row = [data[column][i] for i in range(len(data[column])) if i not in empty_row]
        number_not_empty_row = len(not_empty_row)
        if number_not_empty_row % 2 == 0:
          missing_value = (not_empty_row[number_not_empty_row/2] + not_empty_row[number_not_empty_row/2 - 1])/2
        else:
          missing_value = not_empty_row[number_not_empty_row//2]
  elif method == 'mod':
    if column not in cate_attr:
      raise ValueError('Column must be categorical')
    else:
      value_dict = {}
      for index in range(len(data[column])):
        if index not in empty_row:
          if data[column][index] not in value_dict.keys():
            value_dict[data[column][index]] = 1
          else:
            value_dict[data[column][index]] += 1
      value_dict = sorted(value_dict.items(), key= lambda x: x[1], reverse=True)
      print(value_dict)
      missing_value = value_dict[0][0]
  else:
      raise ValueError('Method must be mean, median, or mode')
  # Fill missing value
  print(missing_value)
  for index in empty_row:
    data[column][index] = missing_value
    
    
fill_in(data, 'mod', 'MasVnrType')

## Deleting rows containing more than a particular number of missing values

In [168]:
def delete_row(data, limit_number) -> None:
    if not (1 <= limit_number <= len(data.keys())):
        raise ValueError('Invalid number of missing values')
    # Determine the indexes of missing rows in each column
    index_rows = []
    for feature in data.keys():
        index_rows += index_empty_row(data, feature)
    # Count the missing values in each row
    index_dict = {}
    for index in index_rows:
        if index not in index_dict.keys():
            index_dict[index] = 1
        else:
            index_dict[index] += 1
    # Sort this dictionary descending by keys, because the next step we will remove some row
    # And if we remove the rows at the end first, the indexes of the rest will not be affected
    sorted_index_dict = sorted(index_dict.items(), key= lambda x: x[0], reverse=True)
    # convert it back to dictionary
    index_dict = {item[0]: item[1] for item in sorted_index_dict}
    # Determine the row will be deleted
    for index in index_dict.keys():
        if index_dict[index] >= limit_number:
            for feature in data.keys():
                data[feature].pop(index)
    
delete_row(data, 15)

## Deleting columns containing more than a particular number of missing values

In [185]:
def delete_column(data, limit_number):
    # Determine the missing values for each column
    index_missing = {}
    for feature in data.keys():
        index_missing[feature] = []
        for i, value in enumerate(data[feature]):
            if is_nan(value):
                index_missing[feature].append(i)
    # Remove the column has the number of missing values greater than a particular number
    print(index_missing)
    for feature in index_missing.keys():
        if len(index_missing[feature]) >= limit_number:
            print(feature)
            data.pop(feature)
    
delete_column(data, 500)

{'Id': [], 'MSSubClass': [], 'MSZoning': [], 'LotFrontage': [4, 14, 44, 53, 58, 59, 65, 69, 87, 104, 106, 117, 118, 125, 130, 138, 140, 144, 149, 152, 158, 170, 174, 176, 181, 183, 186, 188, 192, 197, 204, 206, 215, 216, 222, 226, 235, 236, 242, 246, 250, 251, 252, 261, 264, 270, 271, 273, 276, 277, 306, 309, 311, 319, 329, 330, 335, 344, 362, 364, 365, 366, 370, 373, 414, 415, 426, 451, 452, 453, 455, 462, 465, 468, 472, 474, 477, 478, 485, 487, 489, 491, 492, 493, 510, 514, 520, 521, 534, 538, 546, 570, 574, 575, 576, 577, 579, 585, 589, 602, 609, 614, 615, 617, 625, 628, 633, 638, 644, 646, 652, 656, 661, 663, 664, 665, 671, 679, 680, 682, 686, 688, 690, 694, 696, 698, 705, 711, 732, 738, 740, 745, 746, 753, 766, 768, 771, 777, 786, 795, 803, 813, 823, 824, 830, 841, 843, 858, 868, 870, 875, 884, 886, 887, 894, 900, 907, 921, 923, 927, 928, 929, 935, 937, 940, 943, 955, 964, 965, 982, 990, 993, 996], 'LotArea': [], 'Street': [], 'Alley': [0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 1

## Delete duplicate samples

In [201]:
def remove_duplicate(data):
    rows = []
    index_deleted_rows = []
    # Determine the index of duplicate row
    first_column = list(data.keys())[0]
    for i in range(len(data[first_column])):
        row = []
        for feature in data.keys():
            row.append(data[feature][i])
        if row not in rows:
            rows.append(row)
        else:
            index_deleted_rows.append(i)
    # Reverse the index list for removable purpose
    index_deleted_rows.reverse()
    # Remove the duplicate rows
    for index in index_deleted_rows:
        for feature in data.keys():
            data[feature].pop(index)

remove_duplicate(data)

## Normalize a numeric attribute using min-max and Z-score methods.

In [202]:
def mean_feature(feature):
    return sum(feature)/len(feature)

In [203]:
def std_feature(feature):
    mean = mean_feature(feature)
    total_deviation = sum([(value - mean)** 2 for value in feature])
    return math.sqrt(total_deviation/len(feature))

In [208]:
def normalize(data, column)

2.5
1.118033988749895
