In [48]:
import pandas

In [49]:
df = pandas.read_csv('house-prices.csv')
data = df.to_dict(orient='list')

## Extrac columns with missing values

### Create a `check_nan()` function

In [50]:
def is_nan(value) -> bool:
    return (value != value)

### Print the column names with missing values

In [51]:
def extract_column(data) -> list:
    result = []
    for feature in data.keys():
        for value in data[feature]:
            if is_nan(value):
                result.append(feature)
                break
    return result

print(extract_column(data))

['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']


## Count the number of lines with missing values

In [52]:
def count_line_with_missing_values(data) -> int:
    index_empty_row = set()
    for feature in data.keys():
        for index,value in enumerate(data[feature]):
            if is_nan(value):
                index_empty_row.add(index)
    return len(index_empty_row)

print(f"There are {count_line_with_missing_values(data)} lines with missing values")

There are 1000 lines with missing values


## Fill in the missing value using mean, median (for numeric properties) and mode (for categorical properties)

### Divide features into numeric and categorical

In [53]:
def categorizing_feature(data) -> list:
    num_feature = [feature for feature in extract_column(data)]
    cat_feature = []
    for feature in num_feature:
        for value in data[feature]:
            if type(value) == str:
                cat_feature.append(feature)
                num_feature.remove(feature)
                break
    return num_feature, cat_feature
   

### Find the rows with missing values for each column

In [54]:
def index_empty_row(data, column) -> list:
    result = []
    for index in range(len(data[column])):
        if is_nan(data[column][index]):
            result.append(index)
    return result

### Fill in the missing values 

In [55]:
def fill_in(data, method, column) -> None:
  num_attr, cate_attr = categorizing_feature(data)
  empty_row = index_empty_row(data, column)
  print(num_attr)
  if method == 'mean':
    if column not in num_attr:
      raise ValueError('Column must be numerical')
    else:
      for index in empty_row:
        data[column][index] = 0
      missing_value = sum(data[column]) / len(data[column])    
  elif method == 'median':
      if column not in num_attr:
        raise ValueError('Column must be numerical')
      else:
        not_empty_row = [data[column][i] for i in range(len(data[column])) if i not in empty_row]
        number_not_empty_row = len(not_empty_row)
        if number_not_empty_row % 2 == 0:
          missing_value = (not_empty_row[number_not_empty_row/2] + not_empty_row[number_not_empty_row/2 - 1])/2
        else:
          missing_value = not_empty_row[number_not_empty_row//2]
  elif method == 'mod':
    if column not in cate_attr:
      raise ValueError('Column must be categorical')
    else:
      pass
  else:
      raise ValueError('Method must be mean, median, or mode')
  # Fill missing value
  print(missing_value)
  for index in empty_row:
    data[column][index] = missing_value
    
    
fill_in(data, 'median', 'LotFrontage')

['LotFrontage', 'MasVnrType', 'MasVnrArea', 'BsmtCond', 'BsmtFinType1', 'FireplaceQu', 'GarageYrBlt', 'GarageQual', 'PoolQC', 'MiscFeature']
63.0
