In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the CSV that contains missing values into a DataFrame,
# then show the whole frame as the cell output
df = pd.read_csv(filepath_or_buffer='incomplete_test.csv')
df

Unnamed: 0,area,price,category,year,condition
0,Lapland,124000,apartment,1972.0,2.0
1,Lapland,89000,apartment,1984.0,3.0
2,Central,187500,house,2005.0,
3,South,256000,house,,5.0
4,Lapland,156000,house,2011.0,5.0
5,South,176000,apartment,1997.0,
6,Central,367400,house,,5.0
7,Central,166000,apartment,1981.0,3.0
8,South,249000,apartment,2004.0,4.0


In [3]:
# Simplest way to get rid of missing data: discard every row (axis=0)
# that has a NaN in at least one column (how='any')
df = df.dropna(axis=0, how='any')

In [4]:
# dropna() keeps the original row labels of the surviving rows,
# so the index now has gaps where rows were removed
df

Unnamed: 0,area,price,category,year,condition
0,Lapland,124000,apartment,1972.0,2.0
1,Lapland,89000,apartment,1984.0,3.0
4,Lapland,156000,house,2011.0,5.0
7,Central,166000,apartment,1981.0,3.0
8,South,249000,apartment,2004.0,4.0


In [5]:
# Renumber the rows from 0; with drop=False the previous row labels
# are kept as a regular column named 'index'
df = df.reset_index(drop=False)

In [6]:
# Discard the 'index' column that holds the old row labels
df = df.drop(columns='index')

In [7]:
# All good: rows are numbered consecutively from 0 and the
# old 'index' column is gone
df

Unnamed: 0,area,price,category,year,condition
0,Lapland,124000,apartment,1972.0,2.0
1,Lapland,89000,apartment,1984.0,3.0
2,Lapland,156000,house,2011.0,5.0
3,Central,166000,apartment,1981.0,3.0
4,South,249000,apartment,2004.0,4.0


<h3>Start over</h3>

In [8]:
# Start from scratch: read the CSV with the missing values again
# and show the whole frame as the cell output
df = pd.read_csv(filepath_or_buffer='incomplete_test.csv')
df

Unnamed: 0,area,price,category,year,condition
0,Lapland,124000,apartment,1972.0,2.0
1,Lapland,89000,apartment,1984.0,3.0
2,Central,187500,house,2005.0,
3,South,256000,house,,5.0
4,Lapland,156000,house,2011.0,5.0
5,South,176000,apartment,1997.0,
6,Central,367400,house,,5.0
7,Central,166000,apartment,1981.0,3.0
8,South,249000,apartment,2004.0,4.0


In [9]:
# Let's assume the condition is rated on a scale of 1-5,
# so a reasonable default for a missing rating is the midpoint, 3.
# Assign the filled Series back to the column instead of calling
# fillna(..., inplace=True) on df['condition']: an in-place method on a
# chained selection raises a FutureWarning in recent pandas and does not
# modify df at all when Copy-on-Write is enabled.
df['condition'] = df['condition'].fillna(3)
df

Unnamed: 0,area,price,category,year,condition
0,Lapland,124000,apartment,1972.0,2.0
1,Lapland,89000,apartment,1984.0,3.0
2,Central,187500,house,2005.0,3.0
3,South,256000,house,,5.0
4,Lapland,156000,house,2011.0,5.0
5,South,176000,apartment,1997.0,3.0
6,Central,367400,house,,5.0
7,Central,166000,apartment,1981.0,3.0
8,South,249000,apartment,2004.0,4.0


In [10]:
# Replace missing years with the mean year of the dataset
# (Series.mean() skips NaN values by default).
# Assign the result back to the column rather than using
# fillna(..., inplace=True) on df['year']: an in-place method on a chained
# selection raises a FutureWarning in recent pandas and leaves df
# unchanged when Copy-on-Write is enabled.
df['year'] = df['year'].fillna(df['year'].mean())

In [11]:
# Check the result: the previously missing 'year' entries now hold
# the column mean (still a float at this point)
df

Unnamed: 0,area,price,category,year,condition
0,Lapland,124000,apartment,1972.0,2.0
1,Lapland,89000,apartment,1984.0,3.0
2,Central,187500,house,2005.0,3.0
3,South,256000,house,1993.428571,5.0
4,Lapland,156000,house,2011.0,5.0
5,South,176000,apartment,1997.0,3.0
6,Central,367400,house,1993.428571,5.0
7,Central,166000,apartment,1981.0,3.0
8,South,249000,apartment,2004.0,4.0


In [12]:
# Convert the 'year' column to integers; the cast truncates the
# fractional part of the mean-filled values
df = df.astype({'year': int})
df

Unnamed: 0,area,price,category,year,condition
0,Lapland,124000,apartment,1972,2.0
1,Lapland,89000,apartment,1984,3.0
2,Central,187500,house,2005,3.0
3,South,256000,house,1993,5.0
4,Lapland,156000,house,2011,5.0
5,South,176000,apartment,1997,3.0
6,Central,367400,house,1993,5.0
7,Central,166000,apartment,1981,3.0
8,South,249000,apartment,2004,4.0


In [13]:
# List the distinct values of the 'area' column in order of first
# appearance -- handy when a large dataset has many possible categories
pd.unique(df['area'])

array(['Lapland', 'Central', 'South'], dtype=object)

In [14]:
# Count how many distinct areas there are (NaN is not counted)
df['area'].nunique(dropna=True)

3