# <center> Testing on Titanic Dataset </center>

In [1]:
# Basic libraries
import pandas as pd
import numpy as np
import pickle
import warnings
warnings.filterwarnings('ignore')

# datawig
import datawig

In [2]:
# Import data
# NOTE(review): pickle.load can execute arbitrary code; only load this file if it
# comes from a trusted source.
with open("titanic_df.p", "rb") as pickle_file:  # context manager closes the file handle
    df = pickle.load(pickle_file)
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Column dtypes and non-null counts of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Survived  891 non-null    category
 1   Pclass    891 non-null    category
 2   Name      891 non-null    object  
 3   Sex       891 non-null    category
 4   Age       714 non-null    float64 
 5   SibSp     891 non-null    int64   
 6   Parch     891 non-null    int64   
 7   Ticket    891 non-null    object  
 8   Fare      891 non-null    float64 
 9   Cabin     204 non-null    object  
 10  Embarked  889 non-null    category
dtypes: category(4), float64(2), int64(2), object(3)
memory usage: 59.6+ KB


In [4]:
# Number of missing values in each column
df.isna().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [5]:
# Share of missing entries in each column, expressed in percent
df.isna().sum().div(len(df)).mul(100)

2022-08-05 20:37:11,468 [INFO]  Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2022-08-05 20:37:11,468 [INFO]  NumExpr defaulting to 8 threads.


Survived     0.000000
Pclass       0.000000
Name         0.000000
Sex          0.000000
Age         19.865320
SibSp        0.000000
Parch        0.000000
Ticket       0.000000
Fare         0.000000
Cabin       77.104377
Embarked     0.224467
dtype: float64

# Testing SimpleImputer.complete() function

### Test 1
Running the `SimpleImputer.complete()` function on current data types returns an error.

In [6]:
# Demonstration of the failure: with category-typed columns still present in df,
# complete() raises a ValueError. Catch and print it so the notebook can still be
# executed top to bottom (Restart & Run All) past this cell.
try:
    df_complete = datawig.SimpleImputer.complete(df, precision_threshold = 0.0)
except ValueError as error:
    print(f"ValueError: {error}")
else:
    # Only reached if the call unexpectedly succeeds
    print(df_complete.isna().sum())

ValueError: fill value must be in categories

### Test 2 
Running `SimpleImputer.complete()` without "category" data types actually imputes all columns except the categorical Cabin. Embarked was imputed successfully, probably due to the low missingness in the column.

In [7]:
# Cast the category columns: Survived/Pclass to integers, Sex/Embarked to plain objects
df = df.astype({
    "Survived": "int",
    "Pclass": "int",
    "Sex": "object",
    "Embarked": "object",
})

In [8]:
# Retry the automatic imputation now that the category columns have been converted,
# then count the missing values left in each column
df_complete = datawig.SimpleImputer.complete(df, precision_threshold = 0.0)
df_complete.isna().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age           0
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      0
dtype: int64

Imputing with a higher precision_threshold shows the same behaviour (at least on the surface).

In [9]:
# Same call with precision_threshold raised from 0.0 to 0.8
df_complete = datawig.SimpleImputer.complete(df, precision_threshold = 0.8)
df_complete.isna().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age           0
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      0
dtype: int64

### Test 3
Running `SimpleImputer.complete()` on a dataset full of object columns did not impute any values, except in Embarked, which was missing only two values.

In [10]:
# Prevent ValueError by forcing columns to string type while keeping missing values
# as real NaN. astype(str) turns NaN into the literal "nan", so the converted frame is
# masked with the original missingness instead of searching for "nan" afterwards:
# the old str.contains("nan") guard was a substring match, and the chained
# df[col].replace(..., inplace=True) can act on a temporary copy (pandas Copy-on-Write),
# leaving df unchanged.
df = df.astype(str).where(df.notna())

In [11]:
# Run complete() on the all-string frame and count the missing values that remain
df_complete = datawig.SimpleImputer.complete(df, precision_threshold = 0.0)
df_complete.isna().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      0
dtype: int64

# Testing custom imputation models (categorical columns only)
(Kernel restart)

### Test 1
Running a custom imputation model with all columns as inputs. Category types were converted.

In [3]:
# Cast the category columns: Survived/Pclass to integers, Sex/Embarked to plain objects
df = df.astype({
    "Survived": "int",
    "Pclass": "int",
    "Sex": "object",
    "Embarked": "object",
})

In [4]:
# Split data for SimpleImputer into a training frame and a test frame
# NOTE(review): split ratio and seeding are left to random_split's defaults — confirm in the datawig docs
df_train, df_test = datawig.utils.random_split(df)

In [5]:
# Custom imputation model for Cabin: every other column serves as an input feature
cabin_feature_columns = [
    'Survived', 'Pclass', 'Name', 'Sex', 'SibSp',
    'Parch', 'Ticket', 'Fare', 'Age', 'Embarked',
]
imputer_Cabin = datawig.SimpleImputer(
    input_columns=cabin_feature_columns,
    output_column='Cabin',
    output_path='imputer_model',
)

# Train on the training split, then impute Cabin for the held-out split
imputer_Cabin.fit(train_df=df_train, num_epochs=50)
predictions = imputer_Cabin.predict(df_test)

# Read the metrics datawig provides for categorical variables
metrics = imputer_Cabin.load_metrics()
weighted_f1, avg_precision = metrics['weighted_f1'], metrics['avg_precision']
print("weighted_f1 :", weighted_f1, "\n", "avg_precision :", avg_precision)

2022-08-05 20:51:07,731 [INFO]  CategoricalEncoder for column Cabin                                found only 45 occurrences of value C
2022-08-05 20:51:07,732 [INFO]  CategoricalEncoder for column Cabin                                found only 38 occurrences of value B
2022-08-05 20:51:07,732 [INFO]  CategoricalEncoder for column Cabin                                found only 23 occurrences of value Other
2022-08-05 20:51:07,733 [INFO]  CategoricalEncoder for column Cabin                                found only 23 occurrences of value E
2022-08-05 20:51:07,733 [INFO]  CategoricalEncoder for column Cabin                                found only 22 occurrences of value D
2022-08-05 20:51:07,756 [INFO]  Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2022-08-05 20:51:07,756 [INFO]  NumExpr defaulting to 8 threads.
2022-08-05 20:51:07,789 [INFO]  
2022-08-05 20:51:07,814 [INFO]  Epoch[0] Batch [0-5]	Speed: 4196.77 samples/sec	cross-ent

weighted_f1 : 0.45333333333333337 
 avg_precision : 0.3833333333333333


### Test 2
Running a custom imputation model with all columns as inputs. All columns were converted to type object.

In [6]:
# Prevent ValueError by forcing columns to string type while keeping missing values
# as real NaN. astype(str) turns NaN into the literal "nan", so the converted frame is
# masked with the original missingness instead of searching for "nan" afterwards:
# the old str.contains("nan") guard was a substring match, and the chained
# df[col].replace(..., inplace=True) can act on a temporary copy (pandas Copy-on-Write),
# leaving df unchanged.
df = df.astype(str).where(df.notna())

In [7]:
# Split the all-string frame for SimpleImputer into a training frame and a test frame
# NOTE(review): split ratio and seeding are left to random_split's defaults — confirm in the datawig docs
df_train, df_test = datawig.utils.random_split(df)

In [8]:
# Custom imputation model for Cabin: every other column serves as an input feature
cabin_feature_columns = [
    'Survived', 'Pclass', 'Name', 'Sex', 'SibSp',
    'Parch', 'Ticket', 'Fare', 'Age', 'Embarked',
]
imputer_Cabin = datawig.SimpleImputer(
    input_columns=cabin_feature_columns,
    output_column='Cabin',
    output_path='imputer_model',
)

# Train on the training split, then impute Cabin for the held-out split
imputer_Cabin.fit(train_df=df_train, num_epochs=50)
predictions = imputer_Cabin.predict(df_test)

# Read the metrics datawig provides for categorical variables
metrics = imputer_Cabin.load_metrics()
weighted_f1, avg_precision = metrics['weighted_f1'], metrics['avg_precision']
print("weighted_f1 :", weighted_f1, "\n", "avg_precision :", avg_precision)

weighted_f1 : 0.4126984126984127 
 avg_precision : 0.365


# <center> Testing on Melbourne Housing Market Dataset </center>
(Kernel restart)

In [1]:
# Basic libraries
import pandas as pd
import numpy as np
import pickle
import warnings
warnings.filterwarnings('ignore')

# datawig
import datawig

In [2]:
# Show up to 500 columns when displaying wide frames
pd.set_option('display.max_columns', 500)

# Import data, keeping CSV columns 0, 2-6, 8 and 10-20 (selected by position)
kept_column_positions = [0] + list(range(2, 7)) + [8] + list(range(10, 21))
df = pd.read_csv("melb_data.csv", usecols=kept_column_positions)
df.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Method,SellerG,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,2,h,1480000.0,S,Biggin,2.5,2.0,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,2,h,1035000.0,S,Biggin,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,3,h,1465000.0,SP,Biggin,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,3,h,850000.0,PI,Biggin,2.5,3.0,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,4,h,1600000.0,VB,Nelson,2.5,3.0,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [3]:
# Column dtypes and non-null counts of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Rooms          13580 non-null  int64  
 2   Type           13580 non-null  object 
 3   Price          13580 non-null  float64
 4   Method         13580 non-null  object 
 5   SellerG        13580 non-null  object 
 6   Distance       13580 non-null  float64
 7   Bedroom2       13580 non-null  float64
 8   Bathroom       13580 non-null  float64
 9   Car            13518 non-null  float64
 10  Landsize       13580 non-null  float64
 11  BuildingArea   7130 non-null   float64
 12  YearBuilt      8205 non-null   float64
 13  CouncilArea    12211 non-null  object 
 14  Lattitude      13580 non-null  float64
 15  Longtitude     13580 non-null  float64
 16  Regionname     13580 non-null  object 
 17  Propertycount  13580 non-null  float64
dtypes: flo

In [4]:
# Number of missing values in each column
df.isna().sum()

Suburb              0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Distance            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

In [5]:
# Share of missing entries in each column, expressed in percent
df.isna().sum().div(len(df)).mul(100)

2022-08-05 20:51:35,193 [INFO]  Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2022-08-05 20:51:35,193 [INFO]  NumExpr defaulting to 8 threads.


Suburb            0.000000
Rooms             0.000000
Type              0.000000
Price             0.000000
Method            0.000000
SellerG           0.000000
Distance          0.000000
Bedroom2          0.000000
Bathroom          0.000000
Car               0.456554
Landsize          0.000000
BuildingArea     47.496318
YearBuilt        39.580265
CouncilArea      10.081001
Lattitude         0.000000
Longtitude        0.000000
Regionname        0.000000
Propertycount     0.000000
dtype: float64

### Test 1
Running `SimpleImputer.complete()` on current datatypes (no category types) imputes all columns except the categorical one.

In [6]:
# Run the automatic imputation on the frame as loaded (no category dtypes),
# then count the missing values left in each column
df_complete = datawig.SimpleImputer.complete(df, precision_threshold = 0.0)
df_complete.isna().sum()

Suburb              0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Distance            0
Bedroom2            0
Bathroom            0
Car                 0
Landsize            0
BuildingArea        0
YearBuilt           0
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

### Test 2
Increasing precision_threshold shows no difference.

In [7]:
# Same call with precision_threshold raised from 0.0 to 0.8
df_complete = datawig.SimpleImputer.complete(df, precision_threshold = 0.8)
df_complete.isna().sum()

Suburb              0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Distance            0
Bedroom2            0
Bathroom            0
Car                 0
Landsize            0
BuildingArea        0
YearBuilt           0
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

### Test 3
Running `SimpleImputer.complete()` on a dataset full of object columns (numerics converted) did not impute any values.

In [12]:
# Force every column to string type while keeping missing values as real NaN.
# astype(str) turns NaN into the literal "nan", so the converted frame is masked with
# the original missingness instead of searching for "nan" afterwards: the old
# str.contains("nan") guard was a substring match, and the chained
# df[col].replace(..., inplace=True) can act on a temporary copy (pandas Copy-on-Write),
# leaving df unchanged.
df = df.astype(str).where(df.notna())

In [13]:
# Run complete() on the all-string frame and count the missing values that remain
df_complete = datawig.SimpleImputer.complete(df, precision_threshold = 0.0)
df_complete.isna().sum()

Suburb              0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Distance            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

# Testing custom imputation models (categorical columns only)
(Kernel restart)

### Test 1
Running a custom imputation model with all columns as inputs.

In [3]:
# Split data for SimpleImputer into a training frame and a test frame
# NOTE(review): split ratio and seeding are left to random_split's defaults — confirm in the datawig docs
df_train, df_test = datawig.utils.random_split(df)

In [4]:
# Custom imputation model for CouncilArea: every other column serves as an input feature
council_feature_columns = [
    'Suburb', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
    'Distance', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea',
    'YearBuilt', 'Lattitude', 'Longtitude', 'Regionname', 'Propertycount',
]
imputer_CouncilArea = datawig.SimpleImputer(
    input_columns=council_feature_columns,
    output_column='CouncilArea',
    output_path='imputer_model',
)

# Train on the training split, then impute CouncilArea for the held-out split
imputer_CouncilArea.fit(train_df=df_train, num_epochs=50)
predictions = imputer_CouncilArea.predict(df_test)

# Read the metrics datawig provides for categorical variables
metrics = imputer_CouncilArea.load_metrics()
weighted_f1, avg_precision = metrics['weighted_f1'], metrics['avg_precision']
print("weighted_f1 :", weighted_f1, "\n", "avg_precision :", avg_precision)

2022-08-05 22:40:43,303 [INFO]  CategoricalEncoder for column CouncilArea                                found only 63 occurrences of value Knox
2022-08-05 22:40:43,304 [INFO]  CategoricalEncoder for column CouncilArea                                found only 52 occurrences of value Wyndham
2022-08-05 22:40:43,304 [INFO]  CategoricalEncoder for column CouncilArea                                found only 52 occurrences of value Maroondah
2022-08-05 22:40:43,304 [INFO]  CategoricalEncoder for column CouncilArea                                found only 46 occurrences of value Melton
2022-08-05 22:40:43,305 [INFO]  CategoricalEncoder for column CouncilArea                                found only 37 occurrences of value Greater Dandenong
2022-08-05 22:40:43,305 [INFO]  CategoricalEncoder for column CouncilArea                                found only 36 occurrences of value Frankston
2022-08-05 22:40:43,305 [INFO]  CategoricalEncoder for column CouncilArea                             

2022-08-05 22:41:34,342 [INFO]  Epoch[10] Train-cross-entropy=0.044840
2022-08-05 22:41:34,342 [INFO]  Epoch[10] Train-CouncilArea-accuracy=0.985087
2022-08-05 22:41:34,343 [INFO]  Epoch[10] Time cost=4.453
2022-08-05 22:41:34,349 [INFO]  Saved checkpoint to "imputer_model/model-0010.params"
2022-08-05 22:41:34,535 [INFO]  Epoch[10] Validation-cross-entropy=0.111105
2022-08-05 22:41:34,536 [INFO]  Epoch[10] Validation-CouncilArea-accuracy=0.974385
2022-08-05 22:41:36,792 [INFO]  Epoch[11] Batch [0-275]	Speed: 1958.36 samples/sec	cross-entropy=0.037764	CouncilArea-accuracy=0.986639
2022-08-05 22:41:39,011 [INFO]  Epoch[11] Train-cross-entropy=0.037502
2022-08-05 22:41:39,012 [INFO]  Epoch[11] Train-CouncilArea-accuracy=0.986794
2022-08-05 22:41:39,012 [INFO]  Epoch[11] Time cost=4.476
2022-08-05 22:41:39,017 [INFO]  Saved checkpoint to "imputer_model/model-0011.params"
2022-08-05 22:41:39,206 [INFO]  Epoch[11] Validation-cross-entropy=0.095480
2022-08-05 22:41:39,207 [INFO]  Epoch[11] V

weighted_f1 : 0.9784770440916246 
 avg_precision : 0.9574437467473303


### Test 2
Running a custom imputation model with all columns as inputs. All columns were converted to type object.

In [8]:
# Force every column to string type while keeping missing values as real NaN.
# astype(str) turns NaN into the literal "nan", so the converted frame is masked with
# the original missingness instead of searching for "nan" afterwards: the old
# str.contains("nan") guard was a substring match, and the chained
# df[col].replace(..., inplace=True) can act on a temporary copy (pandas Copy-on-Write),
# leaving df unchanged.
df = df.astype(str).where(df.notna())

In [9]:
# Split the all-string frame for SimpleImputer into a training frame and a test frame
# NOTE(review): split ratio and seeding are left to random_split's defaults — confirm in the datawig docs
df_train, df_test = datawig.utils.random_split(df)

In [10]:
# Custom imputation model for CouncilArea: every other column serves as an input feature
council_feature_columns = [
    'Suburb', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
    'Distance', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea',
    'YearBuilt', 'Lattitude', 'Longtitude', 'Regionname', 'Propertycount',
]
imputer_CouncilArea = datawig.SimpleImputer(
    input_columns=council_feature_columns,
    output_column='CouncilArea',
    output_path='imputer_model',
)

# Train on the training split, then impute CouncilArea for the held-out split
imputer_CouncilArea.fit(train_df=df_train, num_epochs=50)
predictions = imputer_CouncilArea.predict(df_test)

# Read the metrics datawig provides for categorical variables
metrics = imputer_CouncilArea.load_metrics()
weighted_f1, avg_precision = metrics['weighted_f1'], metrics['avg_precision']
print("weighted_f1 :", weighted_f1, "\n", "avg_precision :", avg_precision)

weighted_f1 : 0.9810940777061792 
 avg_precision : 0.9363669560504246


# Conclusions: 
- `SimpleImputer.complete()` will return an error if columns with type category exist in the dataset
- `SimpleImputer.complete()` will not work properly if numeric columns are converted to type object
- `SimpleImputer.complete()` will not impute categorical columns unless missingness is very low
<br>
For custom imputation models, converting numeric columns to type object showed inconsistent results. Sometimes precision was higher and sometimes lower, but the difference was small. Finally, as with any imputation method, the higher the column missingness, the worse the results.
<br>

How does datawig behave with datetime columns? That's a problem for future Homer. Man, I don't envy that guy.
