In [1]:
import csv
import numpy as np
import random
import torch
import torch.utils.data

import pandas as pd

In [2]:
# Part 1 (a)
header = ['age', 'work', 'fnlwgt', 'edu', 'yredu', 'marriage', 'occupation',
 'relationship', 'race', 'sex', 'capgain', 'caploss', 'workhr', 'country','income']
df = pd.read_csv('adult.data', names = header)
print(df.shape[0])

32561


In [3]:
# Part 1 (b)
subdf = df[["age", "yredu", "capgain", "caploss", "workhr"]]
print('Min:\n' + str(np.min(subdf, axis = 0)))
print('\nMax:\n' + str(np.max(subdf, axis = 0)))
print('\nMean:\n' + str(np.mean(subdf, axis = 0)))

Min:
age        17
yredu       1
capgain     0
caploss     0
workhr      1
dtype: int64

Max:
age           90
yredu         16
capgain    99999
caploss     4356
workhr        99
dtype: int64

Mean:
age          38.581647
yredu        10.080679
capgain    1077.648844
caploss      87.303830
workhr       40.437456
dtype: float64


In [4]:
# Part 1 (c)
out = []
for i in range(subdf.shape[1]):
    cur_out = []
    cur_max = np.max(subdf.values[:,i])
    cur_min = np.min(subdf.values[:,i])
    for j in range(subdf.shape[0]):
        if (cur_max-cur_min) == 0:
            cur_out.append(0)
        else:
            cur_out.append((subdf.values[j,i]-cur_min)/(cur_max-cur_min))
    out.append(cur_out)
out = np.array(out).T
print(out)

[[0.30136986 0.8        0.02174022 0.         0.39795918]
 [0.45205479 0.8        0.         0.         0.12244898]
 [0.28767123 0.53333333 0.         0.         0.39795918]
 ...
 [0.56164384 0.53333333 0.         0.         0.39795918]
 [0.06849315 0.53333333 0.         0.         0.19387755]
 [0.47945205 0.53333333 0.1502415  0.         0.39795918]]


In [5]:
# Part 1 (d)
print(sum(df["sex"] == " Male"))
print(sum(df["sex"] == " Female"))

21790
10771


In [6]:
# Part 1 (e)
contcols = ["age", "yredu", "capgain", "caploss", "workhr"]
catcols = ["work", "marriage", "occupation", "edu", "relationship", "sex"]
features = contcols + catcols
df = df[features]

missing = pd.concat([df[c] == " ?" for c in catcols], axis=1).any(axis=1)
df_with_missing = df[missing]
df_not_missing = df[~missing]

In [7]:
print(df_with_missing.shape[0])
print(df_not_missing.shape[0])

1843
30718


In [8]:
# Part 1 (f)
data = pd.get_dummies(df_not_missing)
print('\n'.join(s for s in list(data) if 'work_' in s))

work_ Federal-gov
work_ Local-gov
work_ Private
work_ Self-emp-inc
work_ Self-emp-not-inc
work_ State-gov
work_ Without-pay


In [9]:
# Part 1 (g)
data.shape[1]
# During pd.get_dummies, string value columns are expanded into multiple
# columns each and converted into binary (0 or 1) values.
# This causes the increased number of columns (57).

57

In [10]:
# Part 1 (h)
datanp = data.values.astype(np.float32)

cat_index = {}  # Mapping of feature -> start index of feature in a record
cat_values = {} # Mapping of feature -> list of categorical values the feature can take

# build up the cat_index and cat_values dictionary
for i, header in enumerate(data.keys()):
    if "_" in header: # categorical header
        feature, value = header.split()
        feature = feature[:-1] # remove the last char; it is always an underscore
        if feature not in cat_index:
            cat_index[feature] = i
            cat_values[feature] = [value]
        else:
            cat_values[feature].append(value)

def get_onehot(record, feature):
    """
    Return the portion of `record` that is the one-hot encoding
    of feature. For example, since the feature "work" is stored
    in the indices [5:12] in each record, calling `get_range(record, "work")`
    is equivalent to accessing `record[5:12]`.
    
    Args:
        - record: a numpy array representing one record, formatted
                  the same way as a row in `data.np`
        - feature: a string, should be an element of `catcols`
    """
    start_index = cat_index[feature]
    stop_index = cat_index[feature] + len(cat_values[feature])
    return record[start_index:stop_index]

def get_categorical_value(onehot, feature):
    """
    Return the categorical value name of a feature given
    a one-hot vector representing the feature.
    
    Args:
        - onehot: a numpy array one-hot representation of the feature
        - feature: a string, should be an element of `catcols`
        
    Examples:
    
    >>> get_categorical_value(np.array([0., 0., 0., 0., 0., 1., 0.]), "work")
    'State-gov'
    >>> get_categorical_value(np.array([0.1, 0., 1.1, 0.2, 0., 1., 0.]), "work")
    'Private'
    """
    return cat_values[feature][onehot.argmax(axis=0)]

def get_feature(record, feature):
    """
    Return the categorical feature value of a record
    """
    onehot = get_onehot(record, feature)
    return get_categorical_value(onehot, feature)

def get_features(record):
    """
    Return a dictionary of all categorical feature values of a record
    """
    return { f: get_feature(record, f) for f in catcols }

In [11]:
print(cat_index)
print(cat_values)

{'work': 5, 'marriage': 12, 'occupation': 19, 'edu': 33, 'relationship': 49, 'sex': 55}
{'work': ['Federal-gov', 'Local-gov', 'Private', 'Self-emp-inc', 'Self-emp-not-inc', 'State-gov', 'Without-pay'], 'marriage': ['Divorced', 'Married-AF-spouse', 'Married-civ-spouse', 'Married-spouse-absent', 'Never-married', 'Separated', 'Widowed'], 'occupation': ['Adm-clerical', 'Armed-Forces', 'Craft-repair', 'Exec-managerial', 'Farming-fishing', 'Handlers-cleaners', 'Machine-op-inspct', 'Other-service', 'Priv-house-serv', 'Prof-specialty', 'Protective-serv', 'Sales', 'Tech-support', 'Transport-moving'], 'edu': ['10th', '11th', '12th', '1st-4th', '5th-6th', '7th-8th', '9th', 'Assoc-acdm', 'Assoc-voc', 'Bachelors', 'Doctorate', 'HS-grad', 'Masters', 'Preschool', 'Prof-school', 'Some-college'], 'relationship': ['Husband', 'Not-in-family', 'Other-relative', 'Own-child', 'Unmarried', 'Wife'], 'sex': ['Female', 'Male']}


In [12]:
print(get_categorical_value(np.array([0., 0., 0., 0., 0., 1., 0.]), "work"))
print(get_categorical_value(np.array([0.1, 0., 1.1, 0.2, 0., 1., 0.]), "work"))

State-gov
Private


In [13]:
# Part 1 (i)
np.random.seed(50) # set the numpy seed for consistent split

np.random.shuffle(datanp)
training = datanp[0:int(len(datanp)*0.70)]
valid = datanp[int(len(datanp)*0.70):int(len(datanp)*0.85)]
test = datanp[int(len(datanp)*0.85):int(len(datanp)*1.00)]

In [14]:
print(training.shape)
print(valid.shape)
print(test.shape)

(21502, 57)
(4608, 57)
(4608, 57)


In [15]:
# Part 2
from torch import nn

class AutoEncoder(nn.Module):
    def __init__(self):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(57, 57),
            nn.Sigmoid(),
            nn.Linear(20, 20)
        )
        self.decoder = nn.Sequential(
            nn.Linear(20,20),
            nn.Sigmoid(),
            nn.Linear(57, 57),
            nn.Sigmoid() # get to the range (0, 1)
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

In [16]:
# Part 3 (a) (b)
import matplotlib.pyplot as plt
%matplotlib inline
import os

def zero_out_feature(records, feature):
    """ Set the feature missing in records, by setting the appropriate
    columns of records to 0
    """
    start_index = cat_index[feature]
    stop_index = cat_index[feature] + len(cat_values[feature])
    records[:, start_index:stop_index] = 0
    return records

def zero_out_random_feature(records):
    """ Set one random feature missing in records, by setting the 
    appropriate columns of records to 0
    """
    return zero_out_feature(records, random.choice(catcols))

def get_accuracy(model, data_loader):
    """Return the "accuracy" of the autoencoder model across a data set
    
    Args:
       - model: the autoencoder model, an instance of nn.Module
       - data_loader: an instance of torch.utils.data.DataLoader

    Example (to illustrate how get_accuracy is intended to be called.
             depending on your variable naming this code might not work
             out of the box)

        >>> model = AutoEncoder()
        >>> vdl = torch.utils.data.DataLoader(data_valid, batch_size=256, shuffle=True)
        >>> get_accuracy(model, vdl)
    """
    total = 0
    acc = 0
    for col in catcols:
        for item in data_loader: # minibatches
            inp = item.detach().numpy()
            out = model(zero_out_feature(item.clone(), col)).detach().numpy()
            for i in range(out.shape[0]): # record in minibatch
                acc += int(get_feature(out[i], col) == get_feature(inp[i], col))
                total += 1
    return acc / total

def train(model, train_loader, valid_loader, num_epochs=5, learning_rate=1e-4):
    """ Training loop. You should update this."""
    torch.manual_seed(42)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    
    iters, viters, losses, vlosses, acc, vacc = [], [], [], [], [], []
    
    for epoch in range(num_epochs):
        cur_loss = 0
        n, vn = 0, 0
        for data in train_loader:
            #print('Epoch: '+str(epoch)+' Iter: '+str(n))
            
            datam = zero_out_random_feature(data.clone()) # zero out one categorical feature
            recon = model(datam)
            loss = criterion(recon, data)
            cur_loss += loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            
            n += 1
        
        #print('Accuracy calculated epoch: '+str(epoch))
        iters.append(epoch)
        losses.append(float(cur_loss)/train_loader.batch_size)             # compute *average* loss
        acc.append(get_accuracy(model, train_loader))
        
        cur_vloss = 0
        for vdata in valid_loader:
            
            vdatam = zero_out_random_feature(vdata.clone()) # zero out one categorical feature
            vrecon = model(vdatam)
            vloss = criterion(vrecon, vdata)
            cur_vloss += vloss
            
            vn += 1
        
        #print('vAccuracy calculated: '+str(epoch))
        
        viters.append(epoch)
        vlosses.append(float(cur_vloss)/valid_loader.batch_size)
        vacc.append(get_accuracy(model, valid_loader))
    
    # plotting
    plt.title("Training Curve")
    plt.plot(iters, losses, label="Train")
    plt.xlabel("Iterations")
    plt.ylabel("Loss")
    plt.show()
    
    plt.title("Validation Curve")
    plt.plot(viters, vlosses, label="Validation")
    plt.xlabel("Iterations")
    plt.ylabel("Loss")
    plt.show()
    
    plt.title("Training Curve")
    plt.plot(iters, acc, label="Train")
    plt.xlabel("Iterations")
    plt.ylabel("Acc")
    plt.show()
    
    plt.title("Validation Curve")
    plt.plot(viters, vacc, label="Validation")
    plt.xlabel("Iterations")
    plt.ylabel("Acc")
    plt.show()
    
    #print(iters)
    #print(viters)
    #print(losses)
    #print(vlosses)
    #print(acc)
    #print(vacc)
    
    return model
    
ae = AutoEncoder()
tl = torch.utils.data.DataLoader(training)
vl = torch.utils.data.DataLoader(valid)
train(ae, tl, vl, num_epochs=5)
# result plots problematic, no increasing trend

RuntimeError: size mismatch, m1: [1 x 57], m2: [20 x 20] at c:\a\w\1\s\tmp_conda_3.6_070023\conda\conda-bld\pytorch-cpu_1544079880394\work\aten\src\th\generic/THTensorMath.cpp:940

In [None]:
# Part 3 (c)
# increase batch_size for less results noise (vertical jitter)
# increase learning_rate since current LT maybe too low
ae = AutoEncoder()
tl = torch.utils.data.DataLoader(training, batch_size=1000)
vl = torch.utils.data.DataLoader(valid, batch_size=1000)
train(ae, tl, vl, num_epochs=5, learning_rate=0.01)
# increasing trend found, almost linear

In [None]:
# Part 3 (d)
# 1: increase num_epochs since the results' trend is good but needs more training
ae = AutoEncoder()
tl = torch.utils.data.DataLoader(training, batch_size=1000)
vl = torch.utils.data.DataLoader(valid, batch_size=1000)
train(ae, tl, vl, num_epochs=20, learning_rate=0.01)
# more conventional curves obtained, no clear plateau

In [None]:
# 2: further increase num_epochs since the increasing trend has not plateaued
ae = AutoEncoder()
tl = torch.utils.data.DataLoader(training, batch_size=1000)
vl = torch.utils.data.DataLoader(valid, batch_size=1000)
train(ae, tl, vl, num_epochs=40, learning_rate=0.01)
# conventional curves obtained, significant noise

In [None]:
# 3: increase batch_size to reduce results noise
ae = AutoEncoder()
tl = torch.utils.data.DataLoader(training, batch_size=2000)
vl = torch.utils.data.DataLoader(valid, batch_size=2000)
train(ae, tl, vl, num_epochs=40, learning_rate=0.01)
# noise reduced, clearer increasing trend

In [None]:
# 4: increase learning_rate to see if the results improve
ae = AutoEncoder()
tl = torch.utils.data.DataLoader(training, batch_size=2000)
vl = torch.utils.data.DataLoader(valid, batch_size=2000)
train(ae, tl, vl, num_epochs=40, learning_rate=0.05)
# results plateaued but worsened

In [None]:
# Part 4 (a)
ae = AutoEncoder()
tl = torch.utils.data.DataLoader(training, batch_size=2000)
vl = torch.utils.data.DataLoader(valid, batch_size=2000)
m = train(ae, tl, vl, num_epochs=31, learning_rate=0.01)

r = get_accuracy(ae, torch.utils.data.DataLoader(test, batch_size=2000))
print(r)

In [None]:
# Part 4 (b)
from collections import Counter

temp_df = df['marriage']
cou = Counter(temp_df)
n = cou.most_common(1)

n[0][1] / df.shape[0]

In [None]:
# Part 4 (c)
df
# Yes. As seen in the data, a person's education level is highly correlated
# to the number of years of education the person has.
# A person's occupation also seem to correlate to their education level.

In [None]:
# Part 4 (d)
from torch.autograd import Variable

o1 = get_onehot(test, 'edu')[0]
test = zero_out_feature(test, 'edu')
o2 = get_onehot(ae(Variable(torch.from_numpy(test))), 'edu')[0]

In [None]:
get_onehot(datanp[0], 'edu')

In [None]:
get_onehot(o2, 'edu')

In [None]:
cat_values # model predicted this person to be HS-grad (1.7371e-01) or
# Some-college (1.6761e-01)