In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
## load the data...
# Training table. Later cells treat every column but the last as features
# (iloc[:, :-1]) and the last column as the target (iloc[:, -1]).
train_df = pd.read_csv('data/train_final.csv')
# Test table with its first column dropped (presumably a row identifier --
# TODO confirm); the submission cells generate their own 1-based "ID" values.
test_df = pd.read_csv('data/test_final.csv').iloc[:, 1:]

In [None]:
# Show the column names of the training table loaded above.
print(train_df.columns)
# Disabled check: would list the training rows whose workclass is the "?" placeholder.
#print(train_df.loc[train_df['workclass'] =="?"])


In [None]:
import torch
import torch.nn as nn

In [None]:
from torch.utils.data import Dataset, DataLoader
# Categorical columns of the table and the values each one may take.
# The 1-based position of a value in its list is the integer code stored in
# ``max_id_dict``; "?" is the placeholder the CSV files use for a missing value.
_T_CATEGORICAL_ATTRIB = {
    "workclass": ["Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov",
                  "Local-gov", "State-gov", "Without-pay", "Never-worked"],
    "education": ["Bachelors", "Some-college", "11th", "HS-grad", "Prof-school",
                  "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters",
                  "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool"],
    "marital.status": ["Married-civ-spouse", "Divorced", "Never-married", "Separated",
                       "Widowed", "Married-spouse-absent", "Married-AF-spouse"],
    "occupation": ["Tech-support", "Craft-repair", "Other-service", "Sales", "Exec-managerial",
                   "Prof-specialty", "Handlers-cleaners", "Machine-op-inspct", "Adm-clerical",
                   "Farming-fishing", "Transport-moving", "Priv-house-serv", "Protective-serv",
                   "Armed-Forces"],
    "relationship": ["Wife", "Own-child", "Husband", "Not-in-family", "Other-relative",
                     "Unmarried"],
    "race": ["White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black"],
    "sex": ["Female", "Male"],
    "native.country": ["United-States", "Cambodia", "England", "Puerto-Rico",
                       "Canada", "Germany", "Outlying-US(Guam-USVI-etc)", "India",
                       "Japan", "Greece", "South", "China", "Cuba", "Iran", "Honduras",
                       "Philippines", "Italy", "Poland", "Jamaica", "Vietnam", "Mexico",
                       "Portugal", "Ireland", "France", "Dominican-Republic", "Laos",
                       "Ecuador", "Taiwan", "Haiti", "Columbia", "Hungary", "Guatemala",
                       "Nicaragua", "Scotland", "Thailand", "Yugoslavia", "El-Salvador",
                       "Trinadad&Tobago", "Peru", "Hong", "Holand-Netherlands"],
}
_T_MISSING = "?"


def _t_most_frequent_code(column, values):
    """Return the 1-based code of the most frequent *known* value in ``column``.

    Values that are not listed in ``values`` (including the "?" placeholder) are
    ignored, so the placeholder can never be selected as its own replacement.
    Falls back to code 1 when the column holds no known value at all.
    """
    counts = column[column.isin(values)].value_counts()
    if counts.empty:
        return 1
    return values.index(counts.idxmax()) + 1


def _t_one_hot(frame, max_id_dict):
    """One-hot encode the categorical columns of ``frame`` with a fixed layout.

    Every value listed in ``_T_CATEGORICAL_ATTRIB`` gets its own float column
    named ``<column>_<value>``, whether or not it occurs in ``frame``, so frames
    built from different files share the same columns in the same order.
    "?" entries are replaced by the value whose 1-based code is
    ``max_id_dict[column]`` before encoding. Non-categorical columns are kept
    as they are and come first.
    """
    pieces = [frame.drop(columns=list(_T_CATEGORICAL_ATTRIB))]
    for key, values in _T_CATEGORICAL_ATTRIB.items():
        fill_value = values[int(max_id_dict[key]) - 1]
        column = frame[key].replace(_T_MISSING, fill_value)
        # Build the indicator columns explicitly (as floats) so the later
        # subtraction/division never runs on bool columns.
        pieces.append(pd.DataFrame(
            {key + "_" + value: (column == value).astype(float) for value in values},
            index=frame.index,
        ))
    return pd.concat(pieces, axis=1)


def _t_min_max_scale(frame, max_list):
    """Scale each column with ``(x - min) / (max - min)`` using ``max_list[column] == [max, min]``.

    A zero range is treated as 1 so constant columns do not divide by zero.
    """
    scaled = frame.copy()
    for attrib in frame.columns:
        max_, min_ = max_list[attrib]
        range_ = max_ - min_
        range_ = 1 if range_ == 0 else range_
        scaled[attrib] = (frame[attrib] - min_) / range_
    return scaled


class TDatasetTrain(Dataset):
    """Training set with one-hot encoded categorical columns and min-max scaled features.

    The CSV at ``path`` is read with every column but the last as features and
    the last column as the target.

    Attributes:
        max_id_dict: per categorical column, the 1-based code of its most
            frequent known value (used to fill "?" entries).
        max_dict: per feature column, ``[max, min]`` measured on this file.
        columns: names of the feature columns, in the order used by ``X``.
        X: 2-D array of scaled features.
        Y: 1-D array of targets.
    """

    def __init__(self, path):
        train_df = pd.read_csv(path)
        features = train_df.iloc[:, :-1]

        self.max_id_dict = {
            key: _t_most_frequent_code(features[key], values)
            for key, values in _T_CATEGORICAL_ATTRIB.items()
        }

        X = _t_one_hot(features, self.max_id_dict).apply(pd.to_numeric)
        # Statistics are taken before scaling so the test set can reuse them.
        self.max_dict = {attrib: [X[attrib].max(), X[attrib].min()] for attrib in X.columns}
        X = _t_min_max_scale(X, self.max_dict)

        self.columns = list(X.columns)
        self.X = X.values
        self.Y = train_df.iloc[:, -1].values

    def __len__(self):
        return len(self.Y)

    def __getitem__(self, idx):
        """Return ``(features, target)`` as float tensors; the target has shape ``(1,)``."""
        return torch.tensor(self.X[idx]).float(),  torch.tensor([self.Y[idx]]).float()


class TDatasetTest(Dataset):
    """Test set encoded and scaled exactly like ``TDatasetTrain``.

    Args:
        path: CSV file whose first column is skipped; the rest are features.
        max_list: ``max_dict`` of the training set (``column -> [max, min]``);
            its key order also fixes the column order of ``X``.
        max_id_dict: ``max_id_dict`` of the training set, used to fill "?".

    Raises:
        KeyError: if a column present in ``max_list`` cannot be produced from the file.

    ``Y`` is a zero vector with one entry per row (the file carries no targets).
    """

    def __init__(self, path, max_list, max_id_dict):
        test_df = pd.read_csv(path)
        features = test_df.iloc[:, 1:]

        X = _t_one_hot(features, max_id_dict).apply(pd.to_numeric)
        # Select the training columns in training order so both matrices line up.
        X = X[list(max_list)]
        X = _t_min_max_scale(X, max_list)

        self.columns = list(X.columns)
        self.X = X.values
        self.Y = np.zeros((test_df.shape[0]))

    def __len__(self):
        return len(self.Y)

    def __getitem__(self, idx):
        """Return ``(features, placeholder_target)`` as float tensors."""
        return torch.tensor(self.X[idx]).float(),  torch.tensor([self.Y[idx]]).float()

#test_data = TDatasetTest("data/test_final.csv", train_data.max_dict, train_data.max_id_dict)


In [None]:
from torch.utils.data import Dataset, DataLoader
class LDatasetTrain(Dataset):
    """Training set with integer-coded categorical columns and min-max scaled features.

    The CSV at ``path`` is read with every column but the last as features and
    the last column as the target. Each categorical value is replaced by its
    1-based position in the list below; "?" entries are replaced by the code of
    the most frequent known value of that column.

    Args:
        path: location of the training CSV.
        max_list: accepted for signature compatibility; not used.

    Attributes:
        max_id_dict: per categorical column, the code used to fill "?".
        max_dict: per feature column, ``[max, min]`` measured before scaling.
        X: 2-D array of scaled features.
        Y: 1-D array of targets.
    """

    def __init__(self, path, max_list=None):
        train_df = pd.read_csv(path)
        X = train_df.iloc[:, :-1].copy()
        categorical_attrib = {
            "workclass": ["Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov",
                          "Local-gov", "State-gov", "Without-pay", "Never-worked"],
            "education": ["Bachelors", "Some-college", "11th", "HS-grad", "Prof-school",
                          "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters",
                          "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool"],
            "marital.status": ["Married-civ-spouse", "Divorced", "Never-married", "Separated",
                               "Widowed", "Married-spouse-absent", "Married-AF-spouse"],
            "occupation": ["Tech-support", "Craft-repair", "Other-service", "Sales", "Exec-managerial",
                           "Prof-specialty", "Handlers-cleaners", "Machine-op-inspct", "Adm-clerical",
                           "Farming-fishing", "Transport-moving", "Priv-house-serv", "Protective-serv",
                           "Armed-Forces"],
            "relationship": ["Wife", "Own-child", "Husband", "Not-in-family", "Other-relative",
                             "Unmarried"],
            "race": ["White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black"],
            "sex": ["Female", "Male"],
            "native.country": ["United-States", "Cambodia", "England", "Puerto-Rico",
                               "Canada", "Germany", "Outlying-US(Guam-USVI-etc)", "India",
                               "Japan", "Greece", "South", "China", "Cuba", "Iran", "Honduras",
                               "Philippines", "Italy", "Poland", "Jamaica", "Vietnam", "Mexico",
                               "Portugal", "Ireland", "France", "Dominican-Republic", "Laos",
                               "Ecuador", "Taiwan", "Haiti", "Columbia", "Hungary", "Guatemala",
                               "Nicaragua", "Scotland", "Thailand", "Yugoslavia", "El-Salvador",
                               "Trinadad&Tobago", "Peru", "Hong", "Holand-Netherlands"],
        }

        self.max_id_dict = {}
        for key, value in categorical_attrib.items():
            codes = {arrtib: idx + 1 for idx, arrtib in enumerate(value)}
            # Pick the fill code among known values only; otherwise a column
            # dominated by "?" would "fill" itself with "?" and the numeric
            # conversion below would fail.
            known = X[key][X[key].isin(value)]
            max_id = codes[known.value_counts().idxmax()] if len(known) else 1
            self.max_id_dict[key] = max_id
            codes["?"] = max_id
            # Values that are neither listed nor "?" are passed through untouched.
            X[key] = X[key].map(lambda raw: codes.get(raw, raw))

        X = X.apply(pd.to_numeric)
        max_dict = {}
        for attrib in X.columns:
            max_, min_ = X[attrib].max(), X[attrib].min()
            max_dict[attrib] = [max_, min_]
            range_ = max_ - min_
            # Constant columns: avoid dividing by zero.
            range_ = 1 if range_ == 0 else range_
            X[attrib] = (X[attrib] - min_) / range_
        self.max_dict = max_dict

        self.X = X.values
        self.Y = train_df.iloc[:, -1].values

    def __len__(self):
        return len(self.Y)

    def __getitem__(self, idx):
        """Return ``(features, target)`` as float tensors; the target has shape ``(1,)``."""
        return torch.tensor(self.X[idx]).float(),  torch.tensor([self.Y[idx]]).float()
    
from torch.utils.data import Dataset, DataLoader
class LDatasetTest(Dataset):
    """Test set with integer-coded categorical columns, scaled with training statistics.

    Args:
        path: CSV file whose first column is skipped; the rest are features.
        max_list: mapping ``column -> [max, min]`` used for min-max scaling.
        max_id_dict: mapping ``categorical column -> replacement`` for "?" entries.

    ``X`` holds the scaled features and ``Y`` is a zero vector with one entry
    per row of the file.
    """

    def __init__(self, path, max_list, max_id_dict):
        raw = pd.read_csv(path)
        X = raw.iloc[:, 1:].copy()

        categorical_attrib = {
            "workclass": ["Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov",
                          "Local-gov", "State-gov", "Without-pay", "Never-worked"],
            "education": ["Bachelors", "Some-college", "11th", "HS-grad", "Prof-school",
                          "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters",
                          "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool"],
            "marital.status": ["Married-civ-spouse", "Divorced", "Never-married", "Separated",
                               "Widowed", "Married-spouse-absent", "Married-AF-spouse"],
            "occupation": ["Tech-support", "Craft-repair", "Other-service", "Sales", "Exec-managerial",
                           "Prof-specialty", "Handlers-cleaners", "Machine-op-inspct", "Adm-clerical",
                           "Farming-fishing", "Transport-moving", "Priv-house-serv", "Protective-serv",
                           "Armed-Forces"],
            "relationship": ["Wife", "Own-child", "Husband", "Not-in-family", "Other-relative",
                             "Unmarried"],
            "race": ["White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black"],
            "sex": ["Female", "Male"],
            "native.country": ["United-States", "Cambodia", "England", "Puerto-Rico",
                               "Canada", "Germany", "Outlying-US(Guam-USVI-etc)", "India",
                               "Japan", "Greece", "South", "China", "Cuba", "Iran", "Honduras",
                               "Philippines", "Italy", "Poland", "Jamaica", "Vietnam", "Mexico",
                               "Portugal", "Ireland", "France", "Dominican-Republic", "Laos",
                               "Ecuador", "Taiwan", "Haiti", "Columbia", "Hungary", "Guatemala",
                               "Nicaragua", "Scotland", "Thailand", "Yugoslavia", "El-Salvador",
                               "Trinadad&Tobago", "Peru", "Hong", "Holand-Netherlands"],
        }

        for column_name, labels in categorical_attrib.items():
            # 1-based position of each label, plus the caller-supplied stand-in for "?".
            lookup = {label: position for position, label in enumerate(labels, start=1)}
            lookup["?"] = max_id_dict[column_name]
            X[column_name] = X[column_name].map(lambda raw_value: lookup.get(raw_value, raw_value))

        X = X.apply(pd.to_numeric)
        for attrib in X.columns:
            upper, lower = max_list[attrib][0], max_list[attrib][1]
            span = upper - lower
            if span == 0:
                # Constant training column: leave the offset value unscaled.
                span = 1
            X[attrib] = (X[attrib] - lower) / span

        self.X = X.values
        self.Y = np.zeros((raw.shape[0]))

    def __len__(self):
        return len(self.Y)

    def __getitem__(self, idx):
        """Return ``(features, placeholder_target)`` as float tensors."""
        features = torch.tensor(self.X[idx]).float()
        target = torch.tensor([self.Y[idx]]).float()
        return features, target

In [None]:
def weight_init(module, initf):
    """Build a callback for ``nn.Module.apply`` that re-initialises weights.

    Args:
        module: module class (or tuple of classes) whose instances are affected.
        initf: in-place initialiser called as ``initf(m.weight)``.

    Returns:
        A function taking one module; it calls ``initf`` on the module's
        ``weight`` when the module is an instance of ``module`` and does
        nothing otherwise. Only ``weight`` is touched.
    """
    def foo(m):
        if isinstance(m, module):
            initf(m.weight)
    return foo

class NeuralNtwrk(nn.Module):
    """Fully connected network with a single output unit.

    The stack is: one ``Linear(ilen, width)`` block, then ``depth`` blocks of
    ``Linear(width, width)``, each followed by the activation, and finally a
    bare ``Linear(width, 1)``.

    Args:
        depth: number of ``width -> width`` hidden blocks after the input block.
        width: number of units in every hidden layer.
        ilen: number of input features.
        activation: activation module; the same instance is placed in every
            block, so an activation with parameters shares them across blocks.
        init_wt: in-place initialiser applied to the weight of every ``nn.Linear``.
    """

    def __init__(self, depth, width, ilen, activation=nn.ReLU(), init_wt=nn.init.xavier_normal_):
        super().__init__()
        self.layers = nn.ModuleList()
        self.activation_fn = activation
        self.initfn = init_wt

        # Input size of each activated block: the feature count first, then
        # `width` for each of the `depth` hidden blocks.
        block_inputs = [ilen] + [width] * depth
        for fan_in in block_inputs:
            self.layers.append(nn.Sequential(nn.Linear(fan_in, width), self.activation_fn))

        # Output head, no activation.
        self.layers.append(nn.Linear(width, 1))

        self.apply(weight_init(module=nn.Linear, initf=self.initfn))

    def forward(self, x):
        """Pass ``x`` through every stage in order and return the last output."""
        out = x
        for stage in self.layers:
            out = stage(out)
        return out

In [None]:
import os

from tqdm import tqdm

# Create the output folder up front so a missing directory is discovered
# before the training run instead of after it.
os.makedirs("submissions", exist_ok=True)

# Integer-coded features; the test set reuses the training statistics.
train_data = LDatasetTrain("data/train_final.csv")
train_loader = DataLoader(train_data, batch_size=1, shuffle=True)
test_data = LDatasetTest("data/test_final.csv", train_data.max_dict, train_data.max_id_dict)
test_loader = DataLoader(test_data, batch_size=1)

model = NeuralNtwrk(depth=5, width=40, ilen=train_data.X.shape[1], activation=nn.PReLU(), init_wt=nn.init.kaiming_normal_)
criterion = nn.MSELoss()
optim = torch.optim.Adam(model.parameters(), 1e-4, betas=(0.9, 0.999))

model.train()
for epoch in tqdm(range(100), desc="Epochs: "):
    loss_list = []
    for x, y in train_loader:
        optim.zero_grad()
        y_out = model(x)
        loss  = criterion(y_out, y)
        loss.backward()
        optim.step()
        # Keep plain floats: no tensors (or graph references) pile up per step.
        loss_list.append(loss.item())
    tqdm.write('Epoch - '+str(epoch)+": MSE="+str(np.mean(loss_list)))

tqdm.write('Running Test')
y_pred = []      # thresholded 0/1 predictions
y_pred_C = []    # raw network outputs
model.eval()
# Inference only: no autograd bookkeeping needed.
with torch.no_grad():
    for x, _ in test_loader:
        # batch_size=1 and a single output unit -> exactly one element.
        score = model(x).item()
        y_pred_C.append(score)
        y_pred.append(1 if score > 0.5 else 0)

out_results = pd.DataFrame({"ID":np.arange(1, len(y_pred)+1, dtype=int),"Prediction":y_pred})
out_results.to_csv("submissions/submit_nn1_a.csv", index=False)
out_results = pd.DataFrame({"ID":np.arange(1, len(y_pred_C)+1, dtype=int),"Prediction":y_pred_C})
out_results.to_csv("submissions/submit_nn1_b.csv", index=False)

In [None]:
import os

from tqdm import tqdm

# Create the output folder up front so a missing directory is discovered
# before the training run instead of after it.
os.makedirs("submissions", exist_ok=True)

# One-hot style features; the test set reuses the training statistics.
train_data = TDatasetTrain("data/train_final.csv")
train_loader = DataLoader(train_data, batch_size=1, shuffle=True)
test_data = TDatasetTest("data/test_final.csv", train_data.max_dict, train_data.max_id_dict)
test_loader = DataLoader(test_data, batch_size=1)

model = NeuralNtwrk(depth=3, width=120, ilen=train_data.X.shape[1], activation=nn.ReLU(), init_wt=nn.init.xavier_normal_)
criterion = nn.MSELoss()
optim = torch.optim.Adam(model.parameters(), 1e-4, betas=(0.9, 0.999))

model.train()
for epoch in tqdm(range(100), desc="Epochs: "):
    loss_list = []
    for x, y in train_loader:
        optim.zero_grad()
        y_out = model(x)
        loss  = criterion(y_out, y)
        loss.backward()
        optim.step()
        # Keep plain floats: no tensors (or graph references) pile up per step.
        loss_list.append(loss.item())
    tqdm.write('Epoch - '+str(epoch)+": MSE="+str(np.mean(loss_list)))

tqdm.write('Running Test')
y_pred = []      # thresholded 0/1 predictions
y_pred_C = []    # raw network outputs
model.eval()
# Inference only: no autograd bookkeeping needed.
with torch.no_grad():
    for x, _ in test_loader:
        # batch_size=1 and a single output unit -> exactly one element.
        score = model(x).item()
        y_pred_C.append(score)
        y_pred.append(1 if score > 0.5 else 0)

out_results = pd.DataFrame({"ID":np.arange(1, len(y_pred)+1, dtype=int),"Prediction":y_pred})
out_results.to_csv("submissions/submit_nn2_a.csv", index=False)
out_results = pd.DataFrame({"ID":np.arange(1, len(y_pred_C)+1, dtype=int),"Prediction":y_pred_C})
out_results.to_csv("submissions/submit_nn2_b.csv", index=False)

In [None]:
## SVM Classification
train_df = pd.read_csv('data/train_final.csv')
test_df = pd.read_csv('data/test_final.csv').iloc[:, 1:]


def _bin_numeric_columns(frame, reference):
    """Return a copy of ``frame`` with its numeric columns replaced by coarse categories.

    ``reference`` supplies the data-driven bin edges (medians); pass the
    training table for both the training and the test frame so they are binned
    identically. The top bin of every data-driven column is open-ended, so
    values larger than anything in ``reference`` still land in the last bin
    instead of becoming NaN.
    """
    binned = frame.copy()
    binned["age"] = pd.cut(frame["age"], bins=[0,2,17,65,99], labels=["baby","Child","Adult","Elderly"])
    # Below / above the training median.
    for column in ("fnlwgt", "education.num"):
        binned[column] = pd.cut(frame[column], bins=[-1, np.median(reference[column]), np.inf], labels=["0","1"])
    # Zero-or-one versus any larger amount.
    for column in ("capital.gain", "capital.loss"):
        binned[column] = pd.cut(frame[column], bins=[-1, 1, np.inf], labels=["0", "1"])
    binned["hours.per.week"] = pd.cut(frame["hours.per.week"], bins=[-1,20,40,60,np.inf], labels=["part","full","over","nosleep"])
    return binned


train_df_new = _bin_numeric_columns(train_df.iloc[:, :-1], train_df)
test_df_new = _bin_numeric_columns(test_df, train_df)

# Float indicators keep the design matrix purely numeric.
train_df_new_oh = pd.get_dummies(train_df_new, dtype=float)
test_df_new_oh = pd.get_dummies(test_df_new, dtype=float)

native_country = ["United-States", "Cambodia", "England", "Puerto-Rico", \
                  "Canada", "Germany", "Outlying-US(Guam-USVI-etc)", "India", \
                  "Japan", "Greece", "South", "China", "Cuba", "Iran", "Honduras",\
                  "Philippines", "Italy", "Poland", "Jamaica", "Vietnam", "Mexico",\
                  "Portugal", "Ireland", "France", "Dominican-Republic", "Laos", \
                  "Ecuador", "Taiwan", "Haiti", "Columbia", "Hungary", "Guatemala", \
                  "Nicaragua", "Scotland", "Thailand", "Yugoslavia", "El-Salvador", \
                  "Trinadad&Tobago", "Peru", "Hong", "Holand-Netherlands"]

# Add an all-zero column only for countries that never occur in the training
# data. The membership test must use the full one-hot column name; testing the
# bare country name is always true and would overwrite real indicators with zeros.
for nc in native_country:
    column = "native.country_"+nc
    if column not in train_df_new_oh.columns:
        train_df_new_oh[column] = 0.0

# Give the test matrix exactly the training columns, in the same order.
test_df_new_oh = test_df_new_oh.reindex(columns=train_df_new_oh.columns, fill_value=0.0)

X_train = train_df_new_oh.values
y_train = train_df.iloc[:, -1].values
X_test = test_df_new_oh.values

from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
out_results = pd.DataFrame({"ID":np.arange(1, len(y_pred)+1, dtype=int),"Prediction":y_pred})
out_results.to_csv("submit_svc.csv", index=False)

In [None]:
## SVM Regression with max occurrence for missing data
import os

train_df = pd.read_csv('data/train_final.csv')
test_df = pd.read_csv('data/test_final.csv').iloc[:, 1:]
train_df_new = train_df.iloc[:, :-1].copy()
test_df_new = test_df.copy()

categorical_attrib = {"workclass": ["Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov", \
                                          "Local-gov", "State-gov", "Without-pay", "Never-worked"],
                        "education": ["Bachelors", "Some-college", "11th", "HS-grad", "Prof-school", \
                                    "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters", \
                                    "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool"],
                        "marital.status": ["Married-civ-spouse", "Divorced", "Never-married", "Separated",\
                                        "Widowed", "Married-spouse-absent", "Married-AF-spouse"],
                        "occupation": ["Tech-support", "Craft-repair", "Other-service", "Sales", "Exec-managerial",\
                                    "Prof-specialty", "Handlers-cleaners", "Machine-op-inspct", "Adm-clerical", \
                                    "Farming-fishing", "Transport-moving", "Priv-house-serv", "Protective-serv",\
                                    "Armed-Forces"],
                        "relationship": ["Wife", "Own-child", "Husband", "Not-in-family", "Other-relative",\
                                        "Unmarried"],
                        "race": ["White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black"],
                        "sex": ["Female", "Male"],
                        "native.country":  ["United-States", "Cambodia", "England", "Puerto-Rico", \
                                        "Canada", "Germany", "Outlying-US(Guam-USVI-etc)", "India", \
                                        "Japan", "Greece", "South", "China", "Cuba", "Iran", "Honduras",\
                                        "Philippines", "Italy", "Poland", "Jamaica", "Vietnam", "Mexico",\
                                        "Portugal", "Ireland", "France", "Dominican-Republic", "Laos", \
                                        "Ecuador", "Taiwan", "Haiti", "Columbia", "Hungary", "Guatemala", \
                                        "Nicaragua", "Scotland", "Thailand", "Yugoslavia", "El-Salvador", \
                                        "Trinadad&Tobago", "Peru", "Hong", "Holand-Netherlands"]}

## Take care of missing attributes
# Replace "?" by the most frequent *known* training value of the column
# (the placeholder itself is excluded so it can never be chosen).
for key in categorical_attrib:
    known = train_df_new.loc[train_df_new[key] != "?", key]
    if known.empty:
        continue
    max_id = known.value_counts().idxmax()
    train_df_new[key] = train_df_new[key].replace("?", max_id)
    test_df_new[key] = test_df_new[key].replace("?", max_id)

# Float indicators keep the design matrix purely numeric.
train_df_new_oh = pd.get_dummies(train_df_new, dtype=float)
test_df_new_oh = pd.get_dummies(test_df_new, dtype=float)

# Add an all-zero column only for countries that never occur in the training
# data. The membership test must use the full one-hot column name; testing the
# bare country name is always true and would overwrite real indicators with zeros.
for nc in categorical_attrib['native.country']:
    column = "native.country_"+nc
    if column not in train_df_new_oh.columns:
        train_df_new_oh[column] = 0.0

# Give the test matrix exactly the training columns, in the same order.
test_df_new_oh = test_df_new_oh.reindex(columns=train_df_new_oh.columns, fill_value=0.0)

X_train = train_df_new_oh.values
y_train = train_df.iloc[:, -1].values
X_test = test_df_new_oh.values

from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
regr = make_pipeline(StandardScaler(), SVR(C=2.0, epsilon=0.2))
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
# Threshold the regression output into a 0/1 label.
y_pred = np.where(y_pred<0.5,0,1)
out_results = pd.DataFrame({"ID":np.arange(1, len(y_pred)+1, dtype=int),"Prediction":y_pred})
os.makedirs("submissions", exist_ok=True)
out_results.to_csv("submissions/submit_svr1.csv", index=False)
from sklearn.metrics import accuracy_score
y_pred_train = regr.predict(X_train)
y_pred_train = np.where(y_pred_train<0.5,0,1)
print("Train accuracy: ", accuracy_score(y_train, y_pred_train))

In [None]:
## SVM Regression with max occurrence for specific output in missing data
import os

train_df = pd.read_csv('data/train_final.csv')
test_df = pd.read_csv('data/test_final.csv').iloc[:, 1:]

categorical_attrib = {"workclass": ["Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov", \
                                          "Local-gov", "State-gov", "Without-pay", "Never-worked"],
                        "education": ["Bachelors", "Some-college", "11th", "HS-grad", "Prof-school", \
                                    "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters", \
                                    "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool"],
                        "marital.status": ["Married-civ-spouse", "Divorced", "Never-married", "Separated",\
                                        "Widowed", "Married-spouse-absent", "Married-AF-spouse"],
                        "occupation": ["Tech-support", "Craft-repair", "Other-service", "Sales", "Exec-managerial",\
                                    "Prof-specialty", "Handlers-cleaners", "Machine-op-inspct", "Adm-clerical", \
                                    "Farming-fishing", "Transport-moving", "Priv-house-serv", "Protective-serv",\
                                    "Armed-Forces"],
                        "relationship": ["Wife", "Own-child", "Husband", "Not-in-family", "Other-relative",\
                                        "Unmarried"],
                        "race": ["White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black"],
                        "sex": ["Female", "Male"],
                        "native.country":  ["United-States", "Cambodia", "England", "Puerto-Rico", \
                                        "Canada", "Germany", "Outlying-US(Guam-USVI-etc)", "India", \
                                        "Japan", "Greece", "South", "China", "Cuba", "Iran", "Honduras",\
                                        "Philippines", "Italy", "Poland", "Jamaica", "Vietnam", "Mexico",\
                                        "Portugal", "Ireland", "France", "Dominican-Republic", "Laos", \
                                        "Ecuador", "Taiwan", "Haiti", "Columbia", "Hungary", "Guatemala", \
                                        "Nicaragua", "Scotland", "Thailand", "Yugoslavia", "El-Salvador", \
                                        "Trinadad&Tobago", "Peru", "Hong", "Holand-Netherlands"]}

## Take care of missing attributes
# Training rows: "?" becomes the most frequent known value among training rows
# with the same label. Test rows carry no label, so they get the most frequent
# known value over the whole training set (the earlier row-by-row loop borrowed
# the label of the training row that happened to share the index, and never
# touched test rows past the length of the training table).
label_col = 'income>50K'
for key in categorical_attrib:
    missing = train_df[key] == "?"
    known = train_df.loc[~missing]
    if known.empty:
        continue
    overall_mode = known[key].value_counts().idxmax()
    test_df[key] = test_df[key].replace("?", overall_mode)
    if missing.any():
        class_mode = known.groupby(label_col)[key].agg(lambda s: s.value_counts().idxmax())
        fill = train_df.loc[missing, label_col].map(class_mode)
        # A label with no known value for this column falls back to the overall mode.
        train_df.loc[missing, key] = fill.fillna(overall_mode)

train_df_new = train_df.iloc[:, :-1].copy()
test_df_new = test_df.copy()
# Float indicators: dividing bool columns by a bool maximum is not supported by pandas.
train_df_new_oh = pd.get_dummies(train_df_new, dtype=float)
test_df_new_oh = pd.get_dummies(test_df_new, dtype=float)

# Add an all-zero column only for countries that never occur in the training
# data. The membership test must use the full one-hot column name; testing the
# bare country name is always true and would overwrite real indicators with zeros.
for nc in categorical_attrib['native.country']:
    column = "native.country_"+nc
    if column not in train_df_new_oh.columns:
        train_df_new_oh[column] = 0.0

# Give the test matrix exactly the training columns, in the same order.
test_df_new_oh = test_df_new_oh.reindex(columns=train_df_new_oh.columns, fill_value=0.0)

# Divide every column by its training maximum (a maximum of 0 is treated as 1).
col_max = train_df_new_oh.max()
col_max = col_max.where(col_max != 0, 1)
max_dict = col_max.to_dict()
train_df_new_oh = train_df_new_oh / col_max
test_df_new_oh = test_df_new_oh / col_max

X_train = train_df_new_oh.values
y_train = train_df.iloc[:, -1].values
X_test = test_df_new_oh.values

print("Preprocessing Done")
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
regr = make_pipeline(StandardScaler(), SVR(C=2.0, epsilon=0.2))
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
# Threshold the regression output into a 0/1 label.
y_pred = np.where(y_pred<=0.5,0,1)
out_results = pd.DataFrame({"ID":np.arange(1, len(y_pred)+1, dtype=int),"Prediction":y_pred})
os.makedirs("submissions", exist_ok=True)
out_results.to_csv("submissions/submit_svr2.csv", index=False)
from sklearn.metrics import accuracy_score
y_pred_train = regr.predict(X_train)
y_pred_train = np.where(y_pred_train<=0.5,0,1)
print("Train accuracy: ", accuracy_score(y_train, y_pred_train))

In [None]:
## SVM Regression with all instances for missing data
# Reload both CSV files; the first column of the test file is dropped.
train_df = pd.read_csv('data/train_final.csv')
test_df = pd.read_csv('data/test_final.csv').iloc[:, 1:]

# Categorical attribute name -> list of values considered for that attribute.
categorical_attrib = {
    "workclass": [
        "Private", "Self-emp-not-inc", "Self-emp-inc", "Federal-gov",
        "Local-gov", "State-gov", "Without-pay", "Never-worked",
    ],
    "education": [
        "Bachelors", "Some-college", "11th", "HS-grad", "Prof-school",
        "Assoc-acdm", "Assoc-voc", "9th", "7th-8th", "12th", "Masters",
        "1st-4th", "10th", "Doctorate", "5th-6th", "Preschool",
    ],
    "marital.status": [
        "Married-civ-spouse", "Divorced", "Never-married", "Separated",
        "Widowed", "Married-spouse-absent", "Married-AF-spouse",
    ],
    "occupation": [
        "Tech-support", "Craft-repair", "Other-service", "Sales",
        "Exec-managerial", "Prof-specialty", "Handlers-cleaners",
        "Machine-op-inspct", "Adm-clerical", "Farming-fishing",
        "Transport-moving", "Priv-house-serv", "Protective-serv",
        "Armed-Forces",
    ],
    "relationship": [
        "Wife", "Own-child", "Husband", "Not-in-family", "Other-relative",
        "Unmarried",
    ],
    "race": ["White", "Asian-Pac-Islander", "Amer-Indian-Eskimo", "Other", "Black"],
    "sex": ["Female", "Male"],
    "native.country": [
        "United-States", "Cambodia", "England", "Puerto-Rico",
        "Canada", "Germany", "Outlying-US(Guam-USVI-etc)", "India",
        "Japan", "Greece", "South", "China", "Cuba", "Iran", "Honduras",
        "Philippines", "Italy", "Poland", "Jamaica", "Vietnam", "Mexico",
        "Portugal", "Ireland", "France", "Dominican-Republic", "Laos",
        "Ecuador", "Taiwan", "Haiti", "Columbia", "Hungary", "Guatemala",
        "Nicaragua", "Scotland", "Thailand", "Yugoslavia", "El-Salvador",
        "Trinadad&Tobago", "Peru", "Hong", "Holand-Netherlands",
    ],
}

## Take care of missing attributes
# Missing categorical values are stored as the literal string "?".

# Test rows: fill a missing value with the most frequent observed training
# value of that attribute. The test frame is used as features only, so there
# is no 'income>50K' value on a test row to condition on. The modes are taken
# before the training rows are expanded so duplicated rows cannot skew them.
for key in categorical_attrib:
    observed = train_df.loc[train_df[key] != '?', key]
    if observed.empty:
        # Nothing observed to impute from; leave this attribute untouched.
        continue
    max_id = observed.value_counts().idxmax()
    test_df.loc[test_df[key] == "?", key] = max_id

# Training rows: replace every row that misses an attribute by one copy per
# candidate value of that attribute. Attributes are handled one after another,
# so a row missing several attributes ends up as every combination of their
# candidate values. pd.concat returns a new frame, so the result is assigned
# back to train_df; ignore_index gives the copies fresh, unique row labels.
# NOTE: this multiplies the number of training rows and can make the model fit
# that follows considerably slower.
for key, value in categorical_attrib.items():
    missing = train_df[key] == '?'
    if not missing.any():
        continue
    incomplete = train_df.loc[missing]
    expanded = [incomplete.assign(**{key: v}) for v in value]
    train_df = pd.concat([train_df.loc[~missing]] + expanded, ignore_index=True)

# One-hot encode the features. The last training column is the target and is
# left out; the test frame is used as-is. dtype=float makes the indicator
# columns numeric regardless of the pandas default.
train_df_new = train_df.iloc[:, :-1].copy()
test_df_new = test_df.copy()

train_df_new_oh = pd.get_dummies(train_df_new, dtype=float)
test_df_new_oh = pd.get_dummies(test_df_new, dtype=float)

# Guarantee one indicator column per known country. The membership test has to
# use the prefixed dummy name: the bare country name never matches a column,
# which would overwrite every existing country indicator with zeros.
for nc in categorical_attrib['native.country']:
    country_col = "native.country_" + nc
    if country_col not in train_df_new_oh.columns:
        train_df_new_oh[country_col] = np.zeros(train_df_new_oh.shape[0])

# Give the test matrix exactly the training columns, in the same order.
# Categories that never occur in the test data become all-zero columns.
test_df_new_oh = test_df_new_oh.reindex(columns=train_df_new_oh.columns, fill_value=0.0)

X_train = train_df_new_oh.values
y_train = train_df.iloc[:, -1].values
X_test = test_df_new_oh.values

print("Preprocessing Done")
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Standardize the features, then fit a support-vector regressor on the target.
regr = make_pipeline(
    StandardScaler(),
    SVR(C=1.0, epsilon=0.1),
)
regr.fit(X_train, y_train)

# Turn the continuous outputs into class labels: values <= 0.5 become 0,
# everything else becomes 1.
y_pred = np.where(regr.predict(X_test) <= 0.5, 0, 1)

# Submission file with 1-based row IDs.
out_results = pd.DataFrame(
    {
        "ID": np.arange(1, len(y_pred) + 1, dtype=int),
        "Prediction": y_pred,
    }
)
out_results.to_csv("submissions/submit_svr3.csv", index=False)

# Accuracy on the data the model was fitted on, using the same threshold.
y_pred_train = np.where(regr.predict(X_train) <= 0.5, 0, 1)
print("Train accuracy: ", accuracy_score(y_train, y_pred_train))

In [48]:
## SVM Regression
# Reload both CSV files; the first column of the test file is dropped.
train_df = pd.read_csv('data/train_final.csv')
test_df = pd.read_csv('data/test_final.csv').iloc[:, 1:]

# The last training column is the target and is left out of the features.
train_df_new = train_df.iloc[:, :-1].copy()
test_df_new = test_df.copy()

# One-hot encode; dtype=float makes the indicator columns numeric regardless
# of the pandas default.
train_df_new_oh = pd.get_dummies(train_df_new, dtype=float)
test_df_new_oh = pd.get_dummies(test_df_new, dtype=float)

native_country = ["United-States", "Cambodia", "England", "Puerto-Rico", \
                  "Canada", "Germany", "Outlying-US(Guam-USVI-etc)", "India", \
                  "Japan", "Greece", "South", "China", "Cuba", "Iran", "Honduras",\
                  "Philippines", "Italy", "Poland", "Jamaica", "Vietnam", "Mexico",\
                  "Portugal", "Ireland", "France", "Dominican-Republic", "Laos", \
                  "Ecuador", "Taiwan", "Haiti", "Columbia", "Hungary", "Guatemala", \
                  "Nicaragua", "Scotland", "Thailand", "Yugoslavia", "El-Salvador", \
                  "Trinadad&Tobago", "Peru", "Hong", "Holand-Netherlands"]

# Guarantee one indicator column per known country. The membership test has to
# use the prefixed dummy name: the bare country name never matches a column,
# which would overwrite every existing country indicator with zeros.
for nc in native_country:
    country_col = "native.country_" + nc
    if country_col not in train_df_new_oh.columns:
        train_df_new_oh[country_col] = np.zeros(train_df_new_oh.shape[0])

# Give the test matrix exactly the training columns, in the same order.
# Categories that never occur in the test data become all-zero columns.
test_df_new_oh = test_df_new_oh.reindex(columns=train_df_new_oh.columns, fill_value=0.0)

X_train = train_df_new_oh.values
y_train = train_df.iloc[:, -1].values
X_test = test_df_new_oh.values

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Standardize the features, then fit a degree-5 polynomial-kernel
# support-vector regressor on the target.
regr = make_pipeline(
    StandardScaler(),
    SVR(kernel='poly', degree=5, gamma='auto', C=1.5, epsilon=0.3),
)
regr.fit(X_train, y_train)

# Turn the continuous outputs into class labels: values < 0.5 become 0,
# everything else becomes 1.
y_pred = np.where(regr.predict(X_test) < 0.5, 0, 1)

# Submission file with 1-based row IDs.
out_results = pd.DataFrame(
    {
        "ID": np.arange(1, len(y_pred) + 1, dtype=int),
        "Prediction": y_pred,
    }
)
out_results.to_csv("submit_svr_poly.csv", index=False)