In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
class standardScaler():
    def fit(self, xss):
        self.mean = np.mean(xss, axis=0)
        self.sd = np.std(xss, axis=0)

    def transform(self, xss):
        xss = (xss-self.mean)/(self.sd)
        return(xss)

In [6]:
def sigmoid(predict):
    return(1/(1 + np.exp(-predict)))

def loss(y, yhat):
    return -np.mean(y*np.log(yhat)) + (1-y)*np.log(1-yhat)

In [7]:
class Scaler():
    def fit(self, xss, dummy_columns, numeric_columns):
        
        self.numeric_columns = numeric_columns
        self.dummy_columns = dummy_columns
        
        ## process numeric matrix
        numeric_vector = xss.iloc[:,numeric_columns]
        self.mean = np.mean(numeric_vector)
        self.std = np.std(numeric_vector)
        
        ## process dummy matrix
        dummy_vector = xss.iloc[:,dummy_columns]
        self.proportion = np.mean(dummy_vector)
        
    def transform_dummy(self,xs,proportion):
        trans_xs = [1 - proportion if x == 1 else proportion for x in xs]
        return trans_xs
        
    def transform_numeric(self,xs, mean, std):
        xs = (xs-mean)/std
        return(xs)

    def transform(self, xss):
        row, col = xss.shape
        df = []
        for c in range(col):
            xs = xss.iloc[:,c]
            if c in self.dummy_columns:
                dff = xs
            else:
                idx = self.numeric_columns.index(c)
                mean = self.mean[idx]
                std = self.std[idx]
                dff = self.transform_numeric(xs,mean,std)
                
            df.append(dff)
        df = (np.column_stack(df))
        return(pd.DataFrame(df))

In [None]:
# from sklearn.feature_selection import SelectKBest
# from sklearn.feature_selection import chi2
# from sklearn.preprocessing import MinMaxScaler

# X_train = pd.DataFrame(X_train)
# X_norm = X_train

# # X_norm = MinMaxScaler().fit_transform(X_train)
# chi_selector = SelectKBest(chi2, k=89)
# chi_selector.fit(X_norm, Y_train)
# chi_support = chi_selector.get_support()
# chi_feature = X_train.loc[:,chi_support].columns.tolist()
# X_train = X_train[chi_feature]
# print(str(len(chi_feature)), 'selected features')

In [12]:
def get_dummy_id(X_train):
    df = X_train.describe().T
    row, col = df.shape
    dummy_columns = []
    numeric_columns = []
    for r in range(row):
        if df['max'][r] <= 1 and df['min'][r] >= 0:
            dummy_columns.append(r)
        else:
            numeric_columns.append(r)
    
    return({'dummy_columns': dummy_columns,
            'numeric_columns': numeric_columns})

In [13]:
from sklearn.preprocessing import FunctionTransformer
data = pd.read_csv('data/train.csv')

X_train = pd.get_dummies(data.iloc[:,:-1], dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=None)
Y_train = pd.get_dummies(data.iloc[:,-1], dummy_na=False, columns=None, sparse=False, drop_first=True, dtype=None)
# X_train = X_train.drop(missing_cols, axis=1)
X_train_col = X_train.columns

res = get_dummy_id(X_train)
dummy_columns = res['dummy_columns']
numeric_columns = res['numeric_columns']

scaler = Scaler()
scaler.fit(X_train, dummy_columns, numeric_columns)
X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)


# ## tranform the data
# transformer = FunctionTransformer(np.log1p, validate=True)
# X_train = transformer.transform(X_train)
# scaler = MinMaxScaler()
# X_train = scaler.fit_transform(X_train)

In [14]:
from IPython.display import clear_output

def gradientDescent(xss, yss, lr = 1, iteration = 200, alpha = 0):
    max_iter =  10 ** 3
    epochs =  30
    
    ## get bias
    xss = np.column_stack(([1] * len(xss) ,xss))
        
    num = xss.shape[1]
    w = np.zeros(num)
    w_lr = np.zeros(num)

    for t in range(epochs):
        w_grad = None
        for m in range(max_iter):
            predict = np.dot(xss,w)
            predict = sigmoid(predict)
            w_grad = np.dot(xss.T, predict-yss)
            w -= lr * w_grad / len(w)
    
        predict = np.array([0  if predict[i] < 0.5 else 1 for i in range(len(predict))])
        accur = np.mean(predict == yss)
        clear_output()

        print(t)
        print(predict)
        print(accur)
    res = {'w': w, 'accuracy': accur}
    return (res)

In [None]:
from sklearn.model_selection import train_test_split

his_train = []
his_test = []
features = []
w_train = []
lr_list = np.logspace(-6,-2, 5)

x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.25, random_state=0)
for lr in lr_list:
    res = gradientDescent(x_train,y_train.values.ravel(),lr, 300)
    w = res['w']
    accur = res['accuracy']
    w_train.append(w)
    his_train.append(accur)
    
    predict = np.dot(np.column_stack(([1] * len(x_test) , x_test)),w)
    predict = sigmoid(predict)
    predict = np.array([0  if predict[i] < 0.5 else 1 for i in range(len(predict))])
    accur = np.mean(predict == y_test.values.ravel())
    his_test.append(accur)
    print(his_test)

In [15]:
res = gradientDescent(X_train,Y_train.values.ravel(),10**-2, 30)
w = res['w']

29
[0 0 0 ... 0 0 1]
0.8533828813611376


In [25]:
outputFile = '0311-3'

In [33]:
X_test = pd.read_csv('data/test.csv')
X_test = pd.get_dummies(X_test, dummy_na=False, columns=None, sparse=False, drop_first=True, dtype=None)

missing_cols = set(X_train_col ) - set(X_test.columns)
print(missing_cols)

# Add a missing column in test set with default value equal to 0
for c in missing_cols:
    X_test[c] = 0
    
# Ensure the order of column in the test set is in the same order than in train set
X_test = X_test[X_train_col]
X_test = np.array(X_test)
# ## tranform the data
# transformer = FunctionTransformer(np.log1p, validate=True)
# X_test = transformer.transform(X_test)
# X_test = scaler.transform(X_test)

{'relationship_ Husband', 'native_country_ Holand-Netherlands', 'sex_ Female', 'marital_status_ Divorced', 'occupation_ ?', 'native_country_ ?', 'workclass_ ?', 'race_ Amer-Indian-Eskimo', 'education_ 10th'}


In [34]:
## save model
np.save('model/'+ outputFile + '.npy',{'w': w, 'scaler': scaler})

## output testing data
row , col = X_test.shape
ans = []
for i in range((row)):
    x = X_test[i]
    x = x.reshape(-1,1)
#     x = transformer.transform(x)
#     x = MinMaxScaler().fit_transform(x)

    x = np.concatenate(([[1]] ,x)).ravel()
    predict = np.dot(x,w)
    predict = sigmoid(predict)
    predict = 0 if predict < 0.5 else 1
    val = predict
    ans.append([i+1 ,val])

ans = pd.DataFrame(ans,columns=['id', 'label'])
ans.to_csv('data/'+outputFile+'.csv', index=False)

NameError: name 'transformer' is not defined

In [24]:
lab3 = pd.read_csv('data/0311-1.csv')['label']
lab2 = pd.read_csv('data/0310-3.csv')['label']
# print(np.sum(lab1))
print(np.sum(lab2))
print(np.sum(lab3))

FileNotFoundError: File b'data/0310-3.csv' does not exist

In [None]:
outputFile = 'vote'
total_lab = (lab1 + lab2 + lab3).values
vote = [0 if total_lab[i] <= 2 else 1 for i in total_lab]
ans.to_csv('data/'+outputFile+'.csv', index=False)

In [None]:
# print(X_train.columns)