In [50]:
from pathlib import Path

import numpy as np
import pandas as pd

# Let pandas display up to 100 columns before truncating a rendered frame.
pd.set_option("display.max_columns", 100)

# Directory holding the CSV inputs/outputs: a "data_mining_project" folder
# located in the parent of the kernel's current working directory.
DATA_PATH = Path.cwd().parent / "data_mining_project" 

from sklearn.impute import MissingIndicator
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# NOTE(review): no visible cell references RANDOM_SEED -- the train/eval split
# passes random_state=14 and the MLP settings use random_state=0. Confirm
# whether those were meant to use this constant.
RANDOM_SEED = 6    # Set a random seed for reproducibility!

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
import eli5
from sklearn.preprocessing import PolynomialFeatures
import math

In [65]:
def formatFeatures(dataFrame, datatype = 0):
    """Turn a raw survey feature frame into a scaled, row-normalised numeric frame.

    Steps, in order:

    1. Drop rows that have no non-missing value, then treat ``999.0`` as missing.
    2. Build boolean missing-value indicator columns (``ind0``, ``ind1``, ...).
    3. Impute numeric columns with the median and object columns with the
       imputer's constant fill value (``"missing_value"``).
    4. Add interaction terms (``poly0``, ``poly1``, ...) of ``h1n1_concern``,
       ``doctor_recc_h1n1`` and ``opinion_h1n1_vacc_effective``.
    5. Encode ``education`` and ``income_poverty`` as ordered integer codes.
    6. One-hot encode the remaining object columns (``nominal0``, ...).
    7. Standardise every column, then scale every row to unit L2 norm.

    Every transformer is fit on the frame passed in, so two separate calls
    (e.g. one for training data and one for test data) learn their own
    statistics and category sets. Generated column counts follow the data;
    callers that need identical columns across frames must verify that
    themselves.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Raw features. Must contain the columns named in steps 4 and 5.
        The frame is not modified.
    datatype : int, optional
        Accepted for backward compatibility and ignored. It used to select a
        hard-coded index offset for the row normalisation, which is now
        computed positionally and needs no offset.

    Returns
    -------
    pandas.DataFrame
        Float frame indexed like the surviving input rows.
    """
    # Chain non-inplace operations so the caller's frame is left untouched.
    # Order matters: rows are dropped *before* 999.0 becomes NaN.
    df = dataFrame.dropna(axis=0, thresh=1).replace({999.0: np.nan})

    # Flags are produced only for columns that actually contain missing
    # values, so the number of indicator columns depends on the data.
    missing_flags = MissingIndicator(missing_values=np.nan).fit_transform(df)
    indicator = pd.DataFrame(
        missing_flags,
        columns=["ind" + str(i) for i in range(missing_flags.shape[1])],
        index=df.index,
    )

    is_object = df.dtypes == "object"
    numeric_cols = df.columns[~is_object].values
    object_cols = df.columns[is_object].values
    imp_numerical = SimpleImputer(strategy='median', missing_values=np.nan)
    imp_catagorical = SimpleImputer(strategy='constant', missing_values=np.nan)
    df[numeric_cols] = imp_numerical.fit_transform(df[numeric_cols])
    df[object_cols] = imp_catagorical.fit_transform(df[object_cols])

    poly = PolynomialFeatures(degree=3, interaction_only=True)
    poly_values = poly.fit_transform(
        df[['h1n1_concern', 'doctor_recc_h1n1', 'opinion_h1n1_vacc_effective']]
    )
    polynomials = pd.DataFrame(
        poly_values,
        columns=["poly" + str(i) for i in range(poly_values.shape[1])],
        index=df.index,
    )
    df = pd.concat([df, indicator, polynomials], axis=1)

    category_map = {'education': [ 'missing_value','< 12 Years', '12 Years', 'Some College','College Graduate'], 
                    'income_poverty': ['missing_value', 'Below Poverty', '<= $75,000, Above Poverty', '> $75,000']}
    for column, ordered_levels in category_map.items():
        # Use the categorical codes directly: they are the position of each
        # level in ``ordered_levels`` (-1 for a value outside the list), so a
        # level keeps the same integer even when another level is absent.
        df[column] = pd.Categorical(df[column], categories=ordered_levels, ordered=True).codes

    # ``sparse``/``sparse_output`` is left at its default (sparse) so the call
    # works regardless of which spelling the installed scikit-learn accepts.
    onehot = OneHotEncoder(dtype=int, handle_unknown='ignore')
    current_cat_col = df.columns[df.dtypes == "object"].values
    onehot_values = onehot.fit_transform(df[current_cat_col]).toarray()
    nominals = pd.DataFrame(
        onehot_values,
        columns=["nominal" + str(i) for i in range(onehot_values.shape[1])],
        index=df.index,
    )
    df = pd.concat([df.drop(current_cat_col, axis=1), nominals], axis=1)

    # Standardise all columns in one pass (each column is scaled independently).
    scaled = StandardScaler().fit_transform(df.astype(np.float64))

    # Per-row Euclidean length, computed positionally so it does not depend
    # on what the index labels happen to be.
    row_norms = np.linalg.norm(scaled, axis=1)
    print(len(row_norms))
    print(df.shape)

    # An all-zero row has norm 0; divide it by 1 so it stays zero instead of
    # turning into NaN.
    safe_norms = np.where(row_norms == 0, 1.0, row_norms)
    return pd.DataFrame(scaled / safe_norms[:, np.newaxis], columns=df.columns, index=df.index)

In [66]:
# Load the training features and labels; both are indexed by respondent_id.
features_dfx = pd.read_csv(
    DATA_PATH / "training_set_features.csv", 
    index_col="respondent_id"
)
labels_dfx = pd.read_csv(
    DATA_PATH / "training_set_labels.csv", 
    index_col="respondent_id"
)
features_dfx = formatFeatures(features_dfx)

# formatFeatures drops rows that contain no non-missing value. Select the
# labels by the surviving feature index so features and labels always have
# the same rows in the same order before splitting.
labels_dfx = labels_dfx.loc[features_dfx.index]

# Stratified hold-out split on the h1n1_vaccine target (33% for evaluation).
X_train, X_eval, y_train, y_eval = train_test_split(
    features_dfx,
    labels_dfx['h1n1_vaccine'],
    test_size=0.33,
    shuffle=True,
    stratify=labels_dfx['h1n1_vaccine'],
    random_state=14
)




26707
(26707, 143)


In [None]:
# Model selection scratch cell. `estimators` is assigned three times below;
# each assignment replaces the previous one, so the XGBClassifier built last
# is the object left bound to `estimators` when this cell finishes.

# Disabled experiments: multi-output wrappers around an MLP and an SVC.
#estimators = MultiOutputClassifier(
 #   estimator=MLPClassifier(solver='lbfgs', alpha=.55, hidden_layer_sizes=(4,2 ), random_state=1, max_iter=10000000)
#)
#estimators = MultiOutputClassifier(
 #   estimator=SVC(gamma='auto', probability=True),
#)
# Candidate 1: L2-regularised logistic regression (overwritten below).
estimators =LogisticRegression(penalty="l2", C=1,solver='lbfgs',max_iter=1000000)

# Candidate 2: small MLP; its settings are kept as notebook-level names.
solver = 'lbfgs'
alpha = 32
hidden_layer_sizes=(3,2)
random_state=0
max_iter= 10000000

activation='tanh'
# Overwritten below by the XGBoost model.
estimators=MLPClassifier(solver=solver, alpha=alpha, hidden_layer_sizes=hidden_layer_sizes, random_state=random_state,
                         max_iter=max_iter, activation=activation)

# Candidate 3: gradient-boosted trees via XGBoost. This is the final binding.
min_child_weight=0
max_depth=3
gamma=1.3
colsample_bytree=0.8
subsample=0.8
n_estimators=160
estimators = XGBClassifier(min_child_weight=min_child_weight,max_depth=max_depth, 
                           gamma=gamma,colsample_bytree=colsample_bytree,subsample=subsample,
                           n_estimators=n_estimators
                          )
# Disabled experiments: AdaBoost / GradientBoosting base models and a bagging
# ensemble over them.
#estimatorsX = AdaBoostClassifier(n_estimators=10, random_state=0, learning_rate = 1, 
 #                               algorithm='SAMME.R')

#estimatorsX = GradientBoostingClassifier(n_estimators=165, learning_rate=0.1, max_features=10, max_depth=6, random_state=11)

#estimators = BaggingClassifier(base_estimator=estimatorsX,n_estimators=50)

In [67]:
# Train the selected model on the training split; the trailing semicolon
# keeps the fitted estimator's repr out of the cell output.
estimators.fit(X_train, y_train);

In [68]:
# Predicted class probabilities for the held-out rows, kept on the same index
# as the evaluation labels.
preds = estimators.predict_proba(X_eval)
y_preds = pd.DataFrame(preds, index=y_eval.index)

# ROC AUC of the probabilities in column 1 against the evaluation labels.
print(roc_auc_score(y_eval, y_preds[1]))


0.8624704078529944


In [69]:
# Read the test features (indexed by respondent_id) and push them through the
# same preprocessing function used for the training data.
test_features_dfx = formatFeatures(
    pd.read_csv(DATA_PATH / "test_set_features.csv", index_col="respondent_id"),
    1,
)

# Class probabilities from the fitted model, one row per test respondent.
test_probas = estimators.predict_proba(test_features_dfx)

test_probas

26708
(26708, 143)


array([[0.852651  , 0.14734901],
       [0.9808068 , 0.0191932 ],
       [0.8407397 , 0.1592603 ],
       ...,
       [0.8158726 , 0.18412738],
       [0.980899  , 0.01910101],
       [0.40992522, 0.5900748 ]], dtype=float32)

In [72]:
# Wrap the probability array in a frame that shares the test features' index.
test_preds = pd.DataFrame(test_probas, index=test_features_dfx.index)

In [78]:
# Load the submission template, indexed by respondent_id.
submission_df = pd.read_csv(DATA_PATH / "submission_format.csv", 
                            index_col="respondent_id")

In [79]:
# Preview the template before any predictions are written into it.
submission_df.head()

Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
26707,0.5,0.7
26708,0.5,0.7
26709,0.5,0.7
26710,0.5,0.7
26711,0.5,0.7


In [80]:
# Make sure we have the rows in the same order
# (raises AssertionError if the two indexes differ in any position).
np.testing.assert_array_equal(test_features_dfx.index.values, 
                              submission_df.index.values)

# Save predictions to submission data frame
# Only h1n1_vaccine is overwritten, using column 1 of the predicted
# probabilities; seasonal_vaccine keeps the value read from the template.
submission_df["h1n1_vaccine"] = test_preds[1]

submission_df.head()

Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
26707,0.147349,0.7
26708,0.019193,0.7
26709,0.15926,0.7
26710,0.66703,0.7
26711,0.278887,0.7


In [81]:
# Write the filled-in submission next to the input data, keeping the
# respondent_id index as the first column.
submission_df.to_csv(DATA_PATH / 'my_submission.csv', index=True)