In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os



訓練データとテストデータの読み込み

In [None]:
# Load the training and test sets from the data directory next to the notebook.
train = pd.read_csv(filepath_or_buffer="../data/train.csv")
test = pd.read_csv(filepath_or_buffer="../data/test.csv")

In [None]:
# Bare last expression: the notebook renders the freshly loaded training frame as this cell's output.
train

In [None]:
# Bare last expression: the notebook renders the freshly loaded test frame as this cell's output.
test

In [None]:
# Check for missing values: each line prints True when the frame contains no nulls at all.
print(not train.isna().values.any())
print(not test.isna().values.any())



In [None]:
# Inspect the dtype of every column in the training frame
print(train.dtypes)

In [None]:
# Columns treated as categorical; their distinct values are listed in the next cells.
categorical_features = [
    "workclass",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]

In [None]:
def get_uniques(df, columns):
    """Collect the distinct values of each requested column.

    Args:
        df: DataFrame to inspect.
        columns: Iterable of column names present in ``df``.

    Returns:
        dict mapping each column name to a list of its unique values, in the
        order ``Series.unique()`` reports them.
    """
    return {name: list(df[name].unique()) for name in columns}


In [None]:
# Show the distinct values of every categorical column in the training frame.
get_uniques(train, categorical_features)

In [None]:
# Split the categorical columns into those that get label encoding
# and those that get one-hot encoding.
label_encoding_features = ["sex"]
one_hot_encoding_features = [
    "workclass",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "native-country",
]


In [None]:
from sklearn import preprocessing
def label_encoder(df, columns):
    """Return a copy of ``df`` with the given columns label-encoded.

    Each listed column is replaced by the integer codes produced by
    ``sklearn.preprocessing.LabelEncoder.fit_transform``. The encoder is fitted
    on the column it transforms, so codes come from this frame only.

    Args:
        df: Input DataFrame. It is not modified; the work happens on a copy so
            that re-running the calling cell or reusing the raw frame is safe.
        columns: Iterable of column names to encode.

    Returns:
        A new DataFrame with the same columns as ``df`` where every column in
        ``columns`` holds integer labels.
    """
    encoded = df.copy()
    for column in columns:
        # Use a fresh encoder per column (and a name that does not shadow this function).
        encoder = preprocessing.LabelEncoder()
        encoded[column] = encoder.fit_transform(encoded[column])
    return encoded


def onehot_encoder(df, columns):
    """Return ``df`` with the given columns replaced by one-hot indicator columns.

    For every column in ``columns`` the indicator columns produced by
    ``pd.get_dummies`` are appended to the frame and the source column is
    dropped. Indicator columns are named ``"<column>_<value>"``: without the
    prefix, a category label shared by two source columns would produce
    duplicate column names that cannot be told apart afterwards.

    Args:
        df: Input DataFrame (not modified; a new frame is returned).
        columns: Iterable of column names to one-hot encode.

    Returns:
        A new DataFrame containing the untouched columns followed by the
        indicator columns, grouped per source column in the order given.
    """
    for column in columns:
        dummies = pd.get_dummies(df[column], prefix=column)
        df = pd.concat([df.drop(column, axis=1), dummies], axis=1)
    return df


In [None]:
# Encode the categorical columns of both frames with the same helpers.
train = label_encoder(train, label_encoding_features)
train = onehot_encoder(train, one_hot_encoding_features)

test = label_encoder(test, label_encoding_features)
test = onehot_encoder(test, one_hot_encoding_features)

# The two frames are one-hot encoded independently, so a category that occurs in
# only one of them yields a column the other frame lacks. Align test to train's
# feature columns (everything except the target 'Y'): categories missing from
# test become all-zero columns, and columns train never saw are dropped.
test = test.reindex(
    columns=[name for name in train.columns if name != 'Y'],
    fill_value=0,
)


In [None]:
# Bare last expression: render the training frame again, now after the encoding cell above.
train


In [None]:
# Bare last expression: render the test frame again, now after the encoding cell above.
test


In [None]:
#クロスバリデーションで評価
from sklearn.model_selection import KFold
# 5-fold splitter with shuffling and a fixed seed; the loop only previews
# which row positions land in each fold's training and validation parts.
kf = KFold(n_splits=5, shuffle=True, random_state=1000)
for train_indices, val_indices in kf.split(train):
    print(f"train_indices {train_indices}")
    print(f"val_indices {val_indices}")



In [None]:
# Keep the mapping from column name to serial number in a dict.
namemap = {}
for i, column in enumerate(train.columns):
    # The target column keeps its name; every other column is renamed
    # to its position in train.columns (as a string).
    namemap[column] = 'Y' if column == 'Y' else str(i)
print(namemap)


In [None]:
# Collect the aliases (from namemap) of every feature column whose values
# can be converted to float; the target column 'Y' is skipped.
numeric_cols = []
for col in train.columns:
    if col == 'Y':  # the target column is not a feature
        continue
    try:
        # The converted values are discarded: this is only a convertibility probe.
        train[col].astype(float)
    except (ValueError, TypeError):
        # Not convertible to float -> leave the column out. Only conversion errors
        # are caught, so interrupts and unrelated failures are no longer swallowed.
        continue
    numeric_cols.append(namemap[col])
print(numeric_cols)


In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Fit a one-component LDA on the first n training rows, then project every row.
n = 1000
clf = LinearDiscriminantAnalysis(n_components=1)
clf.fit(train.drop(columns='Y').iloc[:n], train['Y'].iloc[:n])
train_x_embedded = clf.transform(train.drop(columns='Y'))

fig, ax = plt.subplots(figsize=(15, 15))

# Per-row colour derived from the label (0 -> red, 1 -> blue).
# The line plot below does not use it.
cm = train['Y'].replace(0, "red").replace(1, "blue")

print(train_x_embedded)
ax.plot(train_x_embedded)
plt.show()


In [None]:
import lightgbm as lgb
import numpy as np

# Buffer for the validation predictions: one slot per training row, filled fold by fold.
val_preds = np.zeros(train.shape[0])

# LightGBM training parameters, passed unchanged to lgb.train for every fold.
params = dict(
    objective="binary",
    metric="binary_logloss",
    learning_rate=0.001,
    num_iterations=20000,
    num_leaves=11,
    max_depth=4,
    min_data_in_leaf=0,
    bagging_fraction=0.8,
    bagging_freq=3,
    lambda_l2=1,
    lambda_l1=1,
    seed=42,
    n_jobs=-1,
)

models = []

# Cross-validated training: one booster per fold of `kf`. Each fold's validation
# rows receive predictions from the booster trained on the remaining rows, and
# every booster is kept in `models` for later use.
for train_indices, val_indices in kf.split(train):
    train_kfold, val_kfold = train.iloc[train_indices], train.iloc[val_indices]
    X_train, X_val = train_kfold.drop('Y', axis=1), val_kfold.drop('Y', axis=1)
    Y_train, Y_val = train_kfold['Y'], val_kfold['Y']
    # Rename the feature columns to their serial-number aliases from namemap.
    X_train.columns = X_train.columns.map(namemap)
    X_val.columns = X_val.columns.map(namemap)

    train_dataset = lgb.Dataset(X_train[numeric_cols], Y_train)
    val_dataset = lgb.Dataset(X_val[numeric_cols], Y_val, reference=train_dataset)

    evaluation_results = {}
    # Early stopping, periodic logging and metric recording are passed as callbacks:
    # the `early_stopping_rounds`, `verbose_eval` and `evals_result` keyword arguments
    # were removed from lgb.train in LightGBM 4.0 (callbacks need LightGBM >= 3.3).
    # No `num_boost_round` is passed because params["num_iterations"] takes precedence
    # over that argument, so the previous value of 10000 never took effect.
    model = lgb.train(
        params,
        train_dataset,
        valid_sets=[train_dataset, val_dataset],
        valid_names=['train', 'valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=1000),
            lgb.log_evaluation(period=10),
            lgb.record_evaluation(evaluation_results),
        ],
    )

    # Learning curves of this fold for both the training and the validation part.
    plt.plot(evaluation_results['train']['binary_logloss'], label='train')
    plt.plot(evaluation_results['valid']['binary_logloss'], label='valid')
    plt.ylabel('log loss')
    plt.xlabel('boosting round')
    plt.title('training performance')
    plt.legend()
    plt.show()

    models.append(model)
    val_preds[val_indices] = model.predict(X_val[numeric_cols])


In [None]:
from sklearn.metrics import accuracy_score
# Turn the validation probabilities into class labels (below 0.5 -> 0, otherwise 1),
# print them, and show the accuracy against the true labels as the cell output.
val_preds_result = (~(val_preds < 0.5)).astype(int)
print(val_preds_result)
accuracy_score(y_true=train['Y'], y_pred=val_preds_result)


In [None]:
# Take the 'id' column from a fresh read of the raw test CSV.
test_id = pd.read_csv("../data/test.csv").loc[:, 'id']

# Rename the test columns through namemap, then keep only the numeric feature aliases.
test.columns = test.columns.map(namemap)
X_test = test.loc[:, numeric_cols]


In [None]:
# One prediction vector per trained fold model, stacked into a single array
# whose first axis indexes the models.
preds = np.array([model.predict(X_test) for model in models])


In [None]:
# Show the shape and raw values of the stacked predictions. The shape is printed
# explicitly because a bare expression in the middle of a cell displays nothing.
print(preds.shape)
print(preds)

# Average over the models (axis 0), then threshold: below 0.5 -> 0, otherwise 1.
preds_mean = preds.mean(axis=0)
preds_mean = np.where(preds_mean < 0.5, 0, 1)
print(preds_mean)


In [None]:
# Two-column table (id, predicted label) written as comma-separated text without decimals.
submit = np.column_stack((test_id, preds_mean))

np.savetxt(fname="submission.csv", X=submit, fmt="%.0f", delimiter=',')
