<a href="https://colab.research.google.com/github/klein-mask/signate/blob/main/competitions/no_413/gb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Mount Google Drive and set the project path

In [1]:
# Mount Google Drive at /content/drive so the project files stored there can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Project root on the mounted Drive; every data/model path below is built from it.
project_path = '/content/drive/MyDrive/ml/signate/no_413'

## 2. Load the training data into a pandas DataFrame

In [3]:
import pandas as pd
# Read the training set from the project's data folder.
train = pd.read_csv(f'{project_path}/data/train.csv')

In [4]:
# Preview the first three training rows.
train.head(3)

Unnamed: 0,index,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,native-country,Y
0,322,21,Private,132652,Some-college,10,Divorced,Adm-clerical,Own-child,White,Female,United-States,0
1,11968,29,Private,132652,HS-grad,9,Divorced,Adm-clerical,Not-in-family,White,Female,United-States,0
2,10868,19,Private,132652,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Female,United-States,0


## 3. Split into x (features) and y (label)

In [5]:
# 'Y' is the target; every other column stays on the feature side.
y = train['Y']
x = train.drop(columns='Y')

In [6]:
# Column dtypes and non-null counts of the feature frame.
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11900 entries, 0 to 11899
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   index           11900 non-null  int64 
 1   age             11900 non-null  int64 
 2   workclass       11900 non-null  object
 3   fnlwgt          11900 non-null  int64 
 4   education       11900 non-null  object
 5   education-num   11900 non-null  int64 
 6   marital-status  11900 non-null  object
 7   occupation      11900 non-null  object
 8   relationship    11900 non-null  object
 9   race            11900 non-null  object
 10  sex             11900 non-null  object
 11  native-country  11900 non-null  object
dtypes: int64(4), object(8)
memory usage: 1.1+ MB


## 4. Feature engineering

In [7]:
#x = x.drop('index', axis=1)

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

def preprocess(_data: pd.DataFrame, state: 'dict | None' = None):
    """Convert a raw census frame into a standardized numeric feature matrix.

    The input is copied, then: ``index`` and ``native-country`` are dropped,
    ``'?'`` in ``workclass`` becomes the most frequent workclass, ``sex`` is
    mapped to 0/1, ``education`` is label-encoded, ``'?'`` in ``occupation``
    becomes ``'None'``, the remaining categorical columns are one-hot encoded
    and every resulting column is standardized.

    Parameters
    ----------
    _data : pd.DataFrame
        Raw frame containing the columns named above. It is not modified.
    state : dict, optional
        Carrier for everything that is learned from the data (workclass fill
        value, education encoder, dummy-column layout, scaler).

        * ``None`` (default): everything is learned from ``_data`` itself and
          discarded afterwards.
        * empty dict: everything is learned from ``_data`` and stored in it.
        * dict filled by an earlier call: the stored objects are reused, so
          ``_data`` is transformed exactly like the frame that filled it.
          Pass the same dict for the training frame first and the test frame
          second to give both the same columns, codes and scaling.

    Returns
    -------
    numpy.ndarray
        The scaled feature matrix produced by ``StandardScaler``.
    """
    # None or an empty dict means "learn from this frame"; a filled dict means "reuse".
    fitting = not state
    fitted = {} if state is None else state

    data = _data.copy()
    data = data.drop('index', axis=1)

    # Treat '?' as the most frequent workclass (expected to be 'Private',
    # which clearly dominates the column).
    if fitting:
        fitted['workclass_fill'] = data['workclass'].value_counts(sort=True).index[0]
    data['workclass'] = data['workclass'].replace('?', fitted['workclass_fill'])

    sex_mapping = {
        'Male': 0,
        'Female': 1
    }
    data['sex'] = data['sex'].map(sex_mapping)

    # In reuse mode LabelEncoder.transform rejects values it was not fitted on.
    if fitting:
        fitted['education_encoder'] = LabelEncoder().fit(data['education'])
    data['education'] = fitted['education_encoder'].transform(data['education'])

    data = data.drop('native-country', axis=1)
    data['occupation'] = data['occupation'].replace('?', 'None')

    ohe_columns = ['workclass', 'marital-status', 'occupation', 'relationship', 'race']
    data = pd.get_dummies(data, columns=ohe_columns)
    if fitting:
        fitted['columns'] = list(data.columns)
    else:
        # Align to the stored layout: categories absent from this frame become
        # all-zero columns, categories unknown to the stored layout are dropped.
        data = data.reindex(columns=fitted['columns'], fill_value=0)

    if fitting:
        fitted['scaler'] = StandardScaler().fit(data)
    data = fitted['scaler'].transform(data)

    return data

# Build the feature matrix straight from `train` instead of from `x`, so that
# re-running this cell never feeds the already-transformed array back into preprocess.
x = preprocess(train.drop('Y', axis=1))

In [8]:
# x['workclass'].value_counts().plot.bar(figsize=(15, 3), color='darkblue')

In [9]:
# x['education'].value_counts().plot.bar(figsize=(15, 3), color='pink')

In [10]:
# x['education'].value_counts().plot.bar(figsize=(15, 3))

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

In [13]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; random_state pins the split.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [14]:
import lightgbm as lgb

# Wrap both halves of the split in LightGBM datasets.
lgb_train = lgb.Dataset(x_train, y_train)
lgb_test = lgb.Dataset(x_test, y_test)

lgb_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary'
}

# Filled in place by the record_evaluation callback with the per-iteration
# metric history of every validation set.
lgb_results = {}

# Early stopping and metric recording are passed as callbacks: the
# `early_stopping_rounds=` / `evals_result=` keyword arguments of lgb.train were
# removed in LightGBM 4.0, while the callbacks work on older releases as well.
model = lgb.train(
    params=lgb_params,
    train_set=lgb_train,
    valid_sets=[lgb_train, lgb_test],
    valid_names=['Train', 'Test'],
    num_boost_round=100,
    callbacks=[
        lgb.early_stopping(stopping_rounds=10),
        lgb.record_evaluation(lgb_results),
    ],
)

[1]	Train's binary_logloss: 0.529232	Test's binary_logloss: 0.528213
Training until validation scores don't improve for 10 rounds.
[2]	Train's binary_logloss: 0.498134	Test's binary_logloss: 0.498509
[3]	Train's binary_logloss: 0.47352	Test's binary_logloss: 0.47493
[4]	Train's binary_logloss: 0.453022	Test's binary_logloss: 0.455226
[5]	Train's binary_logloss: 0.435839	Test's binary_logloss: 0.43892
[6]	Train's binary_logloss: 0.421311	Test's binary_logloss: 0.424948
[7]	Train's binary_logloss: 0.408981	Test's binary_logloss: 0.413278
[8]	Train's binary_logloss: 0.3984	Test's binary_logloss: 0.403539
[9]	Train's binary_logloss: 0.389123	Test's binary_logloss: 0.395324
[10]	Train's binary_logloss: 0.381343	Test's binary_logloss: 0.388059
[11]	Train's binary_logloss: 0.374376	Test's binary_logloss: 0.381936
[12]	Train's binary_logloss: 0.368353	Test's binary_logloss: 0.376748
[13]	Train's binary_logloss: 0.363088	Test's binary_logloss: 0.3722
[14]	Train's binary_logloss: 0.358382	Test's

In [15]:
import numpy as np
# Lowest recorded log-loss on each evaluation set, followed by the iteration
# the booster reports as its best.
best_iteration = model.best_iteration
print(
    np.min(lgb_results['Train']['binary_logloss']),
    np.min(lgb_results['Test']['binary_logloss']),
    best_iteration,
    sep='\n',
)

0.3074408505477516
0.34064725310784655
40


In [16]:
# Score the held-out split, using only the trees up to the model's best iteration.
y_pred = model.predict(x_test, num_iteration=model.best_iteration)

In [17]:
# Show the raw prediction scores.
y_pred

array([0.05807381, 0.46130942, 0.01579358, ..., 0.12843733, 0.03029955,
       0.02972164])

In [18]:
# Turn scores into hard labels: below 0.5 -> 0, anything else -> 1.
y_pred = [0 if score < 0.5 else 1 for score in y_pred]

In [19]:
from sklearn.metrics import accuracy_score
# Accuracy of the thresholded predictions on the held-out split.
acc = accuracy_score(y_test, y_pred)
print(acc)

0.8436974789915966


In [20]:
# Persist the trained booster next to the data.
# NOTE(review): Booster.save_model writes LightGBM's own model format, so the
# '.h5' suffix is only a file name — confirm nothing downstream expects HDF5.
model.save_model(f'{project_path}/data/lgb-v1.h5')

<lightgbm.basic.Booster at 0x7fcf86387ad0>

In [21]:
# Read the competition test set from the project's data folder.
test = pd.read_csv(f'{project_path}/data/test.csv')

In [22]:
# Preview the first three test rows.
test.head(3)

Unnamed: 0,index,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,native-country
0,3873,17,Local-gov,132652,Masters,14,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States
1,3625,23,Private,132652,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,United-States
2,3028,19,Private,132652,11th,7,Never-married,Handlers-cleaners,Own-child,White,Female,United-States


In [23]:
# Keep the identifier column for the submission file; preprocess drops 'index'.
test_idx = test['index']
# NOTE(review): preprocess is called on the test frame by itself, so the label
# encoder, the dummy columns and the scaler are all derived from the test rows
# rather than from the training rows — confirm the resulting columns and scaling
# match what the model was trained on.
test_fe = preprocess(test)

In [25]:
# Inspect the preprocessed test matrix.
test_fe

array([[-1.16440708,  0.12289487,  0.41215588, ..., -0.14498628,
        -0.24056561,  0.28535055],
       [-0.52378325,  0.12289487,  1.17430054, ..., -0.14498628,
        -0.24056561,  0.28535055],
       [-0.9508658 ,  0.12289487, -3.01749511, ..., -0.14498628,
        -0.24056561,  0.28535055],
       ...,
       [ 0.86423504,  0.12289487, -1.49320578, ..., -0.14498628,
        -0.24056561,  0.28535055],
       [ 0.54392312,  0.12289487,  0.79322821, ..., -0.14498628,
        -0.24056561,  0.28535055],
       [ 0.86423504,  0.12289487, -0.34998878, ..., -0.14498628,
        -0.24056561,  0.28535055]])

In [26]:
# Score the competition test set, using only the trees up to the model's best iteration.
y_test_pred = model.predict(test_fe, num_iteration=model.best_iteration)

In [27]:
# Show the raw prediction scores for the test set.
y_test_pred

array([0.70748324, 0.3141271 , 0.00636914, ..., 0.25283694, 0.03671712,
       0.65170071])

In [28]:
# Turn scores into hard labels: below 0.5 -> 0, anything else -> 1.
y_test_pred = [0 if score < 0.5 else 1 for score in y_test_pred]

In [29]:
# Submission table: the original test identifiers next to the predicted labels.
res = pd.DataFrame({'index': test_idx, 'label': y_test_pred})

In [30]:
# Preview the first three submission rows.
res.head(3)

Unnamed: 0,index,label
0,3873,1
1,3625,0
2,3028,0


In [32]:
# Write the submission as headerless `index,label` rows. index=False keeps the
# DataFrame's own RangeIndex out of the file; with the default it would be
# written as an extra leading column, giving three columns instead of two.
res.to_csv(f'{project_path}/data/lgb-prediction-v1.csv', header=False, index=False)