In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Read the training content table and its label table, join them on `id`,
# and replace every missing value in the joined frame with the string 'EMPTY'.
content_df = pd.read_csv('../data/Train_Dataset.csv')
label_df = pd.read_csv('../data/Train_Dataset_Label.csv')
train_df = pd.merge(content_df, label_df, on='id').fillna('EMPTY')

In [22]:
# Test table, with missing values replaced by the string 'EMPTY'.
test_df = pd.read_csv('../data/Test_Dataset.csv').fillna('EMPTY')

# Three header-less CSVs (title / content / tail vectors, going by the file
# names) are read in order as plain arrays and joined column-wise into X_test.
test_title_data, test_content_data, test_tail_data = (
    pd.read_csv(vector_path, header=None).values
    for vector_path in (
        './test_title_word_vector.csv',
        './test_content_word_vector.csv',
        './test_tail_word_vector.csv',
    )
)
X_test = np.concatenate([test_title_data, test_content_data, test_tail_data], axis=1)

In [23]:
# Target vector: the `label` column of the merged training frame.
y = train_df['label'].values

# Three header-less CSVs (title / content / tail vectors, going by the file
# names) are read in order as plain arrays and joined column-wise into X.
train_title_data, train_content_data, train_tail_data = (
    pd.read_csv(vector_path, header=None).values
    for vector_path in (
        './train_title_word_vector.csv',
        './train_content_word_vector.csv',
        './train_tail_word_vector.csv',
    )
)
X = np.concatenate([train_title_data, train_content_data, train_tail_data], axis=1)

In [29]:
# LightGBM classifier; hyper-parameters are gathered in one dict so they are
# easy to find and tweak.
lgbm_params = {'learning_rate': 0.1, 'n_estimators': 100}
model = lgb.LGBMClassifier(**lgbm_params)

In [30]:
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

# 5-fold cross-validation of `model` on (X, y), scored with macro-averaged F1.
# The split is shuffled with a fixed seed so the folds are reproducible.
# NOTE(review): KFold does not stratify on y; if the classes are imbalanced,
# StratifiedKFold may give steadier macro-F1 estimates -- confirm before switching.
kf = KFold(5, shuffle=True, random_state=2019)
valid_scores = []  # per-fold macro-F1, kept so the folds can be summarised below
for index, (train_index, valid_index) in enumerate(kf.split(X, y)):
    print(index)
    X_train, X_valid, y_train, y_valid = X[train_index], X[valid_index], y[train_index], y[valid_index]
    # The same `model` object is refit on every fold, so after the loop it
    # holds the fit from the last fold.
    # NOTE(review): the `verbose` argument of fit() is believed to have been
    # removed in LightGBM 4.0; on newer versions pass
    # callbacks=[lgb.log_evaluation(10)] instead -- confirm against the installed version.
    model.fit(X=X_train, y=y_train, eval_set=(X_valid, y_valid), verbose=10)
    y_pred = model.predict(X_valid)
    valid_score = f1_score(y_valid, y_pred, average='macro')
    valid_scores.append(valid_score)
    print(valid_score)

# Summarise the folds so the CV result is a single comparable number instead
# of five values scattered through the training log.
print('mean macro-F1: %.4f (std %.4f)' % (np.mean(valid_scores), np.std(valid_scores)))

0
[10]	valid_0's multi_logloss: 0.768895
[20]	valid_0's multi_logloss: 0.66059
[30]	valid_0's multi_logloss: 0.592283
[40]	valid_0's multi_logloss: 0.544296
[50]	valid_0's multi_logloss: 0.512642
[60]	valid_0's multi_logloss: 0.488589
[70]	valid_0's multi_logloss: 0.471248
[80]	valid_0's multi_logloss: 0.458166
[90]	valid_0's multi_logloss: 0.448973
[100]	valid_0's multi_logloss: 0.440605
0.7339986656538774
1
[10]	valid_0's multi_logloss: 0.7515
[20]	valid_0's multi_logloss: 0.643626
[30]	valid_0's multi_logloss: 0.575992
[40]	valid_0's multi_logloss: 0.530146
[50]	valid_0's multi_logloss: 0.497139
[60]	valid_0's multi_logloss: 0.473081
[70]	valid_0's multi_logloss: 0.455587
[80]	valid_0's multi_logloss: 0.441997
[90]	valid_0's multi_logloss: 0.432054
[100]	valid_0's multi_logloss: 0.422949
0.7472889119017702
2
[10]	valid_0's multi_logloss: 0.751593
[20]	valid_0's multi_logloss: 0.643615
[30]	valid_0's multi_logloss: 0.576197
[40]	valid_0's multi_logloss: 0.531871
[50]	valid_0's multi_