# ZOMBIE FIRMS CLASSIFICATION

In [1]:
%matplotlib inline

# Pull in the helper that installs a blanket warnings filter.
from warnings import simplefilter

# Silence every FutureWarning raised from here on.
simplefilter("ignore", FutureWarning)

## Prepare util functions

In [2]:
import lightgbm as lgb
from sklearn import metrics

def train_model(X_train, y_train, X_valid, y_valid, test=None, feature_cols=None, is_base=True):
    """Train a LightGBM binary classifier, print validation metrics and
    optionally write thresholded test predictions to a CSV file.

    Parameters
    ----------
    X_train, X_valid : DataFrame
        Training / validation features; indexed with ``feature_cols``.
    y_train, y_valid : array-like
        Training / validation labels (the objective is ``'binary'``).
    test : DataFrame, optional
        When given, must contain an ``'ID'`` column plus ``feature_cols``.
        Predictions are written to ``'test_base.txt'`` if ``is_base`` is true,
        otherwise to ``'test_.txt'``, as ``ID,flag`` rows.
    feature_cols : sequence of column labels, optional
        Columns fed to the model. Defaults to every column of ``X_train``
        except the four raw categorical columns.
    is_base : bool
        Selects the output file name (see ``test``).

    Returns
    -------
    (bst, valid_score)
        The trained booster and the validation ROC AUC, computed from the
        predicted probabilities.
    """
    if feature_cols is None:
        # NOTE(review): only the raw categorical columns are excluded here; an
        # identifier column such as 'ID', if present in X_train, stays in the
        # feature set -- confirm that this is intended.
        feature_cols = X_train.columns.drop(["行业", "区域", "企业类型", "控制人类型"])

    dtrain = lgb.Dataset(X_train[feature_cols], label=y_train)
    dvalid = lgb.Dataset(X_valid[feature_cols], label=y_valid)

    param = {'num_leaves': 64, 'objective': 'binary',
             'metric': 'auc', 'seed': 7}
    num_round = 1000

    print("Training model!")
    # NOTE(review): newer LightGBM releases configure early stopping and
    # logging through callbacks instead of these keyword arguments -- confirm
    # against the installed version.
    bst = lgb.train(param, dtrain, num_round, valid_sets=[dvalid],
                    early_stopping_rounds=20, verbose_eval=False)

    # Keep the raw probabilities for the ranking metric; threshold them only
    # for the label-based metrics.
    valid_proba = bst.predict(X_valid[feature_cols])
    valid_label = (valid_proba > 0.5).astype(int)

    # ROC AUC measures ranking quality, so it must see the scores rather than
    # the 0/1 labels (which would reduce it to a single operating point).
    valid_score = metrics.roc_auc_score(y_valid, valid_proba)
    print("precision recall fscore support:")
    # 'binary' reports the positive-class metrics; 'micro' on a binary target
    # makes precision, recall and F-score all equal to plain accuracy.
    print(metrics.precision_recall_fscore_support(y_valid, valid_label, average='binary'))
    print(f"Validation AUC score: {valid_score}")

    if test is not None:
        test_label = (bst.predict(test[feature_cols]) > 0.5).astype(int)

        # Attach predictions by position: they come back in row order, so an
        # index-based join would misalign them if `test` does not carry a
        # default 0..n-1 index.
        submission = test[['ID']].assign(flag=test_label)

        out_path = 'test_base.txt' if is_base else 'test_.txt'
        submission.to_csv(out_path, sep=',', index=False)

    return bst, valid_score

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


## Load data

In [3]:
import pandas as pd

# Training set: the labelled rows.
all_data = pd.read_csv("data/train/train.csv")

# Test sets: the full test file first, then the base test file.
test, test_base = map(pd.read_csv, ("data/test/test.csv", "data/test/base-test.csv"))

# Only the final expression of a cell is rendered, so the count() result is
# computed and dropped; the nunique() summary is what gets displayed.
test.count()
test_base.nunique()

ID         7132
注册时间         15
注册资本        990
行业            6
区域            7
企业类型          5
控制人类型         2
控制人持股比例      50
dtype: int64

## Deal with Categorical Data

### Count encoding for categorical variables

In [4]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.utils import shuffle

# Separate the feature columns from the target column "flag".
all_data_X = all_data[all_data.columns.drop(["flag"])]
all_data_y = all_data["flag"]

# Shuffle features and labels together. The seed is fixed so that a fresh
# Restart & Run All reproduces the same row order: train_test_split below is
# seeded, but it picks rows by position, so an unseeded shuffle here would
# still change which rows land in each split on every run.
all_data_X, all_data_y = shuffle(all_data_X, all_data_y, random_state=66)
# The test frame is not shuffled here; it keeps its file order.

train_X, valid_X, train_y, valid_y = train_test_split(all_data_X, all_data_y, random_state=66)

In [5]:
# Raw categorical columns that each get a "<column>_count" companion feature.
cat_features = ["行业", "区域", "企业类型", "控制人类型"]
count_enc = ce.CountEncoder(cols=cat_features)

# The encoder is fitted on the training split only.
count_enc.fit(train_X[cat_features])


def _append_count_features(frame):
    """Return `frame` joined with the "_count"-suffixed output of `count_enc`."""
    counts = count_enc.transform(frame[cat_features])
    return frame.join(counts.add_suffix("_count"))


train_encoded_X, valid_encoded_X, test_encoded = [
    _append_count_features(frame) for frame in (train_X, valid_X, test)
]

In [6]:
print("Baseline model")
# Train on the split without count features; is_base=True selects the
# 'test_base.txt' output file.
# Noted validation AUC score: 0.999999380520901
_ = train_model(
    X_train=train_X,
    y_train=train_y,
    X_valid=valid_X,
    y_valid=valid_y,
    test=test,
    is_base=True,
)

Baseline model
Training model!
precision recall fscore support:
(1.0, 1.0, 1.0, None)
Validation AUC score: 1.0


In [7]:
print("Count Encoding model")
# Train on the split that carries the "_count" features; is_base=False selects
# the 'test_.txt' output file.
# Noted validation AUC score: 0.9999996696111472
_ = train_model(
    X_train=train_encoded_X,
    y_train=train_y,
    X_valid=valid_encoded_X,
    y_valid=valid_y,
    test=test_encoded,
    is_base=False,
)

Count Encoding model
Training model!
precision recall fscore support:
(1.0, 1.0, 1.0, None)
Validation AUC score: 1.0
