In [None]:
import pandas as pd
from sklearn import metrics
import lightgbm as lgb
from sklearn.model_selection import train_test_split

In [None]:
# Load the dataset into a DataFrame.
# `filename` is a placeholder: set it to the path of the cleaned,
# pre-processed CSV before running this cell.
filename = ""
actorDF = pd.read_csv(filepath_or_buffer=filename)

In [None]:
# Separate the features (X) from the target (y).
# The 'label' column is the target; every remaining column is used as a feature.
# Add any other columns that should be excluded from the features to the list below.
# Non-numeric (categorical) columns are not expected in this data, but the
# following cell handles them in case they are present.
X = actorDF.drop(columns=['label'])
y = actorDF['label']

In [None]:
# Normalise feature dtypes before training.
#
# * object / category columns are converted to pandas 'category' dtype so that
#   categorical_feature='auto' (passed to fit) can pick them up.
# * float64 columns are converted to int64 ONLY when the conversion is lossless:
#   every value must be finite, fit in the int64 range and have no fractional
#   part. An unconditional cast would silently truncate real-valued features
#   (0.7 -> 0) and raise on NaN / inf, so such columns are left as float64.
for column_name in X.columns:
    column = X[column_name]
    column_dtype = column.dtype
    if column_dtype == 'object' or column_dtype.name == 'category':
        X[column_name] = column.astype('category')
    elif column_dtype == 'float64':
        # NaN and +/-inf fail the range comparison, so they block the cast too.
        fits_in_int64 = column.abs().lt(2 ** 63).all()
        is_integral = column.mod(1).eq(0).all()
        if fits_in_int64 and is_integral:
            X[column_name] = column.astype('int64')
X.info()

In [None]:
# Hold out 20% of the rows as a test set (80/20 split).
# Stratifying on y keeps the label proportions the same in both partitions,
# and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=320,
)

In [None]:
# Keyword arguments forwarded to clf.fit().
#
# Early stopping halts training once the log-loss on the evaluation set has not
# improved for 10 consecutive rounds. This guards against over-training and
# means the number of trees does not have to be tuned by hand.
#
# Early stopping is requested through the lgb.early_stopping callback rather
# than the `early_stopping_rounds` fit argument: that argument was removed from
# LGBMClassifier.fit in LightGBM 4.0, while the callback works on both old and
# new releases.
#
# NOTE(review): the evaluation set used for early stopping is the same
# (X_test, y_test) hold-out that is later reported as "Testing accuracy", so
# that figure is optimistically biased. Consider carving a separate validation
# set out of the training data for early stopping.
fit_params = {
    "callbacks": [lgb.early_stopping(stopping_rounds=10)],
    "eval_metric": 'logloss',
    "eval_set": [(X_test, y_test)],
    'eval_names': ['valid'],
    'feature_name': 'auto',  # that's actually the default
    'categorical_feature': 'auto',  # that's actually the default
}

In [None]:
# n_estimators is set to a deliberately large value: the number of trees
# actually built is decided by early stopping, and 1000 is only the upper bound.
#
# metric='None' disables the default training metric, so only the eval_metric
# passed to fit() is tracked.
#
# subsample_freq=1 is required for subsample=0.9 to have any effect: LightGBM
# only performs row subsampling (bagging) when the bagging frequency is
# non-zero, and the default of 0 silently disables it.
clf = lgb.LGBMClassifier(
    num_leaves=15,
    max_depth=-1,
    random_state=314,
    metric='None',
    n_jobs=4,
    n_estimators=1000,
    colsample_bytree=0.9,
    subsample=0.9,
    subsample_freq=1,
    learning_rate=0.1,
)

In [None]:
# Train the classifier on the training partition; the extra options
# (evaluation set, metric, early stopping, ...) come from fit_params.
clf.fit(X=X_train, y=y_train, **fit_params)

In [None]:
# Report mean accuracy on the training and the held-out partitions.
train_accuracy = clf.score(X_train, y_train)
print('Training accuracy {:.4f}'.format(train_accuracy))
test_accuracy = clf.score(X_test, y_test)
print('Testing accuracy {:.4f}'.format(test_accuracy))

# Feature importances of the fitted model, followed by the evaluation metric
# recorded during training.
lgb.plot_importance(clf)
lgb.plot_metric(clf)