## Getting the data in the right format

In [None]:
# Encode the categorical columns as integer codes with sklearn's LabelEncoder.
# Remember the difference between LabelEncoder (one integer code per category,
# in a single column) and one-hot encoding (one indicator column per category).
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
column = ['Sex','Embarked','Title']
# NOTE(review): entire_set is defined outside this cell; presumably
# entire_set[0] is the train DataFrame and entire_set[1] the test one -- confirm.
train = entire_set[0]
train_feature_importance = train.copy()
test = entire_set[1]
# copy must be *called*: the bare attribute `test.copy` would bind the method
# object itself rather than an independent copy of the data.
test_feature_importance = test.copy()

for i in column:
    # The same encoder instance is refitted on every column, so after the loop
    # `le` only holds the classes of the last column in `column`.
    train_feature_importance[i] = le.fit_transform(train_feature_importance[i])


In [None]:
# Feature matrix: every column except the target ('Survived') and 'Name'.
excluded_columns = ['Survived','Name']
feature_mask = ~train_feature_importance.columns.isin(excluded_columns)
X = train_feature_importance.loc[:, feature_mask]

# Target vector.
y = train_feature_importance['Survived']

## Working out the actual importances of the features

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV

# Accumulator for the feature importances: one slot per column of X.
feature_importances = np.zeros(X.shape[1])

# Initialize the model.
model = lgb.LGBMClassifier(objective='binary',boosting_type='goss', n_estimators=10000, class_weight='balanced')

# Number of train/validation resplits the importances are averaged over.
n_iterations = 2

for i in range(n_iterations):
    # Split the training data into a train and a validation set.
    # The seed changes with the iteration: with one fixed seed every pass
    # would reuse the identical split and the average below would add nothing.
    # 42 + i keeps the first split the same as the original fixed-seed split.
    train_features, valid_features, train_y, valid_y = train_test_split(X,y, test_size = 0.2, random_state=42 + i)

    # Train the model with early stopping on the validation AUC.
    # NOTE(review): LightGBM 4.0 removed the `early_stopping_rounds` and
    # `verbose` keyword arguments of fit() in favour of
    # callbacks=[lgb.early_stopping(100), lgb.log_evaluation(200)] --
    # confirm the installed version before upgrading.
    model.fit(train_features, train_y, early_stopping_rounds=100, eval_set=[(valid_features, valid_y)], eval_metric='auc', verbose=200)

    # Record the feature importances of this fit.
    feature_importances += model.feature_importances_

# Average over the resplits, then tabulate per feature, most important first.
feature_importances = feature_importances / n_iterations
feature_importances = pd.DataFrame({'feature': X.columns, 'importance': feature_importances}).sort_values('importance', ascending = False)

## Function to plot feature importance

In [None]:
def plot_feature_importances(df, threshold = 0.9):
    """
    Plots the 15 most important features and the cumulative importance of features.
    Prints the number of features needed to exceed the threshold cumulative
    importance, or a notice when the threshold is never exceeded.
    
    Sets the global matplotlib font size (plt.rcParams['font.size']) to 18.
    
    Parameters
    --------
    df : dataframe
        Dataframe of feature importances. Columns must be feature and importance
    threshold : float, default = 0.9
        Threshold for printing information about cumulative importances
        
    Return
    --------
    df : dataframe
        Dataframe ordered by feature importances with a normalized column (sums to 1)
        and a cumulative importance column. The previous index is kept as an
        extra column by reset_index().
    
    """
    
    plt.rcParams['font.size'] = 18
    
    # Sort features according to importance
    df = df.sort_values('importance', ascending = False).reset_index()
    
    # Normalize the feature importances to add up to one
    df['importance_normalized'] = df['importance'] / df['importance'].sum()
    df['cumulative_importance'] = np.cumsum(df['importance_normalized'])
    
    # Make a horizontal bar chart of feature importances
    plt.figure(figsize = (10, 6))
    ax = plt.subplot()
    
    # Reverse the bar positions so the most important feature is drawn on top
    top_features = df.head(15)
    bar_positions = list(reversed(list(top_features.index)))
    ax.barh(bar_positions, 
            top_features['importance_normalized'], 
            align = 'center', edgecolor = 'k')
    
    # Set the yticks and labels
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_features['feature'])
    
    # Plot labeling
    plt.xlabel('Normalized Importance'); plt.title('Feature Importances')
    plt.show()
    
    # Cumulative importance plot
    plt.figure(figsize = (8, 6))
    plt.plot(list(range(len(df))), df['cumulative_importance'], 'r-')
    plt.xlabel('Number of Features'); plt.ylabel('Cumulative Importance'); 
    plt.title('Cumulative Feature Importance');
    plt.show();
    
    # Positions whose cumulative importance exceeds the threshold. This can be
    # empty (threshold >= 1, an empty frame, or all-zero importances giving
    # NaN), in which case taking the minimum directly would raise ValueError.
    above_threshold = np.flatnonzero(df['cumulative_importance'].to_numpy() > threshold)
    if above_threshold.size:
        # The cumulative sum is in row order, so the first hit is the smallest index
        importance_index = above_threshold[0]
        print('%d features required for %0.2f of cumulative importance' % (importance_index + 1, threshold))
    else:
        print('Cumulative importance never exceeds %0.2f with the %d available features' % (threshold, len(df)))
    
    return df   

In [None]:
# Plot the averaged importances with the default 0.9 cumulative threshold;
# the call also returns the sorted, normalized importance table.
plot_feature_importances(feature_importances)