In [2]:
import warnings
warnings.filterwarnings("ignore")

from articlecommon import *
from sklearn.preprocessing import QuantileTransformer
from sklearn.feature_selection import RFECV

# Sales channels that appear exactly once in X_le; their indicator columns
# are dropped from the one-hot encoding below.
counts = X_le["Policy_Sales_Channel"].value_counts().loc[lambda freq: freq == 1]
columns_to_remove = [
    "Policy_Sales_Channel_" + str(channel) for channel in counts.index.tolist()
]

# One-hot encode the channel column and discard the single-occurrence dummies.
psc_ohe = pd.get_dummies(
    X_le["Policy_Sales_Channel"], prefix="Policy_Sales_Channel"
).drop(columns=columns_to_remove)

# Original X_le columns side by side with the retained channel dummies.
X_le_psc = pd.concat([X_le, psc_ohe], axis=1)
lgbm = LGBMClassifier(random_state=42, verbose=-1)

In [3]:
from sklearn.pipeline import Pipeline

def choose_columns(X, y, model, model_name, step=1, cv=10, scoring='roc_auc', n_jobs=1):
    """Select feature columns with cross-validated recursive feature elimination.

    Fits an ``RFECV`` selector around ``model`` on ``(X, y)``, prints a short
    report and returns the names of the columns the selector kept.

    Parameters
    ----------
    X : object with a ``columns`` attribute (e.g. a pandas DataFrame)
        Feature matrix; ``X.columns`` is indexed with the selector's
        boolean support mask.
    y : array-like
        Target passed unchanged to ``RFECV.fit``.
    model : estimator
        Estimator handed to ``RFECV``.
    model_name : str
        Label printed in the report.
    step, cv, scoring, n_jobs :
        Forwarded to ``RFECV``. The defaults reproduce the values that were
        previously hard-coded (``step=1, cv=10, scoring='roc_auc', n_jobs=1``).

    Returns
    -------
    pandas.Series
        Names of the selected columns.
    """
    selector = RFECV(model, step=step, cv=cv, scoring=scoring, n_jobs=n_jobs)
    selector = selector.fit(X, y)

    print("Model: " + model_name)
    print("Number of columns: " + str(selector.n_features_))

    # support_ is a boolean mask over the input columns.
    columns = X.columns[selector.support_]
    print("Columns: " + str(columns))
    return pd.Series(columns)

class MyPipeline(Pipeline):
    """Pipeline that re-exports importance attributes of its final estimator.

    Both properties simply forward to the last step, so the pipeline object
    itself offers ``coef_`` / ``feature_importances_`` whenever that step does.
    In this notebook instances are handed to ``RFECV`` via ``choose_columns``.
    """

    @property
    def coef_(self):
        """Return ``coef_`` of the final estimator."""
        last_step = self._final_estimator
        return last_step.coef_

    @property
    def feature_importances_(self):
        """Return ``feature_importances_`` of the final estimator."""
        last_step = self._final_estimator
        return last_step.feature_importances_

# Default pipeline: quantile-transform the features to a normal distribution,
# then fit the LightGBM classifier.
# random_state is fixed because QuantileTransformer uses it for subsampling,
# so an unseeded transformer can give different results from run to run.
# The seed matches the classifier's.
model = MyPipeline([
    ('transformer', QuantileTransformer(output_distribution='normal', random_state=42)),
    ('lgbm', lgbm)
])

In [None]:
# Baseline: feature selection on the bare LightGBM classifier, no preprocessing step.
columns1 = choose_columns(
    X_le_psc, y, model=lgbm, model_name="LGBM without scaler"
)
columns1.to_csv("results/columns1.csv")
columns1

In [6]:
# Variant: StandardScaler in front of the LightGBM classifier.
scaler = StandardScaler()
model = MyPipeline(steps=[("scaler", scaler), ("lgbm", lgbm)])
columns2 = choose_columns(
    X_le_psc, y, model, model_name="LGBM with standard scaler"
)
columns2.to_csv("results/columns2.csv")
columns2

Model: LGBM with standard scaler
Number of columns: 25
Columns: Index(['Gender', 'Age', 'Driving_License', 'Region_Code', 'Previously_Insured',
       'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage', 'Policy_Sales_Channel_3.0',
       'Policy_Sales_Channel_11.0', 'Policy_Sales_Channel_25.0',
       'Policy_Sales_Channel_26.0', 'Policy_Sales_Channel_29.0',
       'Policy_Sales_Channel_61.0', 'Policy_Sales_Channel_122.0',
       'Policy_Sales_Channel_124.0', 'Policy_Sales_Channel_151.0',
       'Policy_Sales_Channel_152.0', 'Policy_Sales_Channel_154.0',
       'Policy_Sales_Channel_155.0', 'Policy_Sales_Channel_156.0',
       'Policy_Sales_Channel_157.0', 'Policy_Sales_Channel_160.0'],
      dtype='object')


0                         Gender
1                            Age
2                Driving_License
3                    Region_Code
4             Previously_Insured
5                    Vehicle_Age
6                 Vehicle_Damage
7                 Annual_Premium
8           Policy_Sales_Channel
9                        Vintage
10      Policy_Sales_Channel_3.0
11     Policy_Sales_Channel_11.0
12     Policy_Sales_Channel_25.0
13     Policy_Sales_Channel_26.0
14     Policy_Sales_Channel_29.0
15     Policy_Sales_Channel_61.0
16    Policy_Sales_Channel_122.0
17    Policy_Sales_Channel_124.0
18    Policy_Sales_Channel_151.0
19    Policy_Sales_Channel_152.0
20    Policy_Sales_Channel_154.0
21    Policy_Sales_Channel_155.0
22    Policy_Sales_Channel_156.0
23    Policy_Sales_Channel_157.0
24    Policy_Sales_Channel_160.0
dtype: object

In [7]:
# Variant: PowerTransformer (default settings) in front of the LightGBM classifier.
transformer = PowerTransformer()
model = MyPipeline(steps=[("transformer", transformer), ("lgbm", lgbm)])
columns3 = choose_columns(
    X_le_psc, y, model, model_name="LGBM with transformer"
)
columns3.to_csv("results/columns3.csv")
columns3

KeyboardInterrupt: 

In [None]:
# Variant: mean-centering only (with_std=False), then PowerTransformer,
# then the LightGBM classifier.
transformer = PowerTransformer()
scaler_without_std = StandardScaler(with_std=False)
model = MyPipeline(
    steps=[
        ("scaler", scaler_without_std),
        ("transformer", transformer),
        ("lgbm", lgbm),
    ]
)
columns4 = choose_columns(
    X_le_psc, y, model, model_name="LGBM with scaler and transformer"
)
columns4.to_csv("results/columns4.csv")
columns4

In [5]:
# Variant: QuantileTransformer with a uniform output distribution in front of
# the LightGBM classifier.
# random_state is fixed because QuantileTransformer uses it for subsampling,
# so an unseeded transformer can make the selected column set vary between
# runs. The seed matches the classifier's.
quant_trans_uniform = QuantileTransformer(output_distribution='uniform', random_state=42)
model = MyPipeline([
    ('transformer', quant_trans_uniform),
    ('lgbm', lgbm)
])
columns5 = choose_columns(X_le_psc, y, model, "LGBM with QuantileTransformer (uniform distribution)")
columns5.to_csv('results/columns5.csv')
columns5

Model: LGBM with QuantileTransformer (uniform distribution)
Number of columns: 28
Columns: Index(['Gender', 'Age', 'Driving_License', 'Region_Code', 'Previously_Insured',
       'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage', 'Policy_Sales_Channel_3.0',
       'Policy_Sales_Channel_11.0', 'Policy_Sales_Channel_22.0',
       'Policy_Sales_Channel_25.0', 'Policy_Sales_Channel_26.0',
       'Policy_Sales_Channel_29.0', 'Policy_Sales_Channel_31.0',
       'Policy_Sales_Channel_52.0', 'Policy_Sales_Channel_61.0',
       'Policy_Sales_Channel_122.0', 'Policy_Sales_Channel_124.0',
       'Policy_Sales_Channel_151.0', 'Policy_Sales_Channel_152.0',
       'Policy_Sales_Channel_154.0', 'Policy_Sales_Channel_155.0',
       'Policy_Sales_Channel_156.0', 'Policy_Sales_Channel_157.0',
       'Policy_Sales_Channel_160.0'],
      dtype='object')


0                         Gender
1                            Age
2                Driving_License
3                    Region_Code
4             Previously_Insured
5                    Vehicle_Age
6                 Vehicle_Damage
7                 Annual_Premium
8           Policy_Sales_Channel
9                        Vintage
10      Policy_Sales_Channel_3.0
11     Policy_Sales_Channel_11.0
12     Policy_Sales_Channel_22.0
13     Policy_Sales_Channel_25.0
14     Policy_Sales_Channel_26.0
15     Policy_Sales_Channel_29.0
16     Policy_Sales_Channel_31.0
17     Policy_Sales_Channel_52.0
18     Policy_Sales_Channel_61.0
19    Policy_Sales_Channel_122.0
20    Policy_Sales_Channel_124.0
21    Policy_Sales_Channel_151.0
22    Policy_Sales_Channel_152.0
23    Policy_Sales_Channel_154.0
24    Policy_Sales_Channel_155.0
25    Policy_Sales_Channel_156.0
26    Policy_Sales_Channel_157.0
27    Policy_Sales_Channel_160.0
dtype: object

In [4]:
# Variant: QuantileTransformer with a normal output distribution in front of
# the LightGBM classifier.
# random_state is fixed because QuantileTransformer uses it for subsampling,
# so an unseeded transformer can make the selected column set vary between
# runs. The seed matches the classifier's.
model = MyPipeline([
    ('transformer', QuantileTransformer(output_distribution='normal', random_state=42)),
    ('lgbm', lgbm)
])
columns6 = choose_columns(X_le_psc, y, model, "LGBM with QuantileTransformer (distribution normal)")
columns6.to_csv('results/columns6.csv')
columns6

Model: LGBM with QuantileTransformer (distribution normal)
Number of columns: 120
Columns: Index(['Gender', 'Age', 'Driving_License', 'Region_Code', 'Previously_Insured',
       'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage',
       ...
       'Policy_Sales_Channel_151.0', 'Policy_Sales_Channel_152.0',
       'Policy_Sales_Channel_153.0', 'Policy_Sales_Channel_154.0',
       'Policy_Sales_Channel_155.0', 'Policy_Sales_Channel_156.0',
       'Policy_Sales_Channel_157.0', 'Policy_Sales_Channel_158.0',
       'Policy_Sales_Channel_159.0', 'Policy_Sales_Channel_160.0'],
      dtype='object', length=120)


0                          Gender
1                             Age
2                 Driving_License
3                     Region_Code
4              Previously_Insured
                  ...            
115    Policy_Sales_Channel_156.0
116    Policy_Sales_Channel_157.0
117    Policy_Sales_Channel_158.0
118    Policy_Sales_Channel_159.0
119    Policy_Sales_Channel_160.0
Length: 120, dtype: object