In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from feature_engine.selection import DropConstantFeatures, DropDuplicateFeatures
from feature_engine.selection import DropCorrelatedFeatures, SmartCorrelatedSelection

In [3]:
X_train = pd.read_csv("Xtrain_k_one_hot_scaled.csv")
X_test = pd.read_csv("Xtest_k_one_hot_scaled.csv")
ytrain = pd.read_csv("ytrain.csv")
ytest = pd.read_csv("ytest.csv")
print("Shape of X Train: {}".format(X_train.shape))
print("Shape of X Test: {}".format(X_test.shape))
print("Shape of y Train: {}".format(ytrain.shape))
print("Shape of y Test: {}".format(ytest.shape))

Shape of X Train: (8672, 76)
Shape of X Test: (2168, 76)
Shape of y Train: (8672, 1)
Shape of y Test: (2168, 1)


In [4]:
X_train

Unnamed: 0,Reviews,Rating,days_since_update,Price,Category_TRAVEL_AND_LOCAL,Category_VIDEO_PLAYERS,Category_FINANCE,Category_FAMILY,Category_MEDICAL,Category_GAME,...,Genres_Productivity,Genres_Books & Reference,Genres_Shopping,Genres_Puzzle,Genres_Casual,Genres_News & Magazines,Genres_Sports,Genres_Action,Genres_Simulation,Genres_Food & Drink
0,-0.144827,0.403271,-0.603588,0.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-0.150937,-1.053274,0.637297,0.0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-0.151044,1.651738,3.887354,0.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,-0.150943,1.027505,-0.613697,0.0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,-0.122954,0.611349,-0.570733,0.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8667,-0.017494,0.611349,-0.633915,0.0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8668,-0.141238,-0.220963,0.619606,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8669,-0.012583,0.195193,-0.651606,0.0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8670,-0.150341,1.027505,-0.454479,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8672 entries, 0 to 8671
Data columns (total 76 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Reviews                         8672 non-null   float64
 1   Rating                          8672 non-null   float64
 2   days_since_update               8672 non-null   float64
 3   Price                           8672 non-null   float64
 4   Category_TRAVEL_AND_LOCAL       8672 non-null   int64  
 5   Category_VIDEO_PLAYERS          8672 non-null   int64  
 6   Category_FINANCE                8672 non-null   int64  
 7   Category_FAMILY                 8672 non-null   int64  
 8   Category_MEDICAL                8672 non-null   int64  
 9   Category_GAME                   8672 non-null   int64  
 10  Category_SOCIAL                 8672 non-null   int64  
 11  Category_PERSONALIZATION        8672 non-null   int64  
 12  Category_PHOTOGRAPHY            86

## Constant and Quasi-constant features
Mithilfe des untenstehenden Befehls löschen wir quasi-konstante und konstante Variablen.
Wenn ich nur konstante löschen möchte, muss ich die Toleranz auf 1 setzen.

In [4]:
sel = DropConstantFeatures(tol=1, variables=None, missing_values='raise') # quasi-constant drop as tol is not equal to 1, with tol = 1 we get Constant feature dropping

sel.fit(X_train)

In [5]:
# number of quasi-constant features

len(sel.features_to_drop_)

0

In [6]:
# list of quasi-constant features

sel.features_to_drop_

[]

In [7]:
#remove the quasi-constant features

X_train_trans = sel.transform(X_train)
X_test_trans = sel.transform(X_test)

X_train_trans.shape, X_test_trans.shape

((8672, 76), (2168, 76))

## Duplicates
Doppelte Werte enthalten die gleiche Information, aus dem Grund wollen wir diese entfernen. Dies machen wir nun mit folgnedem Befehl.

In [8]:
# set up the selector
sel = DropDuplicateFeatures(variables=None, missing_values='raise')

# find the duplicate features, this might take a while
sel.fit(X_train_trans)

In [9]:
# these are the pairs of duplicated features
# each set are duplicates

sel.duplicated_feature_sets_

[{'Category_FINANCE', 'Genres_Finance'},
 {'Category_MEDICAL', 'Genres_Medical'},
 {'Category_SOCIAL', 'Genres_Social'},
 {'Category_PERSONALIZATION', 'Genres_Personalization'},
 {'Category_PHOTOGRAPHY', 'Genres_Photography'},
 {'Category_MAPS_AND_NAVIGATION', 'Genres_Maps & Navigation'},
 {'Category_HEALTH_AND_FITNESS', 'Genres_Health & Fitness'},
 {'Category_COMMUNICATION', 'Genres_Communication'},
 {'Category_DATING', 'Genres_Dating'},
 {'Category_BUSINESS', 'Genres_Business'},
 {'Category_PRODUCTIVITY', 'Genres_Productivity'},
 {'Category_BOOKS_AND_REFERENCE', 'Genres_Books & Reference'},
 {'Category_SHOPPING', 'Genres_Shopping'},
 {'Category_NEWS_AND_MAGAZINES', 'Genres_News & Magazines'},
 {'Category_FOOD_AND_DRINK', 'Genres_Food & Drink'}]

In [10]:
# these are the features that will be dropped
# 1 from each of the pairs above

sel.features_to_drop_

{'Genres_Books & Reference',
 'Genres_Business',
 'Genres_Communication',
 'Genres_Dating',
 'Genres_Finance',
 'Genres_Food & Drink',
 'Genres_Health & Fitness',
 'Genres_Maps & Navigation',
 'Genres_Medical',
 'Genres_News & Magazines',
 'Genres_Personalization',
 'Genres_Photography',
 'Genres_Productivity',
 'Genres_Shopping',
 'Genres_Social'}

In [11]:
# remove the duplicated features

X_train_trans = sel.transform(X_train_trans)
X_test_trans = sel.transform(X_test_trans)

X_train_trans.shape, X_test_trans.shape

((8672, 61), (2168, 61))

## Correlation
### Brut force approach

In [12]:
# set up the selector

sel = DropCorrelatedFeatures(
    threshold=0.8,
    method='pearson',
    missing_values='ignore'
)


# find correlated features

sel.fit(X_train_trans)

In [13]:
# each set contains a group of correlated features

sel.correlated_feature_sets_

[{'Category_TRAVEL_AND_LOCAL', 'Genres_Travel & Local'},
 {'Category_VIDEO_PLAYERS', 'Genres_Video Players & Editors'},
 {'Category_TOOLS', 'Genres_Tools'},
 {'Category_LIFESTYLE', 'Genres_Lifestyle'},
 {'Category_SPORTS', 'Genres_Sports'},
 {'Type_Free', 'Type_Paid'}]

In [14]:
# the transformer selects 1 feature from each group.
# the rest will be removed and can be found in this attribute

len(sel.features_to_drop_)

6

In [15]:
# drop correlated features

X_train_trans_brut = sel.transform(X_train_trans)
X_test_trans_brut = sel.transform(X_test_trans)

print("Die formate vor der Transformation sind: {}".format(X_train.shape))
print("Die formate nach der Transformation sind: {}".format(X_train_trans_brut.shape))

Die formate vor der Transformation sind: (8672, 76)
Die formate nach der Transformation sind: (8672, 55)


### Smart Method
Hier werden wir nun die smartere Methode mit der Varianz sehen.

In [16]:
# correlation selector

sel = SmartCorrelatedSelection(
    variables=None,
    method="pearson",
    threshold=0.8,
    missing_values="raise",
    selection_method="variance",
    estimator=None,
    scoring="roc_auc",
    cv=3,
)

sel.fit(X_train_trans, ytrain)

In [17]:
sel.correlated_feature_sets_

[{'Category_TRAVEL_AND_LOCAL', 'Genres_Travel & Local'},
 {'Category_VIDEO_PLAYERS', 'Genres_Video Players & Editors'},
 {'Category_TOOLS', 'Genres_Tools'},
 {'Category_LIFESTYLE', 'Genres_Lifestyle'},
 {'Category_SPORTS', 'Genres_Sports'},
 {'Type_Free', 'Type_Paid'}]

In [18]:
# let's examine the variance of the features from the second group of
# correlated ones

group = sel.correlated_feature_sets_[0]

X_train_trans[group].std()

  X_train_trans[group].std()


Category_TRAVEL_AND_LOCAL    0.153370
Genres_Travel & Local        0.153012
dtype: float64

In [19]:
# let's examine the variance of the features from the second group of
# correlated ones

group = sel.correlated_feature_sets_[1]

X_train_trans[group].std()

  X_train_trans[group].std()


Genres_Video Players & Editors    0.120133
Category_VIDEO_PLAYERS            0.121061
dtype: float64

In [20]:
X_train_trans.shape

(8672, 61)

In [21]:
X_train_trans_smart= sel.transform(X_train_trans)
X_test_trans_smart= sel.transform(X_test_trans)

In [22]:
X_train_trans_smart.shape

(8672, 55)

## Speicherung der Daten

In [23]:
X_train_trans_smart.to_csv("Xtrain_feature_sel_high_tol.csv",index=False)
X_test_trans_smart.to_csv("Xtest_feature_sel_high_tol.csv",index=False)

Das ganze Skript haben wir zwei Mal durchlaufen lassen, einmal mit der Toleranz 0.95 und einmal mit der Toleranz 1. Die Files heissen "Xtrain_feature_sel_low_tol.csv" bzw. "Xtrain_feature_sel_high_tol.csv"