In [24]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from feature_engine.selection import DropConstantFeatures, DropDuplicateFeatures
from feature_engine.selection import DropCorrelatedFeatures, SmartCorrelatedSelection

In [25]:
# Load the pre-processed (one-hot encoded and scaled) feature matrices and
# the matching target files for the train and test splits.
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in ("Xtrain_k_one_hot_scaled.csv", "Xtest_k_one_hot_scaled.csv")
)
ytrain, ytest = (
    pd.read_csv(csv_path)
    for csv_path in ("ytrain.csv", "ytest.csv")
)

# Report the dimensions of every loaded frame, one line per frame.
print(
    "Shape of X Train: {}".format(X_train.shape),
    "Shape of X Test: {}".format(X_test.shape),
    "Shape of y Train: {}".format(ytrain.shape),
    "Shape of y Test: {}".format(ytest.shape),
    sep="\n",
)

Shape of X Train: (8672, 76)
Shape of X Test: (2168, 76)
Shape of y Train: (8672, 1)
Shape of y Test: (2168, 1)


In [26]:
# Display the training feature matrix loaded above as a quick visual check
# of the columns and value ranges (the notebook shows a truncated view).
X_train

Unnamed: 0,Reviews,Rating,days_since_update,Price,Category_TRAVEL_AND_LOCAL,Category_VIDEO_PLAYERS,Category_FINANCE,Category_FAMILY,Category_MEDICAL,Category_GAME,...,Genres_Productivity,Genres_Books & Reference,Genres_Shopping,Genres_Puzzle,Genres_Casual,Genres_News & Magazines,Genres_Sports,Genres_Action,Genres_Simulation,Genres_Food & Drink
0,-0.144827,0.403271,-0.603588,0.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-0.150937,-1.053274,0.637297,0.0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-0.151044,1.651738,3.887354,0.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,-0.150943,1.027505,-0.613697,0.0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,-0.122954,0.611349,-0.570733,0.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8667,-0.017494,0.611349,-0.633915,0.0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8668,-0.141238,-0.220963,0.619606,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8669,-0.012583,0.195193,-0.651606,0.0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8670,-0.150341,1.027505,-0.454479,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Constant and Quasi-constant features
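Mithilfe des untenstehenden Befehls löschen wir konstante und quasi-konstante Variablen. Mit `tol=0.95` gilt eine Variable als quasi-konstant, wenn ein einzelner Wert in rund 95 % oder mehr der Beobachtungen vorkommt.
Wenn ich nur konstante Variablen löschen möchte, muss ich die Toleranz auf 1 setzen (`tol=1`).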

In [27]:
# Quasi-constant filter: because tol is below 1, quasi-constant features are
# dropped as well; tol=1 would restrict the drop to strictly constant features.
sel = DropConstantFeatures(
    tol=0.95,
    variables=None,
    missing_values='raise',
)

# Learn which features to drop from the training split only.
sel.fit(X_train)

In [28]:
# Number of features the fitted quasi-constant selector marked for removal
# (length of its `features_to_drop_` attribute).

len(sel.features_to_drop_)

55

In [29]:
# Names of the constant / quasi-constant features that the fitted selector
# will remove when `transform` is called.

sel.features_to_drop_

['Category_TRAVEL_AND_LOCAL',
 'Category_VIDEO_PLAYERS',
 'Category_FINANCE',
 'Category_MEDICAL',
 'Category_SOCIAL',
 'Category_PERSONALIZATION',
 'Category_PHOTOGRAPHY',
 'Category_MAPS_AND_NAVIGATION',
 'Category_HEALTH_AND_FITNESS',
 'Category_COMMUNICATION',
 'Category_DATING',
 'Category_LIFESTYLE',
 'Category_BUSINESS',
 'Category_PRODUCTIVITY',
 'Category_BOOKS_AND_REFERENCE',
 'Category_SHOPPING',
 'Category_EDUCATION',
 'Category_NEWS_AND_MAGAZINES',
 'Category_SPORTS',
 'Category_FOOD_AND_DRINK',
 'Category_ENTERTAINMENT',
 'Size_40.1-50MB',
 'Size_90.1-100MB',
 'Size_50.1-60MB',
 'Size_80.1-90MB',
 'Size_60.1-70MB',
 'Size_70.1-80MB',
 'Content Rating_Everyone 10+',
 'Content Rating_Mature 17+',
 'Content Rating_Rare',
 'Genres_Travel & Local',
 'Genres_Video Players & Editors',
 'Genres_Finance',
 'Genres_Role Playing',
 'Genres_Medical',
 'Genres_Arcade',
 'Genres_Social',
 'Genres_Personalization',
 'Genres_Photography',
 'Genres_Maps & Navigation',
 'Genres_Health & Fi

In [30]:
# Remove the quasi-constant features from both splits, using the selector
# that was fitted on the training data.
X_train_trans, X_test_trans = (
    sel.transform(split) for split in (X_train, X_test)
)

# Show the resulting dimensions of both reduced frames.
X_train_trans.shape, X_test_trans.shape

((8672, 21), (2168, 21))

## Duplicates
Doppelte Variablen enthalten die gleiche Information, aus diesem Grund wollen wir sie entfernen. Dies machen wir nun mit folgendem Befehl.

In [31]:
# Configure the duplicate-feature selector.
sel = DropDuplicateFeatures(
    variables=None,
    missing_values='raise',
)

# Search the reduced training split for duplicated features
# (this might take a while).
sel.fit(X_train_trans)

In [32]:
# Groups of duplicated features found by the fitted selector;
# the features within one set are duplicates of each other.

sel.duplicated_feature_sets_

[]

In [33]:
# Features the duplicate selector will drop, taken from the duplicate
# groups shown above.

sel.features_to_drop_

set()

In [34]:
# Strip the duplicated features from both splits. The names are rebound to
# the transformed frames, so later cells keep working on `X_*_trans`.
X_train_trans, X_test_trans = map(sel.transform, (X_train_trans, X_test_trans))

# Show the resulting dimensions of both frames.
X_train_trans.shape, X_test_trans.shape

((8672, 21), (2168, 21))

## Correlation
### Brute force approach

In [35]:
# Configure the correlation filter: Pearson correlation with a threshold of 0.8.
sel = DropCorrelatedFeatures(threshold=0.8, method='pearson', missing_values='ignore')

# Detect the groups of correlated features on the training split.
sel.fit(X_train_trans)

In [36]:
# Groups of mutually correlated features found by the fitted selector;
# each set is one group.

sel.correlated_feature_sets_

[{'Category_TOOLS', 'Genres_Tools'}, {'Type_Free', 'Type_Paid'}]

In [37]:
# The transformer keeps one feature from each correlated group.
# The remaining ones are listed in `features_to_drop_`; count them here.

len(sel.features_to_drop_)

2

In [38]:
# Drop the correlated features from both splits, keeping the results under
# separate names so `X_train_trans` / `X_test_trans` stay available.
X_train_trans_brut, X_test_trans_brut = (
    sel.transform(split) for split in (X_train_trans, X_test_trans)
)

# Compare the shape of the originally loaded training frame with the shape
# after this selection step.
print(
    "Die formate vor der Transformation sind: {}".format(X_train.shape),
    "Die formate nach der Transformation sind: {}".format(X_train_trans_brut.shape),
    sep="\n",
)

Die formate vor der Transformation sind: (8672, 76)
Die formate nach der Transformation sind: (8672, 19)


### Smart Method
Hier verwenden wir nun die smartere Methode: Aus jeder Gruppe korrelierter Variablen wird anhand der Varianz ausgewählt, welche Variable behalten wird.

In [39]:
# Smart correlation selector: finds correlated groups like the brute-force
# approach, but picks the survivor of each group by variance.
sel = SmartCorrelatedSelection(
    # how correlated groups are detected
    method="pearson",
    threshold=0.8,
    variables=None,
    missing_values="raise",
    # how the feature to keep is chosen within a group
    selection_method="variance",
    # NOTE(review): presumably only consulted for model-based selection
    # methods, not for "variance" -- confirm in the feature_engine docs.
    estimator=None,
    scoring="roc_auc",
    cv=3,
)

# Fit on the training split; the target is passed along with the features.
sel.fit(X_train_trans, ytrain)

In [40]:
# Groups of correlated features found by the smart selector; each set is one group.
sel.correlated_feature_sets_

[{'Category_TOOLS', 'Genres_Tools'}, {'Type_Free', 'Type_Paid'}]

In [41]:
# Examine the spread of the features in the FIRST group of correlated
# features. The standard deviation is shown; it is the square root of the
# variance, so it ranks the features in the same order as the variance does.

group = sel.correlated_feature_sets_[0]

# `group` is a set. pandas deprecated set indexers (FutureWarning) and newer
# versions reject them, so select the columns with a list instead. Sorting
# also makes the column order deterministic.
X_train_trans[sorted(group)].std()

  X_train_trans[group].std()


Category_TOOLS    0.270994
Genres_Tools      0.270815
dtype: float64

In [42]:
# Examine the spread of the features in the SECOND group of correlated
# features. The standard deviation is shown; it is the square root of the
# variance, so it ranks the features in the same order as the variance does.

group = sel.correlated_feature_sets_[1]

# `group` is a set. pandas deprecated set indexers (FutureWarning) and newer
# versions reject them, so select the columns with a list instead. Sorting
# also makes the column order deterministic.
X_train_trans[sorted(group)].std()

  X_train_trans[group].std()


Type_Free    0.263704
Type_Paid    0.263704
dtype: float64

In [43]:
# Shape of the training frame before the smart-correlation transform is applied.
X_train_trans.shape

(8672, 21)

In [44]:
# Apply the fitted smart-correlation selector to both splits and keep the
# results under their own names.
X_train_trans_smart, X_test_trans_smart = map(
    sel.transform, (X_train_trans, X_test_trans)
)

In [45]:
# Shape of the training frame after the smart-correlation transform.
X_train_trans_smart.shape

(8672, 19)

## Speicherung der Daten

In [46]:
# Persist the selected feature sets for the train and test splits as CSV
# files, without writing the row index.
X_train_trans_smart.to_csv(path_or_buf="Xtrain_feature_sel.csv", index=False)
X_test_trans_smart.to_csv(path_or_buf="Xtest_feature_sel.csv", index=False)