In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.tree import DecisionTreeClassifier as DTC

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.impute import SimpleImputer, KNNImputer

%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from utils import get_data, plot_cm

%matplotlib inline
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 500)

In [None]:
XY_train, X_test, inverse_target_map = get_data(min_size=None, min_size_test=None, fill_nan=-10)
train_columns = list(XY_train.columns)
train_columns.remove("TARGET_NUM")

In [None]:
df_result = pd.DataFrame({'A': []})

## TSNE

In [None]:
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_results = tsne.fit_transform(XY_train[train_columns].values)

df_result['tsne-2d-one'] = tsne_results[:, 0]
df_result['tsne-2d-two'] = tsne_results[:, 1] 

In [None]:
plt.figure(figsize=(16,10))
sns.scatterplot(
    x="tsne-2d-one", y="tsne-2d-two",
    hue="y",
    palette=sns.color_palette("hls", 10),
    data=df_result,
    legend="full",
    alpha=0.3
)

## PCA

In [None]:
pca = PCA(n_components=3)
pca_result = pca.fit_transform(XY_train[train_columns].values)

df_result['pca-one'] = pca_result[:, 0]
df_result['pca-two'] = pca_result[:, 1] 
df_result['pca-three'] = pca_result[:, 2]
df_result['y'] = XY_train["TARGET_NUM"]

In [None]:
df_result['y'] = XY_train["TARGET_NUM"]

In [None]:
plt.figure(figsize=(16,10))
sns.scatterplot(
    x="pca-one", y="pca-two",
    hue="y",
    palette=sns.color_palette("hls", 19),
    data=df_result,
    legend="full",
    alpha=0.3
)

In [None]:
plt.figure(figsize=(16,10))
sns.scatterplot(
    x="pca-one", y="pca-three",
    hue="y",
    palette=sns.color_palette("hls", 19),
    data=df_result,
    legend="full",
    alpha=0.3
)

In [None]:
plt.figure(figsize=(16,10))
sns.scatterplot(
    x="pca-three", y="pca-two",
    hue="y",
    palette=sns.color_palette("hls", 19),
    data=df_result,
    legend="full",
    alpha=0.3
)

In [None]:
ax = plt.figure(figsize=(16,10)).gca(projection='3d')
ax.scatter(
    xs=df_result["pca-one"], 
    ys=df_result["pca-two"], 
    zs=df_result["pca-three"], 
    c=XY_train["TARGET_NUM"],
    cmap='tab10'
)
ax.set_xlabel('pca-one')
ax.set_ylabel('pca-two')
ax.set_zlabel('pca-three')
plt.show()

## SelectFromModel
https://towardsdatascience.com/5-feature-selection-method-from-scikit-learn-you-should-know-ed4d116e4172
https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html

In [6]:
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC, SVC

In [None]:
svm = LinearSVC(verbose=1).fit(XY_train[train_columns].values, XY_train["TARGET_NUM"].values)
selector = SelectFromModel(estimator=svm, prefit=True)

In [None]:
selector.estimator_.coef_

In [None]:
selector.get_support()

In [2]:
XY_train, X_test_ori, inverse_target_map = get_data(min_size=None, min_size_test=None, nan_thresh=10, fill_nan=None)

train_columns = list(XY_train.columns)
train_columns.remove("TARGET_NUM")

y_train_ori = XY_train["TARGET_NUM"].values
x_train = XY_train[train_columns].values

min_max_scaler = MinMaxScaler(feature_range=(0, 10)).fit(x_train)
x_train = min_max_scaler.transform(x_train)

imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=-1).fit(x_train)
x_train = imp.transform(x_train)

#x_train = filter_columns(x_train)

result = [x_train]
for i in range(2, 5):
    result.append(np.power(x_train, i))
    
x_train = np.concatenate(result, axis=1)

In [None]:
base_estimator = DTC(max_depth=15, random_state=42).fit(x_train, y_train_ori)
selector = SelectFromModel(estimator=base_estimator, prefit=True)

In [None]:
selector.get_support()

In [None]:
x_train_select = selector.transform(x_train)

In [None]:
dtc = DTC(max_depth=15, random_state=42).fit(x_train_select, y_train_ori)
print(dtc.score(x_train_select, y_train_ori))
plot_cm(dtc, x_train_select, y_train_ori)

In [None]:
x_train_full.shape

In [None]:
x_train_full[:, selector.get_support()].shape

In [7]:
XY_train, X_test, inverse_target_map = get_data(min_size=None, min_size_test=None, fill_nan=None)
train_columns = list(XY_train.columns)
train_columns.remove("TARGET_NUM")

min_max_scaler = MinMaxScaler(feature_range=(2, 10))
X_train_minmax = min_max_scaler.fit(XY_train[train_columns])
x_train = X_train_minmax.transform(XY_train[train_columns])

imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=1)
imp_train = imp.fit(x_train)
x_train_full = imp_train.transform(x_train)

x_train_full_df = pd.DataFrame(x_train_full, columns=train_columns, index=XY_train.index)

min_size = 150

for c in x_train_full_df.columns:
    if c != "TARGET_NUM":
        x_train_full_df[c][x_train_full_df.groupby(c)[c].transform('size') <= min_size] = 0

stand_scaler = StandardScaler()
X_train_stand = stand_scaler.fit(x_train_full_df[train_columns])
x_train_stand = X_train_stand.transform(x_train_full_df[train_columns])

In [None]:
svm = LinearSVC(verbose=1).fit(x_train_stand, XY_train["TARGET_NUM"].values)
selector = SelectFromModel(estimator=svm, prefit=True)

[LibLinear]....................................................................................................
optimization finished, #iter = 1000

Using -s 2 may be faster (also see FAQ)

Objective value = -482083.732015
nSV = 3603355
....................................................................................................
optimization finished, #iter = 1000

Using -s 2 may be faster (also see FAQ)

Objective value = -1763482.586410
nSV = 3412099
....................................................................................................
optimization finished, #iter = 1000

Using -s 2 may be faster (also see FAQ)

Objective value = -948385.127083
nSV = 3677469
....................................................................................................
optimization finished, #iter = 1000

Using -s 2 may be faster (also see FAQ)

Objective value = -652039.751042
nSV = 3671367
.......

In [None]:
svm = SVC(kernel="rbf", verbose=1).fit(x_train_stand, XY_train["TARGET_NUM"].values)
selector = SelectFromModel(estimator=svm, prefit=True)

[LibSVM]............................................................................................................