## Read datasets

In [60]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

artificial

In [69]:
def read_artificial_data(dir_path="."):
    art_train_data = pd.read_csv(dir_path+'/artificial_train.data', sep=' ', header=None)
    art_train_data = art_train_data.drop(art_train_data.columns[500], axis=1)
    art_train_labels = pd.read_csv(dir_path+'/artificial_train.labels', sep=' ', header=None)
    art_train_X, art_val_X, art_train_y, art_val_y = train_test_split(art_train_data, art_train_labels, test_size=0.2, random_state=42)
    art_test_data = pd.read_csv(dir_path+'/artificial_valid.data', sep=' ', header=None)
    art_test_data = art_test_data.drop(art_test_data.columns[500], axis=1)
    minmax=MinMaxScaler()
    art_train_X=pd.DataFrame(minmax.fit_transform(art_train_X))
    art_val_X=pd.DataFrame(minmax.transform(art_val_X))
    art_test_data=pd.DataFrame(minmax.transform(art_test_data))
    return art_train_X, art_train_y, art_val_X, art_val_y, art_test_data

In [70]:
art_train_X, art_train_y, art_val_X, art_val_y, art_test_data = read_artificial_data()
art_train_X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,0.780488,0.43379,0.320423,0.138462,0.451505,0.456522,0.454902,0.6,0.745283,0.583333,...,0.620155,0.275862,0.805714,0.169666,0.477733,0.454148,0.744186,0.52,0.522222,0.322222
1,0.658537,0.3379,0.383803,0.723077,0.571906,0.304348,0.733333,0.4,0.490566,0.479167,...,0.573643,0.448276,0.44,0.613111,0.611336,0.406114,0.27907,0.7,0.640741,0.511111
2,0.463415,0.392694,0.799296,0.384615,0.327759,0.521739,0.470588,0.4,0.632075,0.729167,...,0.767442,0.344828,0.64,0.520566,0.441296,0.462882,0.488372,0.69,0.462963,0.483333
3,0.439024,0.360731,0.661972,0.353846,0.615385,0.23913,0.709804,0.6,0.566038,0.520833,...,0.542636,0.482759,0.445714,0.290488,0.510121,0.576419,0.465116,0.53,0.551852,0.45
4,0.365854,0.488584,0.429577,0.646154,0.304348,0.434783,0.392157,0.5,0.188679,0.375,...,0.767442,0.448276,0.565714,0.737789,0.91498,0.441048,0.674419,0.51,0.403704,0.488889


spam
with use of [repo](https://github.com/edumunozsala/Intro-NLP-Text-Classification/blob/master/Intro_NLP_1_TFIDF_Text_Classification.ipynb)

In [63]:
def read_spam(dir_path='.'):
    spam_train = pd.read_csv(dir_path+'/sms_train.csv', sep=',')
    spam_test_data = pd.read_csv(dir_path+'/sms_test.csv', sep=',')
    spam_train_data, spam_train_labels = spam_train['message'], spam_train['label']
    spam_train_X, spam_val_X, spam_train_y, spam_val_y = train_test_split(spam_train_data, spam_train_labels, test_size=0.2, random_state=42)

    tfidf = TfidfVectorizer()
    train = tfidf.fit_transform(spam_train_X)
    spam_train_X = pd.DataFrame(train.toarray(), columns=tfidf.get_feature_names())

    val = tfidf.transform(spam_val_X)
    spam_val_X = pd.DataFrame(val.toarray(), columns=tfidf.get_feature_names())

    test = tfidf.transform(spam_test_data['message'])
    spam_test_data = pd.DataFrame(test.toarray(), columns=tfidf.get_feature_names())

    minmax=MinMaxScaler()
    spam_train_X=pd.DataFrame(minmax.fit_transform(spam_train_X))
    spam_val_X=pd.DataFrame(minmax.transform(spam_val_X))
    spam_test_data=minmax.transform(spam_test_data)
    return spam_train_X, spam_train_y, spam_val_X, spam_val_y, spam_test_data

In [64]:
spam_train_X, spam_train_y, spam_val_X, spam_val_y, spam_test_data = read_spam()

## Feature selection methods
1. Boruta
2. Chi-squared
3. Recursive Freature Elimination

In [65]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import numpy as np

Boruta

In [71]:
def boruta_select_features(X, y):
    rf = RandomForestClassifier()
    rf.fit(X, y)
    features_selection = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)
    np_y = np.array(y)
    np_y = np_y.reshape(np_y.shape[0],)
    np_X = np.array(X)
    features_selection.fit(np_X, np_y)
    features_imp_boruta_1=features_selection.support_
    features_imp_boruta_1=features_imp_boruta_1.astype(int)
    return features_imp_boruta_1

In [88]:
def change_data_with_selected_features(X, features, boruta=False):
    if boruta:
        features=np.where(features>0)
        features=features[0].tolist()
    X=X[features]
    return X

In [72]:
%%capture
art_train_X_features=boruta_select_features(art_train_X, art_train_y)

In [85]:
art_train_X_boruta2=change_data_with_selected_features(art_train_X, art_train_X_features, boruta=True)
art_val_X_boruta2=change_data_with_selected_features(art_val_X, art_train_X_features, boruta=True)
art_test_data_boruta2=change_data_with_selected_features(art_test_data, art_train_X_features, boruta=True)

In [73]:
%%capture
spam_train_X_features=boruta_select_features(spam_train_X, spam_train_y)

KeyboardInterrupt: 

In [None]:
spam_train_X_boruta2=change_data_with_selected_features(spam_train_X, spam_train_X_features)
spam_val_X_boruta2=change_data_with_selected_features(spam_val_X, spam_train_X_features)
spam_test_data_boruta2=change_data_with_selected_features(spam_test_data, spam_train_X_features)

Chi-squared

In [81]:
def chi2_select_features(X,y, num_feats):
    chi_selector = SelectKBest(chi2, k=num_feats)
    chi_selector.fit(X, y)
    chi_selected_features=chi_selector.get_support()
    chi_selected_features = X.loc[:,chi_selected_features].columns.tolist()
    return chi_selected_features

In [89]:
art_train_X_chi2_features=chi2_select_features(art_train_X, art_train_y, 10)
art_train_X_chi2=change_data_with_selected_features(art_train_X, art_train_X_chi2_features)
art_val_X_chi2=change_data_with_selected_features(art_val_X, art_train_X_chi2_features)
art_test_data_chi2=change_data_with_selected_features(art_test_data, art_train_X_chi2_features)

In [91]:
spam_train_X_chi2_features=chi2_select_features(spam_train_X, spam_train_y, 100)
spam_train_X_chi2=change_data_with_selected_features(spam_train_X, spam_train_X_chi2_features)
spam_val_X_chi2=change_data_with_selected_features(spam_val_X, spam_train_X_chi2_features)
spam_test_data_chi2=change_data_with_selected_features(spam_test_data, spam_train_X_chi2_features)

IndexError: index 1008 is out of bounds for axis 0 with size 1000

Recursive feature elimination

In [None]:
def rfe_select_features(X, y, num_feats):
    rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=num_feats, step=10, verbose=5)
    rfe_selector.fit(X, y)
    rfe_support = rfe_selector.get_support()
    rfe_feature = X.loc[:,rfe_support].columns.tolist()
    return rfe_feature

In [None]:
art_train_X_rfe_features=rfe_select_features(art_train_X, art_train_y, 10)
art_train_X_rfe=change_data_with_selected_features(art_train_X, art_train_X_rfe_features)
art_val_X_rfe=change_data_with_selected_features(art_val_X, art_train_X_rfe_features)
art_test_data_rfe=change_data_with_selected_features(art_test_data, art_train_X_rfe_features)

In [None]:
spam_train_X_rfe_features=rfe_select_features(spam_train_X, spam_train_y, 100)
spam_train_X_rfe=change_data_with_selected_features(spam_train_X, spam_train_X_rfe_features)
spam_val_X_rfe=change_data_with_selected_features(spam_val_X, spam_train_X_rfe_features)
spam_test_data_rfe=change_data_with_selected_features(spam_test_data, spam_train_X_rfe_features)