In [4]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import time


# In[432]:


# Notebook-wide debug switch, off by default.
# NOTE(review): no cell visible here reads this flag — confirm it is still used.
debug = False


# ### Load Data

# In[473]:


# Read the comma-separated column names from columns.csv. The context manager
# closes the file handle as soon as the contents are read, instead of leaving
# it open for the lifetime of the kernel.
with open('/home/jjohn273/git/ECBDL14-Classification/data/columns.csv', 'r') as col_file:
    columns = col_file.read().strip().split(',')

# Later cells address the label column as 'target', so rename 'class' here.
columns = [col if col != 'class' else 'target' for col in columns]


In [7]:
# List the data directory to show which dataset files are available.
%ls /home/jjohn273/git/ECBDL14-Classification/data

columns.csv                ecbdl14-test.arff.gz   ecbdl14-train.csv.gz
ecbdl14-250k.csv           ecbdl14-test.csv.gz    ecbdl14-train-sample.csv.gz
ecbdl14.onehot.sample.hdf  ecbdl14-train.arff.gz


In [8]:
# Load the training sample (the file has no header row), then label its
# columns with the names read from columns.csv.
sample_csv = '/home/jjohn273/git/ECBDL14-Classification/data/ecbdl14-train-sample.csv.gz'
df = pd.read_csv(sample_csv, low_memory=False, header=None)
df.columns = columns
print(f'Loaded ecbdl14 data with shape {df.shape}')

Loaded ecbdl14 data with shape (3500000, 632)


In [9]:
# Reduce the per-cell NaN mask to one flag: first per column, then overall.
has_missing = df.isna().any().any()
print(f'Are there missing values ? {has_missing}')

Are there missing values ? False


In [10]:
# One-hot encode with pandas. `df` is rebound to the encoded frame, so the
# pre-encoding frame is no longer reachable after this cell. Per the recorded
# outputs, the width grows from 632 to 806 columns.
print('One hot encoding categorical variables')
df = pd.get_dummies(df)

One hot encoding categorical variables


In [11]:
from sklearn.model_selection import train_test_split


# In[481]:

# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split the same from run to run.
print('Making train/test split')
train, test = train_test_split(df, test_size=0.2, random_state=42)
print(f'Train shape {train.shape}', f'Test shape {test.shape}')

Making train/test split
Train shape (2800000, 806) Test shape (700000, 806)


In [12]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold, chi2, SelectKBest
from sklearn.pipeline import Pipeline


# Separate each split into features (everything except the label) and a
# single-column label frame.
label_cols = ['target']

train_x = train.drop(columns=label_cols)
train_y = train.loc[:, label_cols]

test_x = test.drop(columns=label_cols)
test_y = test.loc[:, label_cols]


In [13]:
print('Beginning feature selection')
start = time.time()

# Keep the 200 features that score highest on the chi-squared test against the target.
feature_selector = SelectKBest(chi2, k=200)

# Scale to [0, 1] first (chi2 scoring expects non-negative features, see the
# scikit-learn docs), drop zero-variance columns, then pick the top-k features.
pipeline = Pipeline([
  ('normalize', MinMaxScaler()),
  ('strip_zero_variance', VarianceThreshold()),
  ('feature_selector', feature_selector)])

# Fit on the training split only; the test split is transformed with the
# already-fitted pipeline so no test information leaks into the selection.
train_x_normalized = pipeline.fit_transform(train_x, train_y)
test_x_normalized = pipeline.transform(test_x)

# feature_selector only ever sees the columns that survived VarianceThreshold,
# so its support mask indexes that reduced column set, not train_x.columns.
# Apply the two masks in pipeline order to recover the right column names.
variance_mask = pipeline.named_steps['strip_zero_variance'].get_support()
columns = train_x.columns[variance_mask][feature_selector.get_support()]

# Re-wrap the transformed arrays as DataFrames, keeping the original row index
# so they line up with the label frames when concatenated.
train_x_normalized = pd.DataFrame(train_x_normalized, columns=columns, index=train_x.index)
test_x_normalized = pd.DataFrame(test_x_normalized, columns=columns, index=test_x.index)

train_normalized = pd.concat([train_x_normalized, train_y], axis=1)
test_normalized = pd.concat([test_x_normalized, test_y], axis=1)

end = time.time()

Beginning feature selection


In [14]:
# Elapsed wall-clock time in seconds between the two time.time() calls that
# bracket the feature-selection cell.
print(f'Feature selection completed in {end - start}')

Feature selection completed in 83.05122184753418
