## Feature Extraction Notebook

In [1]:
import numpy as np
import pandas as pd
import scipy as sp
from sklearn.decomposition import PCA

### Feature Extraction

In [2]:
# Directory holding the input CSVs. The trailing slash matters: file names
# are appended to `path` by plain string concatenation.
path = './ClassifyTripType/'
tr_path, te_path = (path + csv_name for csv_name in ('train.csv', 'test.csv'))

In [3]:
def feature_extract(path):
    """Build one row of features per VisitNumber from a trip CSV.

    Parameters
    ----------
    path : str or file-like
        Passed straight to ``pandas.read_csv``. The file must contain the
        columns ``VisitNumber``, ``Weekday``, ``Upc``, ``ScanCount``,
        ``DepartmentDescription`` and ``FinelineNumber``. ``TripType`` is
        optional.

    Returns
    -------
    Feature : pandas.DataFrame
        Indexed by ``VisitNumber``. Columns are a two-level MultiIndex whose
        first level is one of ``'RI'``, ``'NumDD'``, ``'NumUPC'``, ``'WD'``,
        ``'DD'``, ``'FN'``.
    trainLabel : pandas.DataFrame
        Returned as a second value only when the file has a ``TripType``
        column. Single column ``TripType``, indexed by ``VisitNumber`` and in
        the same row order as ``Feature``.
    """
    # Fixed weekday layout (same alphabetical order the one-hot encoding used
    # before), so every file yields identical WD columns even when some
    # weekday never occurs in it.
    weekdays = ['Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday']

    df = pd.read_csv(path, dtype={'Upc': object})

    # Deal with missing data
    missing_dd = df['DepartmentDescription'].isnull()
    # Rows without a department get their own FinelineNumber sentinel (-2) ...
    df.loc[missing_dd, 'FinelineNumber'] = -2
    df.loc[missing_dd, 'DepartmentDescription'] = 'UNKNOWN'
    # ... while a FinelineNumber missing on a row that has a department is -1.
    # Assign the result back: fillna(inplace=True) on a column selection is
    # not guaranteed to write through to `df`.
    df['FinelineNumber'] = df['FinelineNumber'].fillna(-1)
    # Merge the two spellings of the same department.
    df.loc[df['DepartmentDescription'] == 'MENSWEAR', 'DepartmentDescription'] = 'MENS WEAR'

    # Extract features

    # grouped features
    grouped = df.groupby('VisitNumber')
    # 1 when any line of the visit has a negative ScanCount (a returned product)
    RI = grouped['ScanCount'].min().lt(0).astype(int).rename('ReturnIndicator')
    # number of distinct departments in the visit
    NumDD = grouped['DepartmentDescription'].nunique().rename('CountDD')
    # number of distinct UPCs; dropna=False keeps a missing Upc counted as one
    # value, exactly like len(x.unique()) did
    NumUPC = grouped['Upc'].nunique(dropna=False).rename('CountUPC')

    # weekday: one-hot of the first Weekday seen for each visit
    WD = df[['VisitNumber', 'Weekday']].drop_duplicates(subset='VisitNumber').set_index('VisitNumber')
    WD = pd.get_dummies(WD['Weekday']).reindex(columns=weekdays, fill_value=0).astype(np.uint8)

    # DD_sum: net ScanCount per department; absent combinations become 0
    DD = df.pivot_table(values='ScanCount', index=['VisitNumber'],
                        columns=['DepartmentDescription'], aggfunc='sum').fillna(0)

    # FN_sum: net ScanCount per FinelineNumber; absent combinations become 0
    FN = df.pivot_table(values='ScanCount', index=['VisitNumber'],
                        columns=['FinelineNumber'], aggfunc='sum').fillna(0)
    FN.columns = ['FN' + str(int(x)) for x in FN.columns]

    Feature = pd.concat([RI, NumDD, NumUPC, WD, DD, FN], axis=1,
                        keys=['RI', 'NumDD', 'NumUPC', 'WD', 'DD', 'FN'])

    if 'TripType' in df.columns:
        # Extract training set labels: one per visit (first TripType seen),
        # reordered to Feature's rows so features and labels line up
        # positionally even if the CSV is not sorted by VisitNumber.
        trainLabel = (
            df[['VisitNumber', 'TripType']]
            .drop_duplicates(subset='VisitNumber')
            .set_index('VisitNumber')
            .reindex(Feature.index)
        )
        return Feature, trainLabel
    return Feature

In [4]:
# The training file has a TripType column, so feature_extract returns a
# (features, labels) pair for it.
trainFeature, trainLabel = feature_extract(tr_path)
print('training set done')
# For the test file only the feature frame is unpacked (this assumes it has
# no TripType column; otherwise a tuple would be bound to testFeature).
testFeature = feature_extract(te_path)
print('test set done')

training set done
test set done


In [5]:
# Stack both splits under an outer 'train' / 'test' index key. Columns that
# exist in only one split come out of the concat as NaN and are zero-filled,
# so both splits end up with the same set of columns.
allFeature = pd.concat([trainFeature, testFeature], keys = ['train', 'test']).fillna(0)
# Drop the per-split frames. Note: re-running this cell afterwards requires
# re-running the extraction cell first.
del trainFeature, testFeature
print('concat done')

concat done


In [10]:
# Preview the first rows instead of rendering the whole frame.
allFeature.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,DD,DD,DD,DD,DD,DD,DD,DD,DD,DD,...,NumDD,NumUPC,RI,WD,WD,WD,WD,WD,WD,WD
Unnamed: 0_level_1,DepartmentDescription,1-HR PHOTO,ACCESSORIES,AUTOMOTIVE,BAKERY,BATH AND SHOWER,BEAUTY,BEDDING,BOOKS AND MAGAZINES,BOYS WEAR,BRAS & SHAPEWEAR,...,CountDD,CountUPC,ReturnIndicator,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
Unnamed: 0_level_2,VisitNumber,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
train,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,1,1,1,0,0,0,0,0,0
train,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2,2,0,1,0,0,0,0,0,0
train,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7,21,1,1,0,0,0,0,0,0
train,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2,3,0,1,0,0,0,0,0,0
train,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2,3,0,1,0,0,0,0,0,0
train,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3,4,0,1,0,0,0,0,0,0
train,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,4,7,0,1,0,0,0,0,0,0
train,15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,8,0,1,0,0,0,0,0,0
train,17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2,4,0,1,0,0,0,0,0,0
train,19,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6,8,0,1,0,0,0,0,0,0


In [8]:
# Preview the first rows of the FinelineNumber (FN) feature group.
allFeature['FN'].head(10)

Unnamed: 0_level_0,DepartmentDescription,FN-1,FN-2,FN0,FN1,FN10,FN100,FN1000,FN1001,FN1002,FN1003,...,FN9969,FN9970,FN9971,FN9974,FN9975,FN998,FN9991,FN9997,FN9998,FN9999
Unnamed: 0_level_1,VisitNumber,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
train,5,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
train,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
train,8,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
train,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
train,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
train,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
train,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
train,15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
train,17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
train,19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Preview the first rows of the DepartmentDescription (DD) feature group.
allFeature['DD'].head(10)

Unnamed: 0_level_0,DepartmentDescription,1-HR PHOTO,ACCESSORIES,AUTOMOTIVE,BAKERY,BATH AND SHOWER,BEAUTY,BEDDING,BOOKS AND MAGAZINES,BOYS WEAR,BRAS & SHAPEWEAR,...,SEASONAL,SERVICE DELI,SHEER HOSIERY,SHOES,SLEEPWEAR/FOUNDATIONS,SPORTING GOODS,SWIMWEAR/OUTERWEAR,TOYS,UNKNOWN,WIRELESS
Unnamed: 0_level_1,VisitNumber,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
train,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
train,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
train,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
train,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
train,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
train,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
train,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
train,15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
train,17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
train,19,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Save allFeature

In [9]:
def save_feature(allFeature, savePath, labels=None):
    """Write the train/test feature matrices, labels and visit ids to disk.

    Files written (each name is appended to ``savePath``):
    ``train_X.npz`` and ``test_X.npz`` (scipy CSR matrices), ``train_y.npy``,
    ``train_VN.npy`` and ``test_VN.npy``.

    Parameters
    ----------
    allFeature : pandas.DataFrame
        Feature frame whose outer row-index level has the keys ``'train'``
        and ``'test'``.
    savePath : str
        Prefix for every output file; normally a directory path ending with
        a path separator. Its directory part is created if missing.
    labels : pandas.DataFrame or pandas.Series, optional
        Training labels indexed by the same ids as ``allFeature.loc['train']``.
        When omitted, the notebook-level ``trainLabel`` is used, which keeps
        the original two-argument call working.

    Raises
    ------
    ValueError
        If some training row has no label.
    """
    # Local imports: `import scipy as sp` alone does not guarantee that the
    # `sparse` submodule is loaded on every scipy version.
    import os
    from scipy import sparse

    if labels is None:
        labels = trainLabel

    train_part = allFeature.loc['train']
    test_part = allFeature.loc['test']
    train_VN = np.array(train_part.index)
    test_VN = np.array(test_part.index)

    # Align labels to the feature rows by id instead of trusting that both
    # happen to be in the same order.
    aligned = labels.reindex(train_VN)
    if aligned.isnull().values.any():
        raise ValueError('labels are missing for some training rows')
    train_y = aligned.values.reshape((-1,))

    train_X = sparse.csr_matrix(train_part.values)
    test_X = sparse.csr_matrix(test_part.values)

    # Create the target directory up front so the first save cannot fail on
    # a missing folder.
    out_dir = os.path.dirname(savePath)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    sparse.save_npz(savePath + 'train_X.npz', train_X)
    np.save(savePath + 'train_y.npy', train_y)
    sparse.save_npz(savePath + 'test_X.npz', test_X)
    np.save(savePath + 'train_VN.npy', train_VN)
    np.save(savePath + 'test_VN.npy', test_VN)

In [10]:
# Persist the full, unreduced feature set. savePath is used as a plain string
# prefix by save_feature, so the trailing slash is required.
savePath = './origin/'
save_feature(allFeature, savePath)

### Drop Rare FinelineNumber (FN) Columns

In [11]:
# Net ScanCount of every FN column, summed over all rows (train and test).
k = allFeature['FN'].sum(axis=0)
# FN columns whose total is below 200, expressed as full
# ('FN', column) keys of allFeature's two-level column index.
rare_fn = k.loc[k < 200].index
dropedFN = [('FN', fn_name) for fn_name in rare_fn]

In [12]:
# Remove the rare FN columns collected in dropedFN.
allFeature = allFeature.drop(columns=dropedFN)

In [14]:
# Preview the FN feature group after the rare columns were dropped.
allFeature['FN'].head(10)

Unnamed: 0_level_0,DepartmentDescription,FN-1,FN-2,FN0,FN1,FN10,FN100,FN1000,FN1001,FN1002,FN1003,...,FN97,FN9704,FN9705,FN9720,FN9803,FN988,FN9912,FN9920,FN9926,FN9998
Unnamed: 0_level_1,VisitNumber,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
train,5,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
train,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
train,8,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
train,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
train,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
train,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
train,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
train,15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
train,17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
train,19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Persist the feature set after the rare FN columns were removed.
savePath = './droped/'
save_feature(allFeature, savePath)

### PCA for FinelineNumber (given up)

In [12]:
# Number of principal components that replace the FN columns; set to None to
# skip the reduction and keep the FN columns as they are.
FN_PCA = 200

if FN_PCA is not None:
    # random_state pins the result when sklearn picks its randomized SVD
    # solver, so a re-run reproduces the same components.
    # NOTE(review): the PCA is fitted on train and test rows together.
    ReducedFN = PCA(n_components=FN_PCA, random_state=0).fit_transform(allFeature['FN'].values)
    multi = pd.MultiIndex.from_tuples([('ReducedFN', 'RFN' + str(x)) for x in range(FN_PCA)])
    ReducedFN = pd.DataFrame(ReducedFN, index=allFeature.index, columns=multi)
    # Swap the FN group for the reduced one in a single rebinding, so a
    # failure above leaves allFeature untouched.
    allFeature = pd.concat([allFeature.drop('FN', axis=1), ReducedFN], axis=1)

In [16]:
# Persist the feature set in which the FN columns were replaced by their PCA
# projection.
savePath = './reduced/'
save_feature(allFeature, savePath)