# Categorical Meta Feature
Build a meta feature from the predicted class and the class-1 probability output by a `RandomForestClassifier` fitted on the categorical features, then save it to CSV.

In [1]:
import pandas as pd

folder = 'bosch-production-line-performance/'

# The labels live in the numeric file. Read only that column and collapse the
# one-column frame to a Series. DataFrame.squeeze is used instead of the
# read_csv `squeeze` keyword, which was deprecated in pandas 1.4 and removed
# in pandas 2.0.
response = pd.read_csv(folder + 'train_numeric.csv',
                       usecols = ['Response']).squeeze('columns')

# Lazy reader over the categorical file: 1000 rows per chunk, columns parsed
# with dtype=str.
cat_iter = pd.read_csv(folder + 'train_categorical.csv', dtype = str, 
                        iterator = True, chunksize = 1000)

In [2]:
# Sanity check on the loaded labels: print the length, then show the first rows.
print(response.shape)
response.head()

(1183747,)


0    0
1    0
2    0
3    0
4    0
Name: Response, dtype: int64

In [3]:
# Pull the first 300 chunks off the reader and stack them into a single
# frame; this is the data the model gets fitted on.
chunks = [cat_iter.get_chunk() for _ in range(300)]

cat = pd.concat(chunks, ignore_index = True)

print(cat.shape)
cat.head()

(300000, 2141)


Unnamed: 0,Id,L0_S1_F25,L0_S1_F27,L0_S1_F29,L0_S1_F31,L0_S2_F33,L0_S2_F35,L0_S2_F37,L0_S2_F39,L0_S2_F41,...,L3_S49_F4225,L3_S49_F4227,L3_S49_F4229,L3_S49_F4230,L3_S49_F4232,L3_S49_F4234,L3_S49_F4235,L3_S49_F4237,L3_S49_F4239,L3_S49_F4240
0,4,,,,,,,,,,...,,,,,,,,,,
1,6,,,,,,,,,,...,,,,,,,,,,
2,7,,,,,,,,,,...,,,,,,,,,,
3,9,,,,,,,,,,...,,,,,,,,,,
4,11,,,,,,,,,,...,,,,,,,,,,


In [4]:
#we already did a test in the last notebook, so this one will fit with all
#300 of our selected chunks (300,000 rows) to get a slightly better model.

# errors='ignore' keeps this cell re-runnable: once 'Id' has been dropped, a
# second execution would otherwise raise KeyError.
cat = cat.drop(columns = 'Id', errors = 'ignore')

# Take as many labels as there are feature rows, selected by position.
# NOTE(review): assumes train_numeric.csv and train_categorical.csv list
# their rows in the same order - confirm.
y = response.iloc[: cat.shape[0]]

In [5]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
import category_encoders as ce

# Three stages, chained in order: fill missing values with each column's most
# frequent value, ordinal-encode the result, then fit a random forest.
imputer = SimpleImputer(strategy = 'most_frequent')
encoder = ce.OrdinalEncoder()
forest = RandomForestClassifier(n_jobs = -1, random_state = 42)

pipe = make_pipeline(imputer, encoder, forest)

pipe.fit(cat, y)

Pipeline(memory=None,
         steps=[('simpleimputer',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='most_frequent',
                               verbose=0)),
                ('ordinalencoder',
                 OrdinalEncoder(cols=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                      13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                      23, 24, 25, 26, 27, 28, 29, ...],
                                drop_invariant=False, handle_missing='value',
                                handle_unknown='value...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                       

In [6]:
# Score the whole categorical training file with the fitted pipeline, chunk by
# chunk, collecting the predicted class and the class-1 probability per row.
#
# NOTE(review): the first 300 chunks of this file are the rows the pipeline
# was fitted on, so their scores are in-sample. Consider out-of-fold
# predictions before stacking another model on this feature.

#resetting our chunking iterator
cat_iter = pd.read_csv(folder + 'train_categorical.csv', dtype = str, 
                       iterator = True, chunksize = 1000)

# Column of predict_proba that holds P(Response == 1). Looked up from
# classes_ rather than hard-coding index 1.
pos_col = list(pipe.classes_).index(1)

meta_chunks = []

for chunk in cat_iter:

    # Run the pipeline once per chunk. Calling predict() as well would repeat
    # the same impute / encode / forest pass for no new information.
    proba = pipe.predict_proba(chunk.drop(columns = 'Id'))

    meta_chunks.append(pd.DataFrame({
        # The forest's predict() returns the class with the highest mean
        # probability, so argmax over predict_proba gives the same labels.
        # Revisit if the final estimator is swapped for one where that
        # relationship does not hold.
        'cat_pred': pipe.classes_[proba.argmax(axis = 1)],
        'cat_pred_proba': proba[:, pos_col],
    }))

meta_feature = pd.concat(meta_chunks, ignore_index = True)

print(meta_feature.shape)
meta_feature.head()

(1183747, 2)


Unnamed: 0,cat_pred,cat_pred_proba
0,0,0.004714
1,0,0.004714
2,0,0.004714
3,0,0.004714
4,0,0.004714


In [7]:
import os

# Write the meta feature without the index column. The target directory is
# created first so a fresh checkout does not fail here after the long
# scoring loop has already run.
out_path = 'wrangled-sets/cat_meta.csv'
os.makedirs(os.path.dirname(out_path), exist_ok = True)
meta_feature.to_csv(out_path, index = False)