In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler

### Format inputs -- these cells export the formatted data to `../data/`, so they only need to be run once. Skip to <a href='#here'>here</a> if that has already been done.

In [2]:
# When True, the derived numeric columns are min-max scaled before the data is exported.
scaled = False

In [3]:
# Load the source workbook.
df = pd.read_excel("../data/Clean_Data.xlsx")

# Strip surrounding whitespace from every column name.
df.columns = [name.strip() for name in df.columns]
# fillna returns a new Series rather than modifying the column, so the result
# has to be assigned back or the missing values stay NaN.
df['PROD CAT 4 NAME'] = df['PROD CAT 4 NAME'].fillna("Nonbranded")
# Cells that contain only '-' are treated as zero.
df = df.replace('-', 0)

In [4]:
# Categorical columns and their ordered labels; a label's position in its list
# is the integer it is replaced with when the columns are encoded.
preset_categories = dict((
    ('core item flag', ['N', 'Y']),
    # ('segment', ['Packaging', 'Facility Solutions']),
    ('stocking flag', ['N', 'Y']),
    ('national acct flag', ['N', 'Y']),
    ('PROD CAT 4 NAME', ['Nonbranded', 'Branded']),
    ('saalfeld customer flag', ['N', 'Y']),
))

In [5]:
# Optional step: rescale the derived numeric columns with MinMaxScaler,
# overwriting them in df.
if scaled:
    derived_categories = [
        'qty 6mos',
        'cogs 6mos',
        'Margin %',
        'picks 6mos',
        'net OH',
        'net OH $',
        'DIOH'
    ]

    scaler = MinMaxScaler()

    out = scaler.fit_transform(df[derived_categories])
    # Build the scaled frame on df's own index: column assignment aligns on
    # index labels, so a fresh 0..n-1 index would yield NaN for any row of df
    # whose label falls outside that range.
    df2 = pd.DataFrame(out, columns=derived_categories, index=df.index)

    for column in derived_categories:
        df[column] = df2[column]

In [6]:
# Encode each preset categorical column: every label is replaced, one at a
# time, by its position in that column's label list (e.g. 'N' -> 0, 'Y' -> 1).
for field, options in preset_categories.items():
    for code, label in enumerate(options):
        df[field] = df[field].replace(label, code)

In [7]:
# Persist the formatted frame. index=False keeps the row index out of the
# sheet; otherwise it is written as an extra unnamed column that comes back
# as 'Unnamed: 0' when the file is read again.
df.to_excel('../data/formatted.xlsx', index=False)

### Read formatted spreadsheet, only use columns without text or unique IDs <a id='here'></a>

In [8]:
# Load the previously exported, formatted sheet; this replaces whatever df held before.
df = pd.read_excel("../data/formatted.xlsx") 

In [9]:
# Feature columns passed to the model.
# Currently left out: 'net OH', 'net OH $', 'DIOH', 'sales channel', 'PROD CAT 4 NAME'.
use_cols = [
    'stocking flag',
    'national acct flag',
    'Margin %',
    'qty 6mos',
    'cogs 6mos',
    'picks 6mos',
]

In [10]:
# Feature matrix (the selected columns) and label vector (the core item flag),
# both cast to float32.
data = df[use_cols].astype('float32')
target = df['core item flag'].astype('float32')

### Create model object ```clf``` 

In [11]:
# Random forest limited to depth-2 trees, with a fixed seed so repeated runs
# build the same model.
clf = RandomForestClassifier(
    max_depth=2,
    random_state=0,
)

In [12]:
# Fit on every row of `data`; the trailing ';' keeps the estimator repr out of the cell output.
clf.fit(data, target);

The feature importances vector shows the relative significance of each attribute used in the model.

In [13]:
# Pair each feature name with its importance, ordered from least to most important.
zipped = sorted(
    zip(use_cols, clf.feature_importances_),
    key=lambda pair: pair[1],
)
zipped

[('national acct flag', 0.01596468405282644),
 ('picks 6mos', 0.022389240842908727),
 ('Margin %', 0.08983060278566918),
 ('cogs 6mos', 0.11097187278625781),
 ('stocking flag', 0.3675029696484314),
 ('qty 6mos', 0.39334062988390645)]

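`score` is the model's mean accuracy: the share of items whose predicted core flag matches the core flag Veritiv assigned. Note that it is measured here on the same rows the model was trained on.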

In [14]:
# Mean accuracy on the same rows the model was fit on (no held-out split),
# so treat the number as an optimistic estimate.
clf.score(data, target)

0.6282289969746334