### Bootstrap

#### Packages

In [12]:
import pandas as pkg_pandas
from sklearn import linear_model as pkg_linear_model
from sklearn import model_selection as pkg_model_selection
from sklearn import preprocessing as pkg_preprocessing
from sklearn import tree as pkg_tree

#### Load Data

In [13]:
# Read the employee-salary sample (path is relative to the notebook's folder)
# and show the frame as the cell's rich output.
loaded_df = pkg_pandas.read_csv(
    filepath_or_buffer="../data/cbex-dstr-employee-salaries.csv",
)
loaded_df

Unnamed: 0,company,job,degree,salary_gt_100k
0,google,sales executive,bachelors,0
1,google,sales executive,masters,0
2,google,business manager,bachelors,1
3,google,business manager,masters,1
4,google,computer programmer,bachelors,0
5,google,computer programmer,masters,1
6,google,customer support,bachelors,0
7,google,customer support,masters,0
8,abc pharma,sales executive,masters,0
9,abc pharma,computer programmer,bachelors,0


### Preprocessing

#### Pre-Work

In [14]:
def label_encode_field(dataset_df, field_name):
    """Replace a column with its integer label-encoded counterpart.

    The values of ``field_name`` are encoded with scikit-learn's
    ``LabelEncoder`` and stored in a new column named ``<field_name>_number``.
    The new column is inserted right after the original one, and the original
    is then dropped, so the encoded column ends up in the original's position.

    Args:
        dataset_df: DataFrame containing ``field_name``. It is left unmodified.
        field_name: Name of the column to encode.

    Returns:
        A new DataFrame in which ``field_name`` is replaced by
        ``<field_name>_number``.

    Raises:
        KeyError: If ``field_name`` is not a column of ``dataset_df``.
    """
    # DataFrame.insert works in place; operate on a copy so the caller's frame
    # (and anything aliasing it) is not changed as a side effect.
    encoded_df = dataset_df.copy()

    encoded_field_index = encoded_df.columns.get_loc(field_name) + 1
    encoded_field_name = '_'.join([field_name, 'number'])
    encoded_field_values = pkg_preprocessing.LabelEncoder().fit_transform(encoded_df[field_name])

    # allow_duplicates=True is kept from the original call: an already existing
    # "<field_name>_number" column does not raise here.
    encoded_df.insert(loc=encoded_field_index, column=encoded_field_name, value=encoded_field_values, allow_duplicates=True)
    return encoded_df.drop(columns=[field_name])

#### Encode Columns with Non-Numerical Values

In [15]:
# Initialize: start from a copy rather than an alias, so that later encoding
# steps cannot modify the raw frame held in loaded_df.
encoded_df = loaded_df.copy()
encoded_df.head(5)

Unnamed: 0,company,job,degree,salary_gt_100k
0,google,sales executive,bachelors,0
1,google,sales executive,masters,0
2,google,business manager,bachelors,1
3,google,business manager,masters,1
4,google,computer programmer,bachelors,0


In [16]:
# Restart from the raw frame so this cell can be re-run on its own (the
# encoded source columns are dropped, so encoding encoded_df twice would fail).
# Taking a copy keeps loaded_df unchanged whatever the encoder does to its input.
encoded_df = loaded_df.copy()
for categorical_field in ('company', 'job', 'degree'):
    encoded_df = label_encode_field(encoded_df, categorical_field)
encoded_df.head(5)

Unnamed: 0,company_number,job_number,degree_number,salary_gt_100k
0,2,3,0,0
1,2,3,1,0
2,2,0,0,1
3,2,0,1,1
4,2,1,0,0


### Analysis

#### Pre-Work

In [17]:
# The analysis starts from the fully encoded frame (same object, no copy made).
baseline_df = encoded_df
baseline_df.head()

Unnamed: 0,company_number,job_number,degree_number,salary_gt_100k
0,2,3,0,0
1,2,3,1,0
2,2,0,0,1
3,2,0,1,1
4,2,1,0,0


In [18]:
# Separate the target column from the feature columns.
output_column_name = 'salary_gt_100k'
baseline_outputs = baseline_df[output_column_name]
baseline_inputs = baseline_df.drop(columns=[output_column_name])

# Hold out 20% of the rows for testing. The seed is fixed so that the split,
# and therefore every score reported below, is the same on each
# Restart & Run All.
train_inputs, test_inputs, train_outputs, test_outputs  = \
    pkg_model_selection.train_test_split(
        baseline_inputs, baseline_outputs, test_size=0.20, random_state=42)

print("=== Baseline Split - Train and Test ===")
print("Lengths: Baseline = {}, Train = {}, Test = {}".format(len(baseline_inputs), len(train_inputs), len(test_inputs)))

=== Baseline Split - Train and Test ===
Lengths: Baseline = 29, Train = 23, Test = 6


#### Model: Train (Fit)

In [19]:
# Fixed seed: the tree breaks ties between equally good splits at random, so
# without it the fitted model can differ between runs on the same data.
model = pkg_tree.DecisionTreeClassifier(random_state=42)
model.fit(X=train_inputs, y=train_outputs)
model

In [20]:
# Mean accuracy on the full data set, the training split and the held-out
# split, evaluated in that order.
baseline_score, train_score, test_score = (
    model.score(inputs, outputs)
    for inputs, outputs in (
        (baseline_inputs, baseline_outputs),
        (train_inputs, train_outputs),
        (test_inputs, test_outputs),
    )
)
print(
    "Score: Baseline = {}, Train = {}, Test = {}".format(
        baseline_score, train_score, test_score
    )
)

Score: Baseline = 0.9310344827586207, Train = 1.0, Test = 0.6666666666666666


#### Model: Test (Predict)

In [21]:
# Predict the held-out rows and print the predictions next to the true labels.
predicted_outputs = model.predict(X=test_inputs)
print("= = : Test Results : = =")
print(
    "Predicted = {}\nActual    = {}".format(
        predicted_outputs,
        test_outputs.values,
    )
)

= = : Test Results : = =
Predicted = [0 1 1 1 0 1]
Actual    = [1 1 0 1 0 1]


In [22]:
# Probe the model with a few hand-made rows where every feature has the same
# encoded value i.
for i in range(3):
    # Wrap the row in a DataFrame carrying the training column names: the model
    # was fitted on a DataFrame, and newer scikit-learn versions warn when
    # predict() receives input without matching feature names.
    probe_inputs = pkg_pandas.DataFrame([[i, i, i]], columns=train_inputs.columns)
    predicted = model.predict(probe_inputs)
    print("Test:: Input = {}, Predicted = {}".format([i, i, i], predicted))

Test:: Input = [0, 0, 0], Predicted = [1]
Test:: Input = [1, 1, 1], Predicted = [1]
Test:: Input = [2, 2, 2], Predicted = [0]


