Load Iris database and split training/test set

In [1]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
iris = load_iris()
X=iris.data
y=iris.target
X_train, y_train, X_test, y_test = X[:120], y[:120], X[120:], y[120:]
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((120, 4), (120,), (30, 4), (30,))

Training a decision tree model

In [2]:
tree_clf = DecisionTreeClassifier(max_depth=2)
tree_clf.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=2)

Visiualizing the decision tree (I was having trouble converting the graph into a png)

In [3]:
from sklearn.tree import export_graphviz
export_graphviz(tree_clf, 
                out_file='../hw6/iris_tree.dot', 
                feature_names=iris.feature_names[0:],
                class_names=iris.target_names,
                rounded=True,
                filled=True)

Load student scores database and preparing the data

In [7]:
from zlib import crc32
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

dataset = pd.read_csv('../hw6/hw2_student_scores.csv')
def test_set_check(identifier, test_ratio=0.2):
    total_size = 2**32
    hex_repr = crc32(np.int64(identifier)) & 0xffffffff
    in_test = hex_repr < (test_ratio * total_size)
    return in_test
dataset_with_id = dataset.reset_index()
def split_train_test_by_id(data, test_ratio, id_column):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio))
    return data.loc[~in_test_set], data.loc[in_test_set]
train_set, test_set = split_train_test_by_id(data=dataset_with_id, test_ratio=0.2, id_column="index")
train_set, test_set = train_test_split(dataset, test_size=0.2, random_state=42)
dataset['scores_cat'] = pd.cut(x=dataset['Scores'], bins=[0, 1.5, 3, 4.5, 6, np.inf], labels=[1, 2, 3, 4, 5])
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(X=dataset, y=dataset['scores_cat']):
    strat_train_set = dataset.loc[train_index]
    strat_test_set = dataset.loc[test_index]
dataset = strat_train_set.drop("Scores", axis=1)
dataset_labels = strat_train_set['Scores'].copy()
for set_ in (strat_train_set, strat_test_set):
    set_.drop('scores_cat', axis=1, inplace=True)  
dataset_num = dataset.drop("Gender", axis=1)
imputer = SimpleImputer(strategy='median')
imputer.fit(dataset_num)
X = imputer.transform(dataset_num)
dataset_tr = pd.DataFrame(data=X, index=dataset_num.index, columns=dataset_num.columns)
dataset_cat = dataset[['Gender']]
ordinal_encoder = OrdinalEncoder()

dataset_cat_encoded = ordinal_encoder.fit_transform(dataset_cat.values)
one_hot_encoder = OneHotEncoder()
dataset_cat_1hot = one_hot_encoder.fit_transform(dataset_cat.values)

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler())
])

num_attribs = dataset_num.columns.tolist()
cat_attribs = ["Gender"]
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs)
])
dataset_prepared = full_pipeline.fit_transform(dataset)
dataset_prepared.shape, dataset_labels.shape
X_train, X_val, y_train, y_val = train_test_split(dataset_prepared, dataset_labels)


Training a hard voting ensemble learner model

In [12]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

log_clf = LogisticRegression(solver='lbfgs')
rf_clf = RandomForestClassifier(n_estimators=100)
svm_clf = SVC(gamma='scale')
voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('rf', rf_clf), ('svc', svm_clf)], voting='hard')
voting_clf.fit(dataset_prepared, dataset_labels)


VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()), ('svc', SVC())])

Evaluating the model (I'm not sure what mistakes I made here but the evaluations don't look correct)

In [14]:
for clf in [log_clf, rf_clf, svm_clf, voting_clf]:
    clf.fit(dataset_prepared, dataset_labels)
    y_hat = clf.predict(X_val)
    print(clf.__class__.__name__, accuracy_score(y_val, y_hat))

LogisticRegression 0.0
RandomForestClassifier 0.8
SVC 0.0
VotingClassifier 0.0


Training a bagging ensemble learner model

In [15]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
X_train, X_val, y_train, y_val = train_test_split(dataset_prepared, dataset_labels)
bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500, max_samples=10, bootstrap=True, n_jobs=-1)
bag_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=10,
                  n_estimators=500, n_jobs=-1)

Evaluating the model

In [16]:
y_hat = bag_clf.predict(X_val)
y_hat

array([30, 30, 30, 75, 21], dtype=int64)

Training a gradient boost ensemble learner model

In [17]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
X_train, X_val, y_train, y_val = train_test_split(dataset_prepared, dataset_labels)
gbrt.fit(X_train, y_train)


GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=3)