### Install dependencies

In [None]:
# same deal for gdrive and kaggle
from google.colab import drive
drive.mount('/content/drive')

!rm -r ~/.kaggle
!mkdir ~/.kaggle
!cp /content/drive/MyDrive/.kaggle/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!pip install -q kaggle


Mounted at /content/drive
rm: cannot remove '/root/.kaggle': No such file or directory


In [None]:
# install dataset and unzip

!rm -r dataset

!kaggle datasets download -d saumitgp/occupancy-detection-dataset
!mkdir dataset
!unzip occupancy-detection-dataset.zip -d dataset

rm: cannot remove 'dataset': No such file or directory
Downloading occupancy-detection-dataset.zip to /content
  0% 0.00/213k [00:00<?, ?B/s]
100% 213k/213k [00:00<00:00, 51.3MB/s]
Archive:  occupancy-detection-dataset.zip
  inflating: dataset/OccupancyData/DataTest.csv  
  inflating: dataset/OccupancyData/DataTraining.csv  


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# the balance is weird, I will merge the train and test files first and split them to 8:2
df = pd.concat([
        pd.read_csv('dataset/OccupancyData/DataTraining.csv',index_col=[0]),
        pd.read_csv('dataset/OccupancyData/DataTest.csv',index_col=[0])
    ])
raw_vals = df.drop(columns=['date']).values
X,y = raw_vals[:,:-1],raw_vals[:,-1]
# pad X with 1 in the beginning, this will serve as our bias term
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.2,random_state=0)

# decision trees don't need scaling! :)

### Task 1

Implement a Decision Tree Classifier for your classification problem. You may use a built-in package to implement your classifier. Try modifying one or more of the input parameters and describe what changes you notice in your results. Clearly describe how these factors are affecting your output.


I tried modifying some leaf conditions, complexity and depth of the tree. However, all the trees returned very simlar performance. I believe this was due to the dataset being too learnable.

In [None]:
from sklearn.tree import DecisionTreeClassifier

results = []
# maximum depth of the tree
for d in [2,3,4,5,6]:
    for l in [1,.2,.1]:
    # ccp_alpha, this is basically the complexity threshold
        for alpha in [0,.1,.3]:
            params = {
                'random_state': 0,
                'max_depth': d,
                'ccp_alpha': alpha,
                'min_samples_leaf':l
            }
            tree = DecisionTreeClassifier(**params)

            tree.fit(X_train,y_train)

            results.append({
                **params,
                'test_score': tree.score(X_test,y_test),
                'train_score':tree.score(X_train,y_train),
            })

results = pd.DataFrame(results)

results

Unnamed: 0,random_state,max_depth,ccp_alpha,min_samples_leaf,test_score,train_score
0,0,2,0.0,1.0,0.987427,0.992526
1,0,2,0.1,1.0,0.987147,0.991897
2,0,2,0.3,1.0,0.987147,0.991897
3,0,2,0.0,0.2,0.987147,0.991897
4,0,2,0.1,0.2,0.987147,0.991897
5,0,2,0.3,0.2,0.987147,0.991897
6,0,2,0.0,0.1,0.987147,0.991897
7,0,2,0.1,0.1,0.987147,0.991897
8,0,2,0.3,0.1,0.987147,0.991897
9,0,3,0.0,1.0,0.987427,0.992596


### Task 2

From the Bagging and Boosting ensemble methods pick any one algorithm 
from each category. Implement both the algorithms using the same data. Use k-fold cross 
validation to find the effectiveness of both the models. Comment on the difference/similarity of 
the results.

In [104]:
from tqdm.auto import tqdm
from sklearn.metrics import f1_score

def k_fold_validation(clf,X_train,y_train,n_folds=10,eval='f1'):
    idxs = np.arange(X_train.shape[0])
    np.random.shuffle(idxs)
    folds = np.array_split(idxs,n_folds)
    all_scos = []
    for i in tqdm(range(n_folds),desc=f'{n_folds} folds validation...'):
        holdout = folds[i]
        rest = np.hstack([f for j,f in enumerate(folds) if not j == i])
        clf.fit(X_train[rest],y_train[rest])
        if eval == 'accuracy':
            sco = clf.score(X_train[holdout],y_train[holdout])
        elif eval == 'f1':
            sco = f1_score(y_train[holdout],clf.predict(X_train[holdout]))
        all_scos.append(sco)
    return all_scos  


In [108]:
# bagging
# n_bags = 50
class BaggedTree:
    def __init__(self,n_bags):
        self.n_bags = n_bags
        self.bagged_models = []
    def fit(self,X_train,y_train):
        idxs = np.arange(X_train.shape[0])
        for _ in range(self.n_bags):
            subset = np.random.choice(idxs,X_train.shape[0] // self.n_bags )
            X_sub,y_sub = X_train[subset],y_train[subset]
            self.bagged_models.append(DecisionTreeClassifier().fit(X_sub,y_sub))
        # now testing the mdoel
        return self
    def predict(self,X_test):
        # in this case, since the trees predict between 0 and 1, we split from .5
        return (np.vstack([m.predict(X_test) for m in self.bagged_models]).mean(0) > .5).astype(int)
        # accuracy = (preds == y_test).mean()
    def score(self,X_test,y_test):
        return (self.predict(X_test) == y_test).mean()


In [117]:
# Adaboost implementation
# this one takes a bit longer

class AdaBoost:
    def __init__(self,n_models):
        self.n_models = n_models
        self.boosted_models = []
        self.alphas = []
    def fit(self,X_train,y_train,n_iter=100):
        weights = np.ones_like(y_train) / y_train.shape[0]

        for _ in tqdm(range(n_iter),desc='AdaBoost training...'):
            for _ in range(self.n_models):
                m = DecisionTreeClassifier(max_depth=1).fit(X_train,y_train,sample_weight=weights)
                y_pred = m.predict(X_train)
                # identify missed examples
                misses = (y_train != y_pred).astype(int)
                # calculate overall err
                err = (weights * misses).sum() / weights.sum()
                # reweight
                alpha = np.log((1-err)/err)
                weights *= np.exp(alpha * misses)
                self.alphas.append(alpha)
                self.boosted_models.append(m)
                # now the models should be good to go.. 
        return self
    def predict(self,X_test):
        return (
            # sin to limit bounds to -1,1 and then convert below 0 to 0. 
            np.sin(
                # stack each prediction and sum for each example
                np.vstack([
                    a*m.predict(X_test) 
                    for a,m in zip(self.alphas,self.boosted_models)
                ]).sum(0)
            ) > 0
        ).astype(int)
    def score(self,X_test,y_test):
        return (self.predict(X_test) == y_test).mean()
 


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

fig,axes = plt.subplots(nrows=2, ncols = 2,figsize =(12,12))

bagged_acc_avgs = [
    np.mean(k_fold_validation(BaggedTree(50),X_train,y_train,eval='accuracy'))
    for n in [10,20,50,100,200]
]

print(bagged_acc_avgs)
# bagging is doing a bit too well and we can't tell. so setting the lims between .98 and 1
sns.lineplot(y=bagged_acc_avgs,x=[10,20,50,100,200], ax=axes[0,0]).set(ylim=(.98,1),title='Bagged Trees accuracy')

ada_acc_avgs = [
    np.mean(k_fold_validation(AdaBoost(50),X_train,y_train,eval='accuracy'))
    for n in [10,20,50,100,200]
]

print(ada_acc_avgs)
sns.lineplot(y=ada_acc_avgs,x=[10,20,50,100,200], ax=axes[0,1]).set(ylim=(.5,1),title='AdaBoost trees accuracy')


bagged_f1_avgs = [
    np.mean(k_fold_validation(BaggedTree(50),X_train,y_train,eval='f1'))
    for n in [10,20,50,100,200]
]

print(bagged_f1_avgs)
sns.lineplot(y=bagged_acc_avgs,x=[10,20,50,100,200], ax=axes[1,0]).set(ylim=(.98,1),title='Bagged Trees F1')

ada_f1_avgs = [
    np.mean(k_fold_validation(AdaBoost(50),X_train,y_train,eval='f1'))
    for n in [10,20,50,100,200]
]

print(ada_f1_avgs)
sns.lineplot(y=ada_acc_avgs,x=[10,20,50,100,200], ax=axes[1,1]).set(ylim=(.5,1),title='AdaBoost trees F1')

10 folds validation...:   0%|          | 0/10 [00:00<?, ?it/s]

10 folds validation...:   0%|          | 0/10 [00:00<?, ?it/s]

10 folds validation...:   0%|          | 0/10 [00:00<?, ?it/s]

10 folds validation...:   0%|          | 0/10 [00:00<?, ?it/s]

10 folds validation...:   0%|          | 0/10 [00:00<?, ?it/s]

[0.9918269249538355, 0.9918270713530015, 0.9918273641513338, 0.991827656949666, 0.9918270713530017]


10 folds validation...:   0%|          | 0/10 [00:00<?, ?it/s]

AdaBoost training...:   0%|          | 0/100 [00:00<?, ?it/s]

In [114]:
print(bagged_acc_avgs)


[0.9918271689524456, 0.9918272665518899, 0.9918270225532796, 0.9918275593502219, 0.9917569949521567]


### Task 3

Compare the effectiveness of the three models implemented above. Clearly 
describe the metric you are using for comparison. Describe (with examples) Why is this 
metric(metrics) suited/appropriate for the problem at hand? How would a choice of a different 
metric impact your results? Can you demonstrate that.

