In [6]:
import os
import sys
import numpy as np
import pandas as pd  
from sklearn import tree 
from sklearn.svm import LinearSVC
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import GaussianNB as NB
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

# Easy dataset

In [7]:
categories = ('comp.windows.x', 'rec.sport.hockey')

easy_dataset = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))

tf_vec = CountVectorizer (max_df=500, 
                          min_df=0, 
                          max_features =10000, 
                          ngram_range =(1,1), 
                          stop_words='english')

tf_matrix = tf_vec.fit_transform(easy_dataset.data[:500])  #sparse matrix

print ("the data has {} rows and {} columns ".format(tf_matrix.shape[0], tf_matrix.shape[1]))

full_matrix = pd.DataFrame(tf_matrix.todense(),columns=tf_vec.get_feature_names())

the data has 500 rows and 10000 columns 


In [24]:
# classify data
t=np.asarray(easy_dataset.target[:500])   # true labels

xtrain, xtest, ytrain, ytest = train_test_split(full_matrix.as_matrix(), t, random_state=50) 

## Naiive Bayes

In [36]:
clf = NB()
y_pred = clf.fit(xtrain, ytrain).predict(xtest)
error = (y_pred != ytest).sum()
print ("number of mislabels out of {} points: {}".format(xtest.shape[0],error ))

number of mislabels out of 125 points: 61


## Decision tree

In [37]:
clf = tree.DecisionTreeClassifier()
y_pred = clf.fit(xtrain, ytrain).predict(xtest)
error = (y_pred != ytest).sum()
print ("number of mislabels out of {} points: {}".format(xtest.shape[0],error ))

number of mislabels out of 125 points: 62


# Difficult dataset

In [62]:
categories = ('rec.motorcycles', 'rec.autos')

difficult_dataset = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))

tf_vec = CountVectorizer (max_df=500, 
                          min_df=0, 
                          max_features =10000, 
                          ngram_range =(1,1), 
                          stop_words='english')

tf_matrix = tf_vec.fit_transform(easy_dataset.data[:500])  #sparse matrix

print ("the data has {} rows and {} columns ".format(tf_matrix.shape[0], tf_matrix.shape[1]))

full_matrix = pd.DataFrame(tf_matrix.todense(),columns=tf_vec.get_feature_names())

the data has 500 rows and 10000 columns 


In [64]:
# classify data
t=np.asarray(difficult_dataset.target[:500])   # true labels

xtrain, xtest, ytrain, ytest = train_test_split(full_matrix.as_matrix(), t, random_state=50) 

## Naiive Bayes

In [65]:
clf = NB()
y_pred = clf.fit(xtrain, ytrain).predict(xtest)
error = (y_pred != ytest).sum()
print ("number of mislabels out of {} points: {}".format(xtest.shape[0],error ))

number of mislabels out of 125 points: 62


# Decision tree

In [15]:
clf = tree.DecisionTreeClassifier()
y_pred = clf.fit(xtrain, ytrain).predict(xtest)
error = (y_pred != ytest).sum()
print ("number of mislabels out of {} points: {}".format(xtest.shape[0],error ))

number of mislabels out of 125 points: 68


# Results

From the classification results we can say that it's easier to classfy easy dataset then difficult dataset, as the naming of each dataset implied. Additionally, the Naiive Base classifer did better on this data set than a Decision Tree (but this appears to be purely due to chance, as occasionally the opposite is true...I guess I'm saying that they are fairly equivalent).

## Bagging

In [38]:
# Bagging
bagging = BaggingClassifier(tree.DecisionTreeClassifier())
y_pred = bagging.fit(xtrain, ytrain).predict(xtest)
error = (y_pred != ytest).sum()
print ("number of mislabels out of {} points: {}".format(xtest.shape[0],error ))

number of mislabels out of 125 points: 61


## AdaBoost

In [39]:
clf = AdaBoostClassifier(NB(), algorithm="SAMME", n_estimators=300)
y_pred = clf.fit(xtrain, ytrain).predict(xtest)
error = (y_pred != ytest).sum()
print ("number of mislabels out of {} points: {}".format(xtest.shape[0],error ))

number of mislabels out of 125 points: 62


## Random forest

In [40]:
clf = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=10, max_features='auto')
y_pred = clf.fit(xtrain, ytrain).predict(xtest)
error = (y_pred != ytest).sum()
print ("number of mislabels out of {} points: {}".format(xtest.shape[0],error ))

number of mislabels out of 125 points: 63


# Results

From the above results we can say that Bagging, AdaBoost, Random forest models have pretty close to the same score as Naiive Bayes and Decision tree models; with NB doing marginally better. 

# Multi-label classification
---

In [68]:
categories = ('comp.graphics', 'rec.autos', 'talk.politics.guns', 'soc.religion.christian')

dataset = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))

tf_vec = CountVectorizer (max_df=500, 
                          min_df=0, 
                          max_features =30000, 
                          ngram_range =(1,1), 
                          stop_words='english')

tf_matrix = tf_vec.fit_transform(dataset.data)  #sparse matrix

print ("the data has {} rows and {} columns ".format(tf_matrix.shape[0], tf_matrix.shape[1]))

full_matrix = pd.DataFrame(tf_matrix.todense(),columns=tf_vec.get_feature_names())

the data has 2323 rows and 29847 columns 


In [80]:
# classify data
t=np.asarray(dataset.target)   # true labels

xtrain, xtest, ytrain, ytest = train_test_split(full_matrix.as_matrix(), t, random_state=50) 

## Naiive Bayes

In [81]:
clf = NB()

y_pred = OneVsRestClassifier(clf).fit(xtrain, ytrain).predict(xtest)
error = (y_pred != ytest).sum()
print ("number of mislabels out of {} points: {}".format(xtest.shape[0],error ))

number of mislabels out of 581 points: 125


## Decision tree

In [84]:
clf =  tree.DecisionTreeClassifier()

y_pred = OneVsRestClassifier(clf).fit(xtrain, ytrain).predict(xtest)
error = (y_pred != ytest).sum()
print ("number of mislabels out of {} points: {}".format(xtest.shape[0],error ))

number of mislabels out of 581 points: 126


## SVC

In [82]:
clf =  LinearSVC()

y_pred = OneVsRestClassifier(clf).fit(xtrain, ytrain).predict(xtest)
error = (y_pred != ytest).sum()
print ("number of mislabels out of {} points: {}".format(xtest.shape[0],error ))

number of mislabels out of 581 points: 92


# All-vs-all classification

## Naiive Bayes

In [83]:
clf = NB()

y_pred = OneVsOneClassifier(clf).fit(xtrain, ytrain).predict(xtest)
error = (y_pred != ytest).sum()
print ("number of mislabels out of {} points: {}".format(xtest.shape[0],error ))

number of mislabels out of 581 points: 71


## Decision tree

In [78]:
clf =  tree.DecisionTreeClassifier()

y_pred = OneVsOneClassifier(clf).fit(xtrain, ytrain).predict(xtest)
error = (y_pred != ytest).sum()
print ("number of mislabels out of {} points: {}".format(xtest.shape[0],error ))

number of mislabels out of 581 points: 123


## SVC

In [79]:
clf =  LinearSVC()

y_pred = OneVsOneClassifier(clf).fit(xtrain, ytrain).predict(xtest)
error = (y_pred != ytest).sum()
print ("number of mislabels out of {} points: {}".format(xtest.shape[0],error ))

number of mislabels out of 581 points: 100


## Performance results
1. one vs one Naiive Bayes
2. one vs all SVC
3. all vs all SVC
4. one vs one Decision tree
5. one vs one Naiive Bayes
6. all vs all Decision tree

# Question 3
How does the classifier process the weights of the data points to focus on misclassified data points?

Given (x1,y1)….(xn,yn) and initial weights of data points wi = 1/n, i=1…n

Weights are updated according to the formula: wi = wi*exp⁡(cm.1(y≠fm(x)), i=1…n

but don't take my word for it, that's just what hours of combing through stack overflow told me. 

my guess in human/low level of conceptual understanding is: the process uses an error calulation between predicted and observed values based on the test data, this weight is then used to increase attention to those points...but honestly, I would just trust the stack overflow version. Those people really seem to know what they're talking about. 
