In [5]:
'''
Question 1 Skeleton Code


'''

import sklearn
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import nltk
from nltk.corpus import stopwords
import string
from sklearn.model_selection import GridSearchCV

def load_data():
    # import and filter data
    newsgroups_train = fetch_20newsgroups(subset='train',remove=('headers', 'footers', 'quotes'))
    newsgroups_test = fetch_20newsgroups(subset='test',remove=('headers', 'footers', 'quotes'))

    return newsgroups_train, newsgroups_test

# remove punctuation, stop words, and stemmize the words
def preprocess(data):
    stemmer = nltk.PorterStemmer()
    stop = set(list(string.punctuation) + list(stopwords.words('english')))
    word_tokens = [[token for token in nltk.word_tokenize(sentence) if token not in stop] for sentence in data]
    stemmed = [[stemmer.stem(token) for token in tokens] for tokens in word_tokens]
    return [' '.join(tokens) for tokens in stemmed]

def bow_features(train_data, test_data):
    # Bag-of-words representation
    bow_vectorize = CountVectorizer()
    bow_train = bow_vectorize.fit_transform(train_data.data) #bag-of-word features for training data
    bow_test = bow_vectorize.transform(test_data.data)
    feature_names = bow_vectorize.get_feature_names() #converts feature index to the word it represents.
    shape = bow_train.shape
    print('{} train data points.'.format(shape[0]))
    print('{} feature dimension.'.format(shape[1]))
    print('Most common word in training set is "{}"'.format(feature_names[bow_train.sum(axis=0).argmax()]))
    return bow_train, bow_test, feature_names

def tf_idf_features(train_data, test_data):
    # Bag-of-words representation
    tf_idf_vectorize = TfidfVectorizer()
    tf_idf_train = tf_idf_vectorize.fit_transform(train_data.data) #bag-of-word features for training data
    feature_names = tf_idf_vectorize.get_feature_names() #converts feature index to the word it represents.
    tf_idf_test = tf_idf_vectorize.transform(test_data.data)
    return tf_idf_train, tf_idf_test, feature_names

In [6]:
train_data, test_data = load_data()
train_data.data = preprocess(train_data.data)
test_data.data = preprocess(test_data.data) 
train_bow, test_bow, feature_names = bow_features(train_data, test_data)
train_tf_idf, test_tf_idf, feature_names = tf_idf_features(train_data, test_data)

11314 train data points.
86849 feature dimension.
Most common word in training set is "ax"


In [7]:
from sklearn.naive_bayes import BernoulliNB
def bnb_baseline(bow_train, train_labels, bow_test, test_labels):
    # training the baseline model
    binary_train = (bow_train>0).astype(int)
    binary_test = (bow_test>0).astype(int)

    model = BernoulliNB()
    model.fit(binary_train, train_labels)

    #evaluate the baseline model
    train_pred = model.predict(binary_train)
    print('BernoulliNB baseline train accuracy = {}'.format((train_pred == train_labels).mean()))
    test_pred = model.predict(binary_test)
    print('BernoulliNB baseline test accuracy = {}'.format((test_pred == test_labels).mean()))
    return model

bnb_model = bnb_baseline(train_bow, train_data.target, test_bow, test_data.target)

BernoulliNB baseline train accuracy = 0.5981085380943963
BernoulliNB baseline test accuracy = 0.46587891662241104


In [15]:
from sklearn.ensemble import RandomForestClassifier
def rf_baseline(tf_idf_train, train_labels, tf_idf_test, test_labels):
    search_grid = {'n_estimators': [400, 800, 1200]}
    model = RandomForestClassifier(n_jobs = 8)
    grid_clf = GridSearchCV(model, param_grid = search_grid)
    grid_clf.fit(tf_idf_train, train_labels)

    #evaluate the baseline model
    train_pred = grid_clf.predict(tf_idf_train)
    print('RandomForest baseline train accuracy = {}'.format((train_pred == train_labels).mean()))
    test_pred = grid_clf.predict(tf_idf_test)
    print('RandomForest baseline test accuracy = {}'.format((test_pred == test_labels).mean()))
    return grid_clf.best_estimator_

rf_model = rf_baseline(train_tf_idf, train_data.target, test_tf_idf, test_data.target)

RandomForest baseline train accuracy = 0.9742796535266042
RandomForest baseline test accuracy = 0.6411311736590547


In [13]:
from sklearn.svm import SVC
def svm_baseline(train, train_labels, test, test_labels):
    search_grid = {'kernel': ['linear', 'rbf', 'poly'], 'C': [0.1, 1, 10]}
    model = SVC()
    grid_clf = GridSearchCV(model, param_grid = search_grid)
    grid_clf.fit(train, train_labels)

    #evaluate the baseline model
    train_pred = grid_clf.predict(train)
    print('SVM baseline train accuracy = {}'.format((train_pred == train_labels).mean()))
    test_pred = grid_clf.predict(test)
    print('SVM baseline test accuracy = {}'.format((test_pred == test_labels).mean()))
    return grid_clf.best_estimator_

svm_model = svm_baseline(train_bow, train_data.target, test_bow, test_data.target)

SVM baseline train accuracy = 0.9482057627717871
SVM baseline test accuracy = 0.5452734997344663


In [14]:
from sklearn.linear_model import LogisticRegression
def lgc_baseline(train, train_labels, test, test_labels):
    search_grid = {'penalty': ['l1', 'l2'], 'C': [0.1, 1, 10]}
    model = LogisticRegression()
    grid_clf = GridSearchCV(model, param_grid = search_grid)
    grid_clf.fit(train, train_labels)

    #evaluate the baseline model
    train_pred = grid_clf.predict(train)
    print('Logistic baseline train accuracy = {}'.format((train_pred == train_labels).mean()))
    test_pred = grid_clf.predict(test)
    print('Logistic baseline test accuracy = {}'.format((test_pred == test_labels).mean()))
    return grid_clf.best_estimator_

lgc_model = lgc_baseline(train_bow, train_data.target, test_bow, test_data.target)

Logistic baseline train accuracy = 0.9211596252430617
Logistic baseline test accuracy = 0.6428571428571429


In [40]:
def confusion_matrix(y_true, y_pred):
    assert len(y_true) == len(y_pred)
    card = max(y_true + 1)
    mat = np.zeros([card, card])
    for i in range(len(y_true)):
        c_true = y_true[i]
        c_pred = y_pred[i]
        mat[c_true][c_pred] += 1
    return mat
# logistic regression has the best performance on test set
y_pred = lgc_model.predict(test_tf_idf)
confusion_mat = confusion_matrix(test_data.target, y_pred)

In [41]:
from sklearn.preprocessing import normalize
normed_confusion_mat = normalize(confusion_mat, axis=1, norm='l1')

In [76]:
for i in range(20):
    print(normed_confusion_mat[i][i])

0.341692789969
0.622107969152
0.505076142132
0.357142857143
0.633766233766
0.430379746835
0.528205128205
0.70202020202
0.804020100503
0.969773299748
0.669172932331
0.368686868687
0.312977099237
0.262626262626
0.535532994924
0.236180904523
0.381868131868
0.505319148936
0.1
0.111553784861


In [78]:
print(normed_confusion_mat)

[[ 0.34169279  0.0031348   0.0031348   0.          0.          0.
   0.0031348   0.04075235  0.06583072  0.47335423  0.          0.0031348
   0.0031348   0.0031348   0.01253918  0.02194357  0.0031348   0.          0.
   0.02194357]
 [ 0.          0.62210797  0.04113111  0.00257069  0.01285347  0.01799486
   0.00257069  0.02570694  0.0437018   0.2159383   0.          0.
   0.00771208  0.          0.00771208  0.          0.          0.          0.
   0.        ]
 [ 0.          0.04822335  0.50507614  0.03553299  0.04314721  0.00761421
   0.          0.0177665   0.05583756  0.27411168  0.          0.
   0.00253807  0.          0.01015228  0.          0.          0.          0.
   0.        ]
 [ 0.          0.03826531  0.06632653  0.35714286  0.0994898   0.
   0.03061224  0.04336735  0.04591837  0.28316327  0.00255102  0.
   0.03316327  0.          0.          0.          0.          0.          0.
   0.        ]
 [ 0.0025974   0.01038961  0.00519481  0.01558442  0.63376623  0.
   0.007792

In [85]:
pairwise = np.zeros([20, 20])
for i in range(20):
    for j in range(20):
        if i == j:
            continue
        pairwise[i][j] = normed_confusion_mat[i][j] * normed_confusion_mat[j][i] 

In [86]:
confuses = []
for i in range(20):
    j = np.argmax(pairwise[i])
    confuses.append([i, j, pairwise[i][j]])
# class 7 & 8 have the most pairwise confusion probability
max(confuses, key = lambda tri: tri[2])

[7, 8, 0.003134358661996853]