In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

In [2]:
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train',
                                      remove=('headers', 'footers', 'quotes'),
                                      categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test',
                                     remove=('headers', 'footers', 'quotes'),
                                     categories=categories)

num_test = len(newsgroups_test.target)
test_data, test_labels = newsgroups_test.data[num_test/2:], newsgroups_test.target[num_test/2:]
dev_data, dev_labels = newsgroups_test.data[:num_test/2], newsgroups_test.target[:num_test/2]
train_data, train_labels = newsgroups_train.data, newsgroups_train.target

print 'training label shape:', train_labels.shape
print 'test label shape:', test_labels.shape
print 'dev label shape:', dev_labels.shape
print 'labels names:', newsgroups_train.target_names

training label shape: (2034,)
test label shape: (677,)
dev label shape: (676,)
labels names: ['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']


In [3]:
num_examples = 5
for i in range(num_examples):
    example = train_data[i]
    label_index = train_labels[i]
    label_name = newsgroups_train.target_names[label_index]
    
    print '***** BEGIN EXAMPLE ' + str(i) + ' *****'
    print 'Training label: ' + str(label_index) + ' (' + label_name + ')'
    print 'Text:'
    print '-----'
    
    print ''
    print example
    print ''
    print '***** END EXAMPLE ' + str(i) + ' *****'
    print ''


***** BEGIN EXAMPLE 0 *****
Training label: 1 (comp.graphics)
Text:
-----

Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych

***** END EXAMPLE 0 *****

***** BEGIN EXAMPLE 1 *****
Training label: 3 (talk.religion.misc)
Text:
-----



Seems to be, barring evidence to the contrary, that Koresh was simply
another deranged fanatic who thought it neccessary to take a whole bunch of
folks with him, children and all, to satisfy his delusional 

(2) Use CountVectorizer to turn the raw training text into feature vectors. You should use the fit_transform function, which makes 2 passes through the data: first it computes the vocabulary ("fit"), second it converts the raw text into feature vectors using the vocabulary ("transform").

The vectorizer has a lot of options. To get familiar with some of them, write code to answer these questions:

a. The output of the transform (also of fit_transform) is a sparse matrix: http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.sparse.csr_matrix.html. What is the size of the vocabulary? What is the average number of non-zero features per example? What fraction of the entries in the matrix are non-zero? Hint: use "nnz" and "shape" attributes.

b. What are the 0th and last feature strings (in alphabetical order)? Hint: use the vectorizer's get_feature_names function.

c. Specify your own vocabulary with 4 words: ["atheism", "graphics", "space", "religion"]. Confirm the training vectors are appropriately shaped. Now what's the average number of non-zero features per example?

d. Instead of extracting unigram word features, use "analyzer" and "ngram_range" to extract bigram and trigram character features. What size vocabulary does this yield?

e. Use the "min_df" argument to prune words that appear in fewer than 10 documents. What size vocabulary does this yield?

f. Using the standard CountVectorizer, what fraction of the words in the dev data are missing from the vocabulary? Hint: build a vocabulary for both train and dev and look at the size of the difference.

In [135]:
def vectorize(data):
    vectorizer = CountVectorizer()
    vectorizer.fit(data)
    return vectorizer

def select_nonzero_indices(example):
    return example.nonzero()[1]

def select_features(example):
    indices = select_nonzero_indices(example)
    features = [
        feature_name
        for index, feature_name
        in enumerate(feature_names)
        if index in indices
    ]
    return features

def select_non_zero_vector(examples):
    (rows, columns) = np.shape(examples)
    nnz_vector = [
        examples[index].nnz
        for index
        in range(rows)
    ]
    
    return nnz_vector

def slice_by_columns(csr_matrix, column_indices):
    csc_matrix = csr_matrix.tocsc()
    csc_sliced = csc_matrix[:, column_indices]
    return csc_sliced.tocsr()

# def P2():
vectorizer = vectorize(train_data)
train_vectorized = vectorizer.transform(train_data)

(examples, features) = np.shape(train_vectorized)
non_zero_total = train_vectorized.nnz
non_zero_vector = select_non_zero_vector(train_vectorized)
feature_names = vectorizer.get_feature_names()

print '***** (2) a. Output of the transform *****'
print ''
print '  Size of the vocabulary: ' + str(features)
print '  Average non-zero features: ' + str(np.mean(non_zero_vector))
print '  Non-zero fraction: ' + str((non_zero * 1.0) / (examples * features * 1.0))
print ''

zeroeth_example = train_vectorized[0]
last_example = train_vectorized[examples - 1]
next_to_last_example = train_vectorized[examples - 2]

zeroeth_features = select_features(zeroeth_example)
last_features = select_features(last_example)
next_to_last_features = select_features(next_to_last_example)


print '***** (2) b. 0th and last feature strings *****'
print ''
print '  0th feature string: '
print '    ' + ' '.join(zeroeth_features)
print '  last feature string: '
print '    ' + ' '.join(last_features)
print '  next to last feature string (last is empty): '
print '    ' + ' '.join(next_to_last_features)
print ''
    

# P2()



***** (2) a. Output of the transform *****

  Size of the vocabulary: 26879
  Average non-zero features: 96.7059980334
  Non-zero fraction: 0.00359782722696

***** (2) b. 0th and last feature strings *****

  0th feature string: 
    3ds able about after all and anyone are available be but carefully cel default does explicitly file for format from given have hi if in information is it know like manual mapping model not nothing noticed only orientation planes position positioned positions preserved prj read reload restarting rule rules rych said save saving somewhere stored texture that the their they this to ve when why with you your
  last feature string: 
    
  next to last feature string (last is empty): 
    _anything_ _behaviors_ _knowledgeable _waving about again all an and any application aquainted around as assertion atheism because becomes becoming being belief believe believer believer_ bit blah by call can carelessly caste certain circular coming contradiction conversely de

In [138]:




four_word_vocabulary = [ 'atheism', 'graphics', 'space', 'religion' ]
four_word_indices = [
    index
    for index, feature_name
    in enumerate(feature_names)
    if feature_name in four_word_vocabulary
]

four_word_train_vectorized = slice_by_columns(train_vectorized, four_word_indices)
four_word_non_zero_vector = select_non_zero_vector(four_word_train_vectorized)


print '***** (2) c. 4 word vocabulary *****'
print ''

print '  Shape of four-word vocabulary vectored training data: ' + \
    str(np.shape(four_word_train_vectorized))
print '  Average non-zero: ' + str(np.mean(four_word_non_zero_vector))
print ''



***** (2) c. 4 word vocabulary *****

  Shape of four-word vocabulary vectored training data: (2034, 4)
  Average non-zero: 0.268436578171



In [144]:
def vectorize_as_character_bigrams_trigrams(data):
    vectorizer = CountVectorizer(analyzer = 'char_wb', ngram_range = (2, 3))
    vectorizer.fit(data)
    return vectorizer

vectorizer = vectorize_as_character_bigrams_trigrams(train_data)
train_char_bigram_trigram_vectorized = vectorizer.transform(train_data)
(examples, features) = np.shape(train_char_bigram_trigram_vectorized)

print '***** (2) d. bigram/trigram character features vocabulary *****'
print ''
print '  Number of features (word-boundary character bigram/trigram): ' + str(features)
print ''

***** (2) d. bigram/trigram character features vocabulary *****

  Number of features (word-boundary character bigram/trigram): 28954



In [145]:
def vectorize_with_document_frequency_pruning(data, min_df = 10):
    vectorizer = CountVectorizer(min_df = min_df)
    vectorizer.fit(data)
    return vectorizer

vectorizer = vectorize_with_document_frequency_pruning(train_data, 10)
train_vectorized_pruned_min_df_10 = vectorizer.transform(train_data)
(examples, features) = np.shape(train_vectorized_pruned_min_df_10)

print '***** (2) e. mininum document frequency 10 vocabulary *****'
print ''
print '  Number of features (word-boundary character bigram/trigram): ' + str(features)
print ''

***** (2) e. mininum document frequency 10 vocabulary *****

  Number of features (word-boundary character bigram/trigram): 3064



In [154]:
train_feature_names = vectorize(train_data).get_feature_names()
dev_feature_names = vectorize(dev_data).get_feature_names()
feature_names_intersection = set(train_feature_names).intersection(dev_feature_names)

missing_fraction = 1.0 - ((1.0 * len(feature_names_intersection)) / (1.0 * len(dev_feature_names)))

print '***** (2) f. fraction of missing dev vocabulary *****'
print ''
print '  Fraction of missing dev features from train vocabulary: ' + str(missing_fraction)
print ''

len(dev_feature_names)


***** (2) f. fraction of missing dev vocabulary *****

  Fraction of missing dev features from train vocabulary: 0.247876400345



16246

In [170]:
from sklearn.metrics.pairwise import pairwise_distances

vectorizer = vectorize(train_data)
train_vectorized = vectorizer.transform(train_data)
dev_vectorized = vectorizer.transform(dev_data)

def best_nearest_neighbors(train_vectorized, train_labels, n_neighbors = range(1, 10)):
    
    hyperparameters = { 'n_neighbors' : n_neighbors } 
    nearest_neighbors = KNeighborsClassifier()
    
    grid_search = GridSearchCV(nearest_neighbors, hyperparameters, verbose = 3, scoring = 'f1')
    grid_search.fit(train_vectorized, train_labels)
    
    return grid_search

nearest_neighbors = best_nearest_neighbors(train_vectorized, train_labels)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] n_neighbors=1 ...................................................
[CV] .......................... n_neighbors=1, score=0.428715 -   0.1s
[CV] n_neighbors=1 ...................................................
[CV] .......................... n_neighbors=1, score=0.381899 -   0.1s
[CV] n_neighbors=1 ...................................................
[CV] .......................... n_neighbors=1, score=0.403747 -   0.1s
[CV] n_neighbors=2 ...................................................
[CV] .......................... n_neighbors=2, score=0.410820 -   0.1s
[CV] n_neighbors=2 ...................................................
[CV] .......................... n_neighbors=2, score=0.367443 -   0.1s
[CV] n_neighbors=2 ...................................................
[CV] .......................... n_neighbors=2, score=0.393002 -   0.1s
[CV] n_neighbors=3 ...................................................
[CV] ............

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:    2.3s finished





In [176]:
best_n_neighbors = nearest_neighbors.get_params()['estimator'].get_params()['n_neighbors']
dev_predictions = nearest_neighbors.predict(dev_vectorized)
knn_f1_score = metrics.f1_score(dev_labels, dev_predictions)

print '***** (3) a. k nearest neighbors *****'
print ''
print '  Best k: ' + str(best_n_neighbors)
print '  F1 score over dev data: ' + str(knn_f1_score)
print ''

***** (3) a. k nearest neighbors *****

  Best k: 5
  F1 score over dev data: 0.43656661762



In [181]:
nearest_5_neighbors = KNeighborsClassifier(n_neighbors = 5)
nearest_5_neighbors.fit(train_vectorized, train_labels)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=5, p=2, weights='uniform')

In [204]:


def cosine_distance(a, b):
    return pairwise_distances(a, b, metric='cosine')[0][0]

def select_nearest_neighbors(model, examples, example):
    neighbor_indices = model.kneighbors(example)[1][0]
    return examples[neighbor_indices]

def mean_neighbor_cosine_distance(model, examples):
    (n_examples, features) = np.shape(examples)
    cosine_distances = [ ]
    
    for example_index in range(n_examples):
        example = examples[example_index]
        neighbors = select_nearest_neighbors(model, examples, example)
        (n_neighbors, features) = np.shape(neighbors)

        example_cosine_distances = [
            cosine_distance(neighbors[index], example)
            for index in range(n_neighbors)
            if index != example_index
        ]

        cosine_distances += example_cosine_distances
    
    return np.mean(cosine_distances)
                                  
mean_distance = mean_neighbor_cosine_distance(nearest_5_neighbors, train_vectorized)
print ' Mean cosine distance to nearest neighbors: ' + str(mean_distance)

 Mean cosine distance to nearest neighbors: 0.497190184605
