In [3]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module no longer exists in current scikit-learn
from sklearn.model_selection import train_test_split

# finds a linear separation between the classes - needs kernel?
from sklearn.svm import LinearSVC

# random forest
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Split X / y into train and test portions; random_state=0 pins the shuffle so the split is repeatable.
# NOTE(review): X and y are not defined in any visible cell — confirm they exist before this runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# SVM

In [4]:
# build a default linear SVM and train it on the training split in one step
# (fit returns the estimator itself)
svm = LinearSVC().fit(X_train, y_train)
# labels the fitted model assigns to the samples it was trained on
svm.predict(X_train)

In [None]:
# score on the same data the SVM was fitted on
svm.score(X_train, y_train)

In [None]:
# score on the held-out split produced by train_test_split
svm.score(X_test, y_test)

# Random Forests

In [None]:
# instantiate the model; no arguments are passed, so library defaults apply
rf = RandomForestClassifier()

In [None]:
# fit the model on the training split; the trailing ';' keeps the estimator repr out of the cell output
rf.fit(X_train, y_train);

In [None]:
# evaluate on the data the forest was fitted on
rf.score(X_train, y_train)

In [None]:
# evaluate on the held-out split
rf.score(X_test, y_test)

# Evaluation

In [None]:
# measure performance using cross-validation
# this example measures performance of random forests (above)
import numpy as np  # not imported by any cell above this one
from sklearn.model_selection import cross_val_score  # replaces removed sklearn.cross_validation

# one score per fold (cv=5 -> five values)
scores = cross_val_score(rf, X_train, y_train, cv=5)

# %s rather than %d: str(scores) is text, and %d raises TypeError on a string
print("scores: %s mean: %f std: %f" % (str(scores), np.mean(scores), np.std(scores)))

In [None]:
# if needed, try adding more trees?
import numpy as np  # not imported by any cell above this one

rf2 = RandomForestClassifier(n_estimators=50)
# same 5-fold protocol as the default forest, so the two runs are comparable
scores = cross_val_score(rf2, X_train, y_train, cv=5)

# %s rather than %d: str(scores) is text, and %d raises TypeError on a string
print("scores: %s mean: %f std: %f" % (str(scores), np.mean(scores), np.std(scores)))

# Adjust important parameters using grid search

In [None]:
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module no longer exists in current scikit-learn
from sklearn.model_selection import GridSearchCV

In [None]:
# using example from linearSVC
import numpy as np  # not imported by any cell above this one
# imported here so the estimator accepts return_train_score regardless of
# which GridSearchCV an earlier cell brought into scope
from sklearn.model_selection import GridSearchCV

# seven candidate values for C: 10**-3 ... 10**3 (np.arange, not np.arrange)
param_grid = {'C': 10. ** np.arange(-3, 4)}
# return_train_score=True also records training-fold scores, which are needed
# to compare training and validation performance afterwards
grid_search = GridSearchCV(svm, param_grid=param_grid, cv=3, verbose=3,
                           return_train_score=True)

In [None]:
# run the search on the training split; the trailing ';' suppresses the fitted-object repr
grid_search.fit(X_train, y_train);

In [None]:
# parameter setting selected by the search, followed by the score it achieved
print(grid_search.best_params_)
print(grid_search.best_score_)

In [None]:
# plot mean validation (and, when recorded, training) accuracy for each C value
import matplotlib.pyplot as plt  # not imported by any cell above this one
import numpy as np

cv_results = grid_search.cv_results_
c_values = param_grid['C']

# figure size is passed to subplots(); pyplot has no figsize() function
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(cv_results['mean_test_score'], label="validation accuracy")
# training scores are only present when the search ran with return_train_score=True
if 'mean_train_score' in cv_results:
    ax.plot(cv_results['mean_train_score'], label="training accuracy")

# one tick per candidate C; tick count must equal label count
ax.set_xticks(np.arange(len(c_values)))
ax.set_xticklabels(c_values)
ax.set_xlabel("C")
ax.set_ylabel("Accuracy")
ax.legend(loc='best');

# Overfitting and Complexity Control

- remember that overfitting = high variance -> no generalization
- remember that underfitting = high bias

# Notes

- linear classifiers are usually the best for text data
- LinearSVC -> LinearSVM that is efficient for sparse data

# Next Steps

- Grid search c parameter of LinearSVC
- Build a pipeline, adjust parameters of feature extraction
- Combine different feature extraction methods

# TL;DR

- Get your data into an array (n_samples, n_features). 
- model.fit(X), model.predict(X) / model.transform(X) 
- Always do cross-validation. Leave the test set until the end. 
- Internalize the complexity / generalization tradeoff.

# ====================================

In [44]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# toy corpus for inspecting which features the vectorizer extracts
train_text = [
     'This is the first document.',
     'This is the second second document.',
     'And the third one.',
     'Is this the first document?']
test_text = ['my test document']

# character-level tf-idf over 1- and 2-grams (analyzer='char', ngram_range=(1, 2));
# the remaining options are spelled out explicitly for reference
vectorizer = TfidfVectorizer(
                             decode_error='strict', strip_accents=None,
                             lowercase=True, preprocessor=None,
                             tokenizer=None, analyzer='char',
                             stop_words=None,
                             ngram_range=(1, 2), max_df=1.0,
                             min_df=1, max_features=None,
                             vocabulary=None, binary=False,
                             norm='l2', use_idf=True,
                             smooth_idf=True, sublinear_tf=False)

# vocabulary is learned from the training text only; test text is merely projected onto it
X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# returns an ndarray, so .tolist() keeps the printed form a plain Python list
features = vectorizer.get_feature_names_out().tolist()
print(features)
print(len(features))


print(X_train.shape)

[' ', ' d', ' f', ' i', ' o', ' s', ' t', '.', '?', 'a', 'an', 'c', 'co', 'cu', 'd', 'd ', 'do', 'e', 'e ', 'e.', 'ec', 'en', 'f', 'fi', 'h', 'he', 'hi', 'i', 'ir', 'is', 'm', 'me', 'n', 'nd', 'ne', 'nt', 'o', 'oc', 'on', 'r', 'rd', 'rs', 's', 's ', 'se', 'st', 't', 't ', 't.', 't?', 'th', 'u', 'um']
53
(4, 53)


# ====================================

# Project 2

- Use Pandas for reading data into a Pandas DataFrame
- Use native Pandas features to process text features to numerical ones
- Use the extremely convenient "dummies" feature of the Pandas library to convert categorical features to binary ones. (One-Hot encoding). Scikit has its own One-Hot Encoding routine but it only works with integers (Features with categories like 1,2,3 rather than 'a','b','c'). Pandas can digest anything thrown at it.
- Finally, explicitly cast the DataFrame into a numpy array which can be used  by the scikit-learn API. Note that at this point you lose your feature labels (Headers), so it would be difficult to keep track of the features if you use the "feature-importance" routine in scikit-learn. I have the practice of saving the headers before casting the data-frame into a numpy array. [>>list(<DataFrame>) prints out the headers into a nice list]

In [1]:
import json
import pandas as pd
import numpy as np

In [2]:
# two-letter language identifiers, 20 entries
languages = [
    'ab', 'bg', 'de', 'en', 'es',
    'fa', 'fr', 'he', 'hi', 'it',
    'ja', 'ko', 'mr', 'ne', 'nl',
    'ru', 'th', 'uk', 'ur', 'zh',
]

In [3]:
# test.json: one JSON object per line -> one DataFrame row per object
# - the context manager closes the file handle (the bare open() never did)
# - explicit UTF-8 so multilingual text does not depend on the platform default
# - blank lines are skipped instead of raising a JSON decode error
# Note: building the frame from a list of dicts lets pandas infer column dtypes.
with open('test/test.json', encoding='utf-8') as fh:
    df_test = pd.DataFrame([json.loads(line) for line in fh if line.strip()])

### Training Sets

In [4]:
# train.json: one JSON object per line -> one DataFrame row per object
# - the context manager closes the file handle (the bare open() never did)
# - explicit UTF-8 so multilingual text does not depend on the platform default
# - blank lines are skipped instead of raising a JSON decode error
# Note: building the frame from a list of dicts lets pandas infer column dtypes.
with open('train.json', encoding='utf-8') as fh:
    df_training = pd.DataFrame([json.loads(line) for line in fh if line.strip()])

In [5]:
# dev.json: one JSON object per line -> one DataFrame row per object
# - the context manager closes the file handle (the bare open() never did)
# - explicit UTF-8 so multilingual text does not depend on the platform default
# - blank lines are skipped instead of raising a JSON decode error
# Note: building the frame from a list of dicts lets pandas infer column dtypes.
with open('dev.json', encoding='utf-8') as fh:
    df_dev = pd.DataFrame([json.loads(line) for line in fh if line.strip()])

### Slicing the training set

In [6]:
# .ix has been removed from pandas; .loc is the label-based equivalent.
# The slice end is inclusive, so this keeps labels 0..3702 (3703 rows) of the 'lang' column.
# Note: df_dev is rebound from a DataFrame to a Series here, so this cell cannot be re-run as-is.
df_dev = df_dev['lang'].loc[:3702]

In [7]:
# .ix has been removed from pandas; .loc is the label-based equivalent.
# The slice end is inclusive, so this keeps labels 0..3702 (3703 rows).
df_training = df_training.loc[:3702]

### Tokenizing text (with scikit-learn)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# vocabulary capped at 100 terms via max_features
count_vect = CountVectorizer(
    max_features=100,
)
# learn the vocabulary from the training text and build its count matrix
X_train_counts = count_vect.fit_transform(
    df_training['text'],
)
X_train_counts.shape

(3703, 100)

In [9]:
# NOTE(review): df_dev was reduced to its 'lang' column in an earlier cell, so this
# vectorizes the label strings rather than document text — confirm that is intended.
# y_train_counts is not read by any later visible cell.
y_train_counts = count_vect.transform(df_dev)
y_train_counts.shape

(3703, 100)

### From occurrences to frequencies
Raw counts are good for a baseline; however, longer documents will have higher average count values than shorter documents.

To avoid these potential discrepancies it suffices to divide the number of occurrences of each word in a document by the total number of words in the document: these new features are called `tf` for Term Frequencies.

Another refinement on top of `tf` is to downscale weights for words that occur in many documents in the corpus and are therefore less informative than those that occur only in a smaller portion of the corpus.
i.e. words or n_grams which occur in many tweets, such as `'the'` should have a smaller weighting as they occur so often. Words which occur 2 ~ 4 times are more valuable. Words which occur once are pretty useless.

This downscaling is called `tf-idf` for "Term Frequency times Inverse Document Frequency"

In [10]:
from sklearn.feature_extraction.text import TfidfTransformer

# use_idf=False: plain term frequencies, no inverse-document-frequency weighting
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(X_train_counts)

# turn the count matrix into its term-frequency representation
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(3703, 100)

In [11]:
# default TfidfTransformer: fit on the training counts, then re-weight those same counts
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_train_tfidf.shape

(3703, 100)

### Training a classifier

Using `naïve bayes` classifier as baseline

In [12]:
from sklearn.naive_bayes import MultinomialNB

# Labels must come from the same rows as the features: X_train_tfidf was built
# from df_training['text'], so it is paired with df_training's own labels rather
# than the 'lang' values read from dev.json (a different file, different rows).
# NOTE(review): assumes train.json records carry a 'lang' field like dev.json — confirm.
clf = MultinomialNB().fit(X_train_tfidf, df_training['lang'].values)

**TODO:** decide which data to predict on. The cells below use held-out text from `dev.json`.

In [None]:
# dev.json, reloaded in full: one JSON object per line -> one DataFrame row per object
# - the context manager closes the file handle (the bare open() never did)
# - explicit UTF-8 so multilingual text does not depend on the platform default
# - blank lines are skipped instead of raising a JSON decode error
# Note: building the frame from a list of dicts lets pandas infer column dtypes.
with open('dev.json', encoding='utf-8') as fh:
    df_dev_1 = pd.DataFrame([json.loads(line) for line in fh if line.strip()])

In [None]:
# .ix has been removed from pandas; .loc is the label-based equivalent.
# The slice end is inclusive, so this keeps labels 0..4000 of the 'text' column.
df_dev_text = df_dev_1['text'].loc[:4000]

In [None]:
# reuse the already-fitted vectorizer and tf-idf transformer: transform only, never re-fit
X_new_counts = count_vect.transform(df_dev_text.values)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

# show each document next to its own predicted label
# (the loop variable `category`, not the whole df_test frame)
for doc, category in zip(df_dev_text.values, predicted):
    print('%r => %s' % (doc, category))

### Building a pipeline

In [None]:
from sklearn.pipeline import Pipeline

# counts -> tf-idf weighting -> naive Bayes, chained behind one estimator
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

### Evaluation of the performance on the test set

In [None]:
import numpy as np

# A Pipeline has to be fitted before it can predict; text_clf was only constructed so far.
# NOTE(review): assumes train.json records carry a 'lang' label and test.json
# records carry both 'text' and 'lang', as dev.json does — confirm.
text_clf.fit(df_training['text'], df_training['lang'])

# pass the text column: iterating a DataFrame yields its column names, not its rows
doc_test = df_test['text']
predicted = text_clf.predict(doc_test)

# accuracy = fraction of test documents whose predicted label equals the true label
np.mean(predicted == df_test['lang'].values)