In [27]:
# Import the project-local `common` helper module, then reload it so the cells
# below run against a freshly re-executed copy of the module (presumably to
# pick up edits made to it while the kernel stays alive).
# NOTE(review): `reload` is a builtin on Python 2 only.
import common
reload(common)

<module 'common' from 'common.pyc'>

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

class TfIdfFeatureExtractor(object):
    """Feature extractor that maps each row's ``content`` text to a TF-IDF vector.

    Wraps a scikit-learn ``TfidfVectorizer`` configured with English stop-word
    removal and an n-gram range of ``(1, ngram)``.
    """

    def __init__(self, ngram, min_df):
        """Build the underlying vectorizer.

        Args:
            ngram: Upper bound of the n-gram range; the lower bound is fixed at 1.
            min_df: Forwarded unchanged to ``TfidfVectorizer(min_df=...)``.
        """
        self.vectorizer = TfidfVectorizer(min_df=min_df, ngram_range=(1, ngram), stop_words="english")

    def train(self, train_data):
        """Fit the vectorizer on ``train_data["content"]`` and return the features.

        Prints the size of the learned vocabulary as a side effect.

        Args:
            train_data: Mapping-like object indexable by ``"content"``.

        Returns:
            Whatever ``fit_transform`` returns for the ``content`` column.
        """
        features = self.vectorizer.fit_transform(train_data["content"])
        # Parenthesised single-argument form: prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("Vocab size %d" % len(self.vectorizer.vocabulary_))
        return features

    def transform(self, test_data):
        """Vectorize ``test_data["content"]`` with the already-fitted vectorizer.

        Args:
            test_data: Mapping-like object indexable by ``"content"``.

        Returns:
            Whatever ``transform`` returns for the ``content`` column.
        """
        return self.vectorizer.transform(test_data["content"])

In [29]:
# Unigram TF-IDF (ngram=1, min_df=2) on the "ka-comments-balanced" dataset;
# the resulting feature set is registered under the name "all-tfidf-1".
common.extract_features("ka-comments-balanced", TfIdfFeatureExtractor(1, 2), "all-tfidf-1")

Loading ka-comments-balanced dataset.
Training feature extractor all-tfidf-1.
Vocab size 64542
Generating validation set...
Generating test set...
Writing to disk...
Done.


In [30]:
# Evaluate the stored "all-tfidf-1" features; per-model accuracy is printed
# in the cell output.
common.test_features("all-tfidf-1")

Loading features all-tfidf-1.
Training models.
##        MultinomialNB     all-tfidf-1 accuracy: 62.2 %
##            LinearSVC     all-tfidf-1 accuracy: 62.3 %
##                  MLP     all-tfidf-1 accuracy: 63.4 %


In [31]:
# Unigram + bigram TF-IDF (ngram=2, min_df=6) on the "ka-comments-balanced"
# dataset; the resulting feature set is registered under "all-tfidf-2".
common.extract_features("ka-comments-balanced", TfIdfFeatureExtractor(2, 6), "all-tfidf-2")

Training feature extractor all-tfidf-2.
Vocab size 181329
Generating validation set...
Generating test set...
Writing to disk...
Done.


In [32]:
# Evaluate the stored "all-tfidf-2" features; per-model accuracy is printed
# in the cell output.
common.test_features("all-tfidf-2")

Loading features all-tfidf-2.
Training models.
##        MultinomialNB     all-tfidf-2 accuracy: 62.0 %
##            LinearSVC     all-tfidf-2 accuracy: 59.3 %
##                  MLP     all-tfidf-2 accuracy: 63.7 %


In [33]:
# Unigram TF-IDF (ngram=1, min_df=2) on the "ka-replies-balanced" dataset;
# the resulting feature set is registered under the name "reply-tfidf-1".
common.extract_features("ka-replies-balanced", TfIdfFeatureExtractor(1, 2), "reply-tfidf-1")

Loading ka-replies-balanced dataset.
Training feature extractor reply-tfidf-1.
Vocab size 56396
Generating validation set...
Generating test set...
Writing to disk...
Done.


In [34]:
# Evaluate the stored "reply-tfidf-1" features; per-model accuracy is printed
# in the cell output.
common.test_features("reply-tfidf-1")

Loading features reply-tfidf-1.
Training models.
##        MultinomialNB   reply-tfidf-1 accuracy: 61.5 %
##            LinearSVC   reply-tfidf-1 accuracy: 61.1 %
##                  MLP   reply-tfidf-1 accuracy: 63.9 %


In [35]:
# Unigram + bigram TF-IDF (ngram=2, min_df=6) on the "ka-replies-balanced"
# dataset; the resulting feature set is registered under "reply-tfidf-2".
common.extract_features("ka-replies-balanced", TfIdfFeatureExtractor(2, 6), "reply-tfidf-2")

Training feature extractor reply-tfidf-2.
Vocab size 145427
Generating validation set...
Generating test set...
Writing to disk...
Done.


In [36]:
# Evaluate the stored "reply-tfidf-2" features; per-model accuracy is printed
# in the cell output.
common.test_features("reply-tfidf-2")

Loading features reply-tfidf-2.
Training models.
##        MultinomialNB   reply-tfidf-2 accuracy: 61.4 %
##            LinearSVC   reply-tfidf-2 accuracy: 59.6 %
##                  MLP   reply-tfidf-2 accuracy: 63.4 %


In [37]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import paired_cosine_distances

class TfIdfSimilarityExtractor(object):
    """Single-feature extractor: cosine distance between a reply and its parent.

    Both ``content`` (the reply) and ``content_parent`` are embedded with one
    shared ``TfidfVectorizer``; the emitted feature is the paired cosine
    *distance* between the two vectors of each row. Note that, despite the
    class name, this is a distance, not a similarity.
    """

    def __init__(self, ngram, min_df):
        """Build the underlying vectorizer.

        Args:
            ngram: Upper bound of the n-gram range; the lower bound is fixed at 1.
            min_df: Forwarded unchanged to ``TfidfVectorizer(min_df=...)``.
        """
        self.vectorizer = TfidfVectorizer(min_df=min_df, ngram_range=(1, ngram), stop_words="english")

    def _pairwise_distance(self, data):
        """Return one reply-vs-parent cosine distance per row, as an (n, 1) array."""
        features_reply = self.vectorizer.transform(data["content"])
        features_parent = self.vectorizer.transform(data["content_parent"])

        distances = paired_cosine_distances(features_reply, features_parent)
        # Turn the per-row distances into a single-column 2-D array so the
        # result has the (rows, features) layout of a feature matrix.
        return np.reshape(distances, (np.shape(distances)[0], 1))

    def train(self, train_data):
        """Fit the vectorizer on replies and parents together, then featurize.

        The vocabulary is learned from the concatenation of the ``content``
        and ``content_parent`` columns so both texts share one vector space.
        Prints the size of the learned vocabulary as a side effect.

        Args:
            train_data: Mapping-like object indexable by ``"content"`` and
                ``"content_parent"``.

        Returns:
            Array of shape (n, 1) holding the cosine distance of each row.
        """
        self.vectorizer.fit(np.hstack((train_data["content"], train_data["content_parent"])))
        # Parenthesised single-argument form: prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("Vocab size %d" % len(self.vectorizer.vocabulary_))
        return self._pairwise_distance(train_data)

    def transform(self, test_data):
        """Featurize ``test_data`` with the already-fitted vectorizer.

        Args:
            test_data: Mapping-like object indexable by ``"content"`` and
                ``"content_parent"``.

        Returns:
            Array of shape (n, 1) holding the cosine distance of each row.
        """
        return self._pairwise_distance(test_data)

In [38]:
# Reply-vs-parent cosine-distance feature from unigram TF-IDF (ngram=1,
# min_df=2) on "ka-replies-balanced"; registered under "diff-tfidf-1".
common.extract_features("ka-replies-balanced", TfIdfSimilarityExtractor(1, 2), "diff-tfidf-1")

Training feature extractor diff-tfidf-1.
Vocab size 70230
Generating validation set...
Generating test set...
Writing to disk...
Done.


In [39]:
# Evaluate the stored "diff-tfidf-1" feature; per-model accuracy is printed
# in the cell output.
common.test_features("diff-tfidf-1")

Loading features diff-tfidf-1.
Training models.
##        MultinomialNB    diff-tfidf-1 accuracy: 50.0 %
##            LinearSVC    diff-tfidf-1 accuracy: 53.5 %
##                  MLP    diff-tfidf-1 accuracy: 57.6 %


In [40]:
# Reply-vs-parent cosine-distance feature from unigram + bigram TF-IDF
# (ngram=2, min_df=6) on "ka-replies-balanced"; registered under "diff-tfidf-2".
common.extract_features("ka-replies-balanced", TfIdfSimilarityExtractor(2, 6), "diff-tfidf-2")

Training feature extractor diff-tfidf-2.
Vocab size 228695
Generating validation set...
Generating test set...
Writing to disk...
Done.


In [41]:
# Evaluate the stored "diff-tfidf-2" feature; per-model accuracy is printed
# in the cell output.
common.test_features("diff-tfidf-2")

Loading features diff-tfidf-2.
Training models.
##        MultinomialNB    diff-tfidf-2 accuracy: 50.0 %
##            LinearSVC    diff-tfidf-2 accuracy: 53.1 %
##                  MLP    diff-tfidf-2 accuracy: 58.4 %


In [42]:
# Evaluate the "diff-tfidf-2" distance feature together with the
# "reply-tfidf-2" TF-IDF features; per-model accuracy is printed in the
# cell output.
common.test_combined_features(["diff-tfidf-2", "reply-tfidf-2"])

Loading features diff-tfidf-2.
Loading features reply-tfidf-2.
Combining features.
Training models.
##        MultinomialNB     diff-tfidf-2_reply-tfidf-2 accuracy: 62.4 %
##            LinearSVC     diff-tfidf-2_reply-tfidf-2 accuracy: 59.5 %
##                  MLP     diff-tfidf-2_reply-tfidf-2 accuracy: 62.1 %
