In [2]:
import numpy as np

import common
# Re-import the helper module so that edits to common.py are picked up
# without restarting the kernel (uses the Python 2 builtin reload).
reload(common)

<module 'common' from 'common.pyc'>

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

class TfIdfFeatureExtractor(object):
    """TF-IDF n-gram features computed from the "content" column of a dataset.

    Exposes the train()/transform() pair that is handed to
    common.extract_features in this notebook.
    """

    def __init__(self, ngram, min_df):
        """Configure the underlying TfidfVectorizer.

        ngram  -- largest n-gram length; n-grams of length 1..ngram are used.
        min_df -- forwarded unchanged as TfidfVectorizer's min_df.
        English stop words are always removed.
        """
        self.vectorizer = TfidfVectorizer(
            min_df=min_df,
            ngram_range=(1, ngram),
            stop_words="english",
        )

    def train(self, train_data):
        """Fit the vocabulary on train_data["content"] and return its features.

        Only train_data["content"] is read (presumably a pandas column of
        strings -- confirm against common.extract_features). The size of the
        learned vocabulary is printed as a side effect.
        """
        features = self.vectorizer.fit_transform(train_data["content"])
        # Parenthesised single-argument form: prints the same text on
        # Python 2 and is also valid syntax on Python 3.
        print("Vocab size %d" % len(self.vectorizer.vocabulary_))
        return features

    def transform(self, test_data):
        """Return features for test_data["content"] using the fitted vocabulary."""
        return self.vectorizer.transform(test_data["content"])

In [29]:
# Unigram TF-IDF features (ngram=1, min_df=2) for the ka-comments-balanced
# dataset, written to disk under the feature name "all-tfidf-1".
common.extract_features("ka-comments-balanced", TfIdfFeatureExtractor(1, 2), "all-tfidf-1")

Loading ka-comments-balanced dataset.
Training feature extractor all-tfidf-1.
Vocab size 64542
Generating validation set...
Generating test set...
Writing to disk...
Done.


In [6]:
# Train the classifiers on the stored "all-tfidf-1" features and print
# precision/recall for each model.
common.test_features("all-tfidf-1")

Loading features all-tfidf-1.
Training models.
##        MultinomialNB     all-tfidf-1 precision: 61.6% recall: 64.7%
##            LinearSVC     all-tfidf-1 precision: 62.1% recall: 63.2%
##                  MLP     all-tfidf-1 precision: 62.7% recall: 64.7%
##                 MLP2     all-tfidf-1 precision: 63.0% recall: 64.1%


In [31]:
# Unigram + bigram TF-IDF features (ngram=2, min_df=6) for the
# ka-comments-balanced dataset, written to disk as "all-tfidf-2".
common.extract_features("ka-comments-balanced", TfIdfFeatureExtractor(2, 6), "all-tfidf-2")

Training feature extractor all-tfidf-2.
Vocab size 181329
Generating validation set...
Generating test set...
Writing to disk...
Done.


In [7]:
# Train the classifiers on the stored "all-tfidf-2" features and print
# precision/recall for each model.
common.test_features("all-tfidf-2")

Loading features all-tfidf-2.
Training models.
##        MultinomialNB     all-tfidf-2 precision: 60.7% recall: 68.1%
##            LinearSVC     all-tfidf-2 precision: 58.8% recall: 62.0%
##                  MLP     all-tfidf-2 precision: 63.3% recall: 63.3%
##                 MLP2     all-tfidf-2 precision: 61.3% recall: 68.4%


In [33]:
# Unigram TF-IDF features (ngram=1, min_df=2) for the ka-replies-balanced
# dataset, written to disk as "reply-tfidf-1".
common.extract_features("ka-replies-balanced", TfIdfFeatureExtractor(1, 2), "reply-tfidf-1")

Loading ka-replies-balanced dataset.
Training feature extractor reply-tfidf-1.
Vocab size 56396
Generating validation set...
Generating test set...
Writing to disk...
Done.


In [8]:
# Train the classifiers on the stored "reply-tfidf-1" features and print
# precision/recall for each model.
common.test_features("reply-tfidf-1")

Loading features reply-tfidf-1.
Training models.
##        MultinomialNB   reply-tfidf-1 precision: 61.0% recall: 63.8%
##            LinearSVC   reply-tfidf-1 precision: 61.3% recall: 60.6%
##                  MLP   reply-tfidf-1 precision: 62.5% recall: 67.0%
##                 MLP2   reply-tfidf-1 precision: 62.1% recall: 63.6%


In [35]:
# Unigram + bigram TF-IDF features (ngram=2, min_df=6) for the
# ka-replies-balanced dataset, written to disk as "reply-tfidf-2".
common.extract_features("ka-replies-balanced", TfIdfFeatureExtractor(2, 6), "reply-tfidf-2")

Training feature extractor reply-tfidf-2.
Vocab size 145427
Generating validation set...
Generating test set...
Writing to disk...
Done.


In [9]:
# Train the classifiers on the stored "reply-tfidf-2" features and print
# precision/recall for each model.
common.test_features("reply-tfidf-2")

Loading features reply-tfidf-2.
Training models.
##        MultinomialNB   reply-tfidf-2 precision: 60.4% recall: 66.1%
##            LinearSVC   reply-tfidf-2 precision: 59.2% recall: 61.6%
##                  MLP   reply-tfidf-2 precision: 61.9% recall: 68.2%
##                 MLP2   reply-tfidf-2 precision: 62.1% recall: 63.6%


In [37]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import paired_cosine_distances

class TfIdfSimilarityExtractor(object):
    """Single-column feature: cosine distance between the TF-IDF vectors of
    the "content" column and the "content_parent" column, row by row.

    Exposes the train()/transform() pair that is handed to
    common.extract_features in this notebook.
    """

    def __init__(self, ngram, min_df):
        """Configure the underlying TfidfVectorizer.

        ngram  -- largest n-gram length; n-grams of length 1..ngram are used.
        min_df -- forwarded unchanged as TfidfVectorizer's min_df.
        English stop words are always removed.
        """
        self.vectorizer = TfidfVectorizer(
            min_df=min_df,
            ngram_range=(1, ngram),
            stop_words="english",
        )

    def _distance_column(self, data):
        """Return an (n_rows, 1) array of reply-vs-parent cosine distances."""
        features_reply = self.vectorizer.transform(data["content"])
        features_parent = self.vectorizer.transform(data["content_parent"])

        distances = paired_cosine_distances(features_reply, features_parent)
        # Reshape the flat result into one column so each row carries
        # exactly one feature.
        return np.reshape(distances, (np.shape(distances)[0], 1))

    def train(self, train_data):
        """Fit the vocabulary on replies and parents together, then return the
        distance column for train_data.

        Reads train_data["content"] and train_data["content_parent"]; the
        size of the learned vocabulary is printed as a side effect.
        """
        # One shared vocabulary for both columns, so the two TF-IDF vectors
        # of a row live in the same space.
        self.vectorizer.fit(np.hstack((train_data["content"], train_data["content_parent"])))
        # Parenthesised single-argument form: prints the same text on
        # Python 2 and is also valid syntax on Python 3.
        print("Vocab size %d" % len(self.vectorizer.vocabulary_))
        return self._distance_column(train_data)

    def transform(self, test_data):
        """Return the distance column for test_data using the fitted vocabulary."""
        return self._distance_column(test_data)

In [38]:
# Reply-vs-parent cosine-distance feature built on unigram TF-IDF
# (ngram=1, min_df=2) for ka-replies-balanced, written to disk as "diff-tfidf-1".
common.extract_features("ka-replies-balanced", TfIdfSimilarityExtractor(1, 2), "diff-tfidf-1")

Training feature extractor diff-tfidf-1.
Vocab size 70230
Generating validation set...
Generating test set...
Writing to disk...
Done.


In [10]:
# Train the classifiers on the single-column "diff-tfidf-1" feature and print
# precision/recall for each model.
# NOTE(review): MultinomialNB reports 0.0% precision and recall here --
# presumably it never predicts the positive class on this one-feature
# input; confirm before relying on that row.
common.test_features("diff-tfidf-1")

Loading features diff-tfidf-1.
Training models.
##        MultinomialNB    diff-tfidf-1 precision: 0.0% recall: 0.0%
##            LinearSVC    diff-tfidf-1 precision: 53.8% recall: 48.8%
##                  MLP    diff-tfidf-1 precision: 55.6% recall: 74.2%
##                 MLP2    diff-tfidf-1 precision: 55.6% recall: 73.0%


In [40]:
# Reply-vs-parent cosine-distance feature built on unigram + bigram TF-IDF
# (ngram=2, min_df=6) for ka-replies-balanced, written to disk as "diff-tfidf-2".
common.extract_features("ka-replies-balanced", TfIdfSimilarityExtractor(2, 6), "diff-tfidf-2")

Training feature extractor diff-tfidf-2.
Vocab size 228695
Generating validation set...
Generating test set...
Writing to disk...
Done.


In [11]:
# Train the classifiers on the single-column "diff-tfidf-2" feature and print
# precision/recall for each model.
# NOTE(review): MultinomialNB reports 0.0% precision and recall on this
# feature -- presumably it never predicts the positive class; confirm.
common.test_features("diff-tfidf-2")

Loading features diff-tfidf-2.
Training models.
##        MultinomialNB    diff-tfidf-2 precision: 0.0% recall: 0.0%
##            LinearSVC    diff-tfidf-2 precision: 53.7% recall: 44.8%
##                  MLP    diff-tfidf-2 precision: 55.7% recall: 65.6%
##                 MLP2    diff-tfidf-2 precision: 57.2% recall: 67.5%


In [12]:
# Evaluate the classifiers on the "diff-tfidf-2" distance column combined
# with the "reply-tfidf-2" TF-IDF features (the two feature shapes are logged).
common.test_combined_features(["diff-tfidf-2", "reply-tfidf-2"])

Loading features diff-tfidf-2.
Loading features reply-tfidf-2.
Combining features: (308216, 1) + (308216, 145427)
Training models.
##        MultinomialNB diff-tfidf-2_reply-tfidf-2 precision: 61.8% recall: 65.1%
##            LinearSVC diff-tfidf-2_reply-tfidf-2 precision: 59.2% recall: 61.5%
##                  MLP diff-tfidf-2_reply-tfidf-2 precision: 61.5% recall: 66.1%
##                 MLP2 diff-tfidf-2_reply-tfidf-2 precision: 62.3% recall: 65.1%


In [5]:
# Pick up the latest common.py, then extract unigram TF-IDF features
# (ngram=1, min_df=2) for ka-rank-balanced, written to disk as "rank-tfidf-1".
reload(common)
common.extract_features("ka-rank-balanced", TfIdfFeatureExtractor(1, 2), "rank-tfidf-1")

Loading ka-rank-balanced dataset.
Training feature extractor rank-tfidf-1.
Vocab size 60824
Generating validation set...
Generating test set...
Writing to disk...
Done.


In [13]:
# Train the classifiers on the stored "rank-tfidf-1" features and print
# precision/recall for each model.
common.test_features("rank-tfidf-1")

Loading features rank-tfidf-1.
Training models.
##        MultinomialNB    rank-tfidf-1 precision: 53.3% recall: 60.1%
##            LinearSVC    rank-tfidf-1 precision: 51.2% recall: 43.9%
##                  MLP    rank-tfidf-1 precision: 52.6% recall: 42.3%
##                 MLP2    rank-tfidf-1 precision: 52.6% recall: 48.2%


In [7]:
# Unigram + bigram TF-IDF features (ngram=2, min_df=6) for the
# ka-rank-balanced dataset, written to disk as "rank-tfidf-2".
common.extract_features("ka-rank-balanced", TfIdfFeatureExtractor(2, 6), "rank-tfidf-2")

Training feature extractor rank-tfidf-2.
Vocab size 158751
Generating validation set...
Generating test set...
Writing to disk...
Done.


In [14]:
# Train the classifiers on the stored "rank-tfidf-2" features and print
# precision/recall for each model.
common.test_features("rank-tfidf-2")

Loading features rank-tfidf-2.
Training models.
##        MultinomialNB    rank-tfidf-2 precision: 54.0% recall: 56.5%
##            LinearSVC    rank-tfidf-2 precision: 51.8% recall: 46.2%
##                  MLP    rank-tfidf-2 precision: 52.3% recall: 46.8%
##                 MLP2    rank-tfidf-2 precision: 51.5% recall: 41.2%


In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse

class TfIdfComparisonFeatureExtractor(object):
    """Pairwise features: the TF-IDF vector of "content_left" placed side by
    side with the TF-IDF vector of "content_right" for every row.

    Exposes the train()/transform() pair that is handed to
    common.extract_features in this notebook.
    """

    def __init__(self, ngram, min_df):
        """Configure the underlying TfidfVectorizer.

        ngram  -- largest n-gram length; n-grams of length 1..ngram are used.
        min_df -- forwarded unchanged as TfidfVectorizer's min_df.
        English stop words are always removed.
        """
        self.vectorizer = TfidfVectorizer(
            min_df=min_df,
            ngram_range=(1, ngram),
            stop_words="english",
        )

    def _stack_pair(self, data):
        """Return the left and right TF-IDF matrices stacked column-wise."""
        left = self.vectorizer.transform(data["content_left"])
        right = self.vectorizer.transform(data["content_right"])
        return scipy.sparse.hstack((left, right))

    def train(self, train_data):
        """Fit one vocabulary on both columns and return the stacked features.

        Reads train_data["content_left"] and train_data["content_right"]; the
        size of the learned vocabulary is printed as a side effect. The result
        has twice as many columns as the vocabulary has entries.
        """
        # A single shared vocabulary keeps column i of the left half and
        # column i of the right half referring to the same term.
        content_concat = np.hstack((train_data["content_left"], train_data["content_right"]))
        self.vectorizer.fit(content_concat)

        features = self._stack_pair(train_data)

        # Parenthesised single-argument form: prints the same text on
        # Python 2 and is also valid syntax on Python 3.
        print("Vocab size %d" % len(self.vectorizer.vocabulary_))
        return features

    def transform(self, test_data):
        """Return stacked left/right features for test_data using the fitted vocabulary."""
        return self._stack_pair(test_data)

In [35]:
# Side-by-side left/right unigram TF-IDF features (ngram=1, min_df=2) for
# the ka-binary dataset, written to disk as "binary-tfidf-1".
common.extract_features("ka-binary", TfIdfComparisonFeatureExtractor(1, 2), "binary-tfidf-1")

Training feature extractor binary-tfidf-1.
Vocab size 108632
Generating validation set...
Generating test set...
Writing to disk...
Done.


In [None]:
# Train the classifiers on the stored "binary-tfidf-1" features and print
# precision/recall for each model.
common.test_features("binary-tfidf-1")

Loading features binary-tfidf-1.
Training models.
##        MultinomialNB  binary-tfidf-1 precision: 53.3% recall: 54.8%
##            LinearSVC  binary-tfidf-1 precision: 53.8% recall: 55.7%
##                  MLP  binary-tfidf-1 precision: 53.2% recall: 49.0%
##                 MLP2  binary-tfidf-1 precision: 52.6% recall: 57.8%


In [None]:
# Side-by-side left/right unigram + bigram TF-IDF features (ngram=2, min_df=6)
# for the ka-binary dataset, written to disk as "binary-tfidf-2".
common.extract_features("ka-binary", TfIdfComparisonFeatureExtractor(2, 6), "binary-tfidf-2")

Training feature extractor binary-tfidf-2.
Vocab size 640047
Generating validation set...
Generating test set...
Writing to disk...
Done.


In [None]:
# Train the classifiers on the stored "binary-tfidf-2" features and print
# precision/recall for each model.
# NOTE(review): the saved output only contains the MultinomialNB row --
# presumably the run was interrupted or saved mid-execution; re-run to get
# the remaining models.
common.test_features("binary-tfidf-2")

Loading features binary-tfidf-2.
Training models.
##        MultinomialNB  binary-tfidf-2 precision: 54.0% recall: 55.2%


In [9]:
# Pick up the latest common.py, then train MultinomialNB on "rank-tfidf-1"
# and print the share of true/false positives/negatives followed by example
# rows from ka-rank-balanced.
reload(common)
common.show_errors("ka-rank-balanced", "rank-tfidf-1", "MultinomialNB")

Loading ka-rank-balanced dataset.
Loading features rank-tfidf-1.
Training model MultinomialNB.
## True positives: 0.300400
## True negatives: 0.236400
## False positives: 0.263600
## False negatives: 0.199600
True positives:
198779      what is the lowest common multiple of 68 and 120
198780     The LCM of 68 and 120 is 2040.  To come to thi...
223959                    This the the brain treaser at 6:46
297269                       fun but impossible in real life
376698     Is there an activity for us to practice lattic...
446368                  a half is equivalent to two quarters
446369     1/2 is the equivalent of 2/4 because 1/2 times...
447079     Recognise whether the fraction are equivalent:...
463389     4.27 why do we have to add 2 6 times becase we...
508101     Yesterday, two friends went into a bank to ope...
573406                            Can fractions be negative?
728014     You have to to remove the blocks from the scal...
749686     19 + 9 = 28 \nOr you can do 19 +