In [1]:
import numpy as np

import common
# Re-execute the already-imported `common` module so that edits made to it on
# disk take effect without restarting the kernel (`reload` is the Python 2 builtin).
reload(common)

<module 'common' from 'common.pyc'>

In [2]:
import scipy.sparse

class ContentLengthFeatureExtractor(object):
    """Feature extractor that emits a single feature per row: ``len(content)``.

    The extractor is stateless: nothing is learned in ``train``, which simply
    delegates to ``transform``.
    """

    def train(self, train_data):
        """Return the length features for ``train_data`` (no fitting is done)."""
        return self.transform(train_data)

    def transform(self, test_data):
        """Build an ``(n_rows, 1)`` CSR matrix holding the length of each content.

        Parameters
        ----------
        test_data : mapping-like
            Any object indexable by ``"content"`` that yields an iterable of
            sized values (e.g. strings).

        Returns
        -------
        scipy.sparse.csr_matrix
            One row per entry of ``test_data["content"]`` and exactly one column.
        """
        lengths = np.fromiter((len(c) for c in test_data["content"]), dtype=int)
        # Reshape explicitly so that an empty input yields a (0, 1) matrix;
        # csr_matrix([]) would produce a (1, 0) matrix instead.
        return scipy.sparse.csr_matrix(lengths.reshape(-1, 1))

In [15]:
# Run the content-length extractor over the "ka-comments-balanced" dataset and
# save the resulting features under the name "all-len".
common.extract_features("ka-comments-balanced", ContentLengthFeatureExtractor(), "all-len")

Loading ka-comments-balanced dataset.
Training feature extractor all-len.
Generating validation set...
Generating test set...
Writing to disk...
Done.


In [2]:
# Load the saved "all-len" features and report precision/recall for each model.
# NOTE(review): MultinomialNB reports 0.0% / 0.0% on this single-feature set in
# the recorded output; it appears to never predict the positive class. Confirm
# that this is expected.
common.test_features("all-len")

Loading features all-len.
Training models.
##        MultinomialNB         all-len precision: 0.0% recall: 0.0%
##            LinearSVC         all-len precision: 57.0% recall: 83.2%
##                  MLP         all-len precision: 62.9% recall: 59.4%
##                 MLP2         all-len precision: 60.3% recall: 69.9%


In [15]:
# Same content-length extractor, applied to the "ka-replies-balanced" dataset;
# features are saved under the name "reply-len".
common.extract_features("ka-replies-balanced", ContentLengthFeatureExtractor(), "reply-len")

Loading ka-replies-balanced dataset.
Training feature extractor reply-len.
Generating validation set...
Generating test set...
Writing to disk...
Done.


In [3]:
# Load the saved "reply-len" features and report precision/recall for each model.
common.test_features("reply-len")

Loading features reply-len.
Training models.
##        MultinomialNB       reply-len precision: 0.0% recall: 0.0%
##            LinearSVC       reply-len precision: 63.0% recall: 58.2%
##                  MLP       reply-len precision: 61.8% recall: 64.3%
##                 MLP2       reply-len precision: 61.7% recall: 64.6%


In [18]:
# Same content-length extractor, applied to the "ka-rank-balanced" dataset;
# features are saved under the name "rank-len".
common.extract_features("ka-rank-balanced", ContentLengthFeatureExtractor(), "rank-len")

Loading ka-rank-balanced dataset.
Training feature extractor rank-len.
Generating validation set...
Generating test set...
Writing to disk...
Done.


In [4]:
# Pick up any on-disk edits to `common` before evaluating.
reload(common)
# Load the saved "rank-len" features and report precision/recall for each model.
common.test_features("rank-len")

Loading features rank-len.
Training models.
##        MultinomialNB        rank-len precision: 0.0% recall: 0.0%
##            LinearSVC        rank-len precision: 43.1% recall: 48.2%
##                  MLP        rank-len precision: 39.1% recall: 21.8%
##                 MLP2        rank-len precision: 39.4% recall: 21.5%


In [12]:
class ContentCommentCountFeatureExtractor(object):
    """Feature extractor that emits a single feature per row: ``totalComments``.

    The extractor is stateless: nothing is learned in ``train``, which simply
    delegates to ``transform``.
    """

    def train(self, train_data):
        """Return the comment-count features for ``train_data`` (no fitting is done)."""
        return self.transform(train_data)

    def transform(self, test_data):
        """Build an ``(n_rows, 1)`` CSR matrix from the ``"totalComments"`` column.

        Parameters
        ----------
        test_data : mapping-like
            Any object indexable by ``"totalComments"`` that yields a sequence
            of numbers (a dict of lists, a structured array, a data frame, ...).

        Returns
        -------
        scipy.sparse.csr_matrix
            One row per entry of ``test_data["totalComments"]`` and one column.
        """
        counts = np.asarray(test_data["totalComments"])
        # Infer the row count from the column itself (-1) rather than from
        # np.shape(test_data), so containers without a shape (e.g. a plain dict)
        # work the same way as they do for the other extractors in this notebook.
        return scipy.sparse.csr_matrix(counts.reshape(-1, 1))

In [23]:
# Run the comment-count extractor over the "ka-comments-balanced" dataset and
# save the resulting features under the name "all-counts".
common.extract_features("ka-comments-balanced", ContentCommentCountFeatureExtractor(), "all-counts")

Loading ka-comments-balanced dataset.
Training feature extractor all-counts.
Generating validation set...
Generating test set...
Writing to disk...
Done.


In [5]:
# Load the saved "all-counts" features and report precision/recall for each model.
common.test_features("all-counts")

Loading features all-counts.
Training models.
##        MultinomialNB      all-counts precision: 0.0% recall: 0.0%
##            LinearSVC      all-counts precision: 46.0% recall: 85.1%
##                  MLP      all-counts precision: 43.7% recall: 32.3%
##                 MLP2      all-counts precision: 47.6% recall: 24.4%


In [25]:
# Same comment-count extractor, applied to the "ka-replies-balanced" dataset;
# features are saved under the name "reply-counts".
common.extract_features("ka-replies-balanced", ContentCommentCountFeatureExtractor(), "reply-counts")

Loading ka-replies-balanced dataset.
Training feature extractor reply-counts.
Generating validation set...
Generating test set...
Writing to disk...
Done.


In [6]:
# Load the saved "reply-counts" features and report precision/recall for each model.
# NOTE(review): LinearSVC shows precision 50.0% / recall 100.0% in the recorded
# output, which on a balanced set looks like it predicts the positive class for
# every row. Confirm before drawing conclusions from that line.
common.test_features("reply-counts")

Loading features reply-counts.
Training models.
##        MultinomialNB    reply-counts precision: 0.0% recall: 0.0%
##            LinearSVC    reply-counts precision: 50.0% recall: 100.0%
##                  MLP    reply-counts precision: 68.7% recall: 38.2%
##                 MLP2    reply-counts precision: 65.6% recall: 39.5%


In [20]:
# Same comment-count extractor, applied to the "ka-rank-balanced" dataset;
# features are saved under the name "rank-counts".
common.extract_features("ka-rank-balanced", ContentCommentCountFeatureExtractor(), "rank-counts")

Loading ka-rank-balanced dataset.
Training feature extractor rank-counts.
Generating validation set...
Generating test set...
Writing to disk...
Done.


In [7]:
# Load the saved "rank-counts" features and report precision/recall for each model.
common.test_features("rank-counts")

Loading features rank-counts.
Training models.
##        MultinomialNB     rank-counts precision: 0.0% recall: 0.0%
##            LinearSVC     rank-counts precision: 54.0% recall: 99.4%
##                  MLP     rank-counts precision: 60.9% recall: 97.3%
##                 MLP2     rank-counts precision: 60.6% recall: 98.2%


In [8]:
# Evaluate the models on the "all-counts" and "all-len" feature sets together;
# the recorded output shows the two (n, 1) feature matrices being combined.
common.test_combined_features(["all-counts", "all-len"])

Loading features all-counts.
Loading features all-len.
Combining features: (488240, 1) + (488240, 1)
Training models.
##        MultinomialNB all-counts_all-len precision: 56.3% recall: 61.2%
##            LinearSVC all-counts_all-len precision: 50.0% recall: 100.0%
##                  MLP all-counts_all-len precision: 49.4% recall: 27.4%
##                 MLP2 all-counts_all-len precision: 50.2% recall: 42.2%


In [9]:
# Evaluate the models on the "reply-counts" and "reply-len" feature sets together.
common.test_combined_features(["reply-counts", "reply-len"])

Loading features reply-counts.
Loading features reply-len.
Combining features: (308216, 1) + (308216, 1)
Training models.
##        MultinomialNB reply-counts_reply-len precision: 63.1% recall: 61.6%
##            LinearSVC reply-counts_reply-len precision: 50.0% recall: 100.0%
##                  MLP reply-counts_reply-len precision: 71.7% recall: 35.6%
##                 MLP2 reply-counts_reply-len precision: 70.6% recall: 38.3%


In [10]:
# Pick up any on-disk edits to `common` before evaluating.
reload(common)
# Evaluate the models on the "rank-counts" and "rank-len" feature sets together.
# NOTE(review): LinearSVC reports 0.0% / 0.0% here although it scored well on
# "rank-counts" alone in the recorded output. Worth checking before relying on it.
common.test_combined_features(["rank-counts", "rank-len"])

Loading features rank-counts.
Loading features rank-len.
Combining features: (407128, 1) + (407128, 1)
Training models.
##        MultinomialNB rank-counts_rank-len precision: 61.7% recall: 92.7%
##            LinearSVC rank-counts_rank-len precision: 0.0% recall: 0.0%
##                  MLP rank-counts_rank-len precision: 68.8% recall: 83.4%
##                 MLP2 rank-counts_rank-len precision: 62.9% recall: 94.4%


In [8]:
import scipy.sparse

class ContentComparisonLengthFeatureExtractor(object):
    """Feature extractor for paired rows: emits ``[len(left), len(right)]``.

    The extractor is stateless: nothing is learned in ``train``, which simply
    delegates to ``transform``.
    """

    def train(self, train_data):
        """Return the pair-length features for ``train_data`` (no fitting is done)."""
        return self.transform(train_data)

    def transform(self, test_data):
        """Build an ``(n_rows, 2)`` CSR matrix of left/right content lengths.

        Parameters
        ----------
        test_data : mapping-like
            Any object indexable by ``"content_left"`` and ``"content_right"``,
            each yielding an iterable of sized values (e.g. strings). The two
            columns are paired with ``zip``, so extra entries in the longer one
            are ignored.

        Returns
        -------
        scipy.sparse.csr_matrix
            One row per (left, right) pair; column 0 is ``len(left)`` and
            column 1 is ``len(right)``.
        """
        pairs = zip(test_data["content_left"], test_data["content_right"])
        lengths = np.array(
            [(len(left), len(right)) for left, right in pairs],
            dtype=int,
        )
        # Reshape explicitly so that an empty input yields a (0, 2) matrix;
        # csr_matrix([]) would produce a (1, 0) matrix instead.
        return scipy.sparse.csr_matrix(lengths.reshape(-1, 2))


In [9]:
# Run the left/right length extractor over the "ka-binary" dataset and save the
# resulting two-column features under the name "binary-len".
common.extract_features("ka-binary", ContentComparisonLengthFeatureExtractor(), "binary-len")

Training feature extractor binary-len.
Generating validation set...
Generating test set...
Writing to disk...
Done.


In [11]:
# Load the saved "binary-len" features and report precision/recall for each model.
common.test_features("binary-len")

Loading features binary-len.
Training models.
##        MultinomialNB      binary-len precision: 51.5% recall: 51.8%
##            LinearSVC      binary-len precision: 50.0% recall: 100.0%
##                  MLP      binary-len precision: 51.4% recall: 51.9%
##                 MLP2      binary-len precision: 51.3% recall: 60.2%
