In [15]:
# Shared feature count used throughout the notebook: it is the first layer
# size passed to buildNetwork, the k given to SelectKBest, and the input
# dimension of the SupervisedDataSet built in train_model.
number_of_features = 5

In [16]:
from pybrain.tools.shortcuts import buildNetwork

# Initializes an array of five empty models
def initialize_models():
    """Create the untrained networks.

    Returns:
        list: five networks, each built with
        ``buildNetwork(number_of_features, 3, 1)``.
    """
    # One independent network per slot; nothing is shared between them.
    return [buildNetwork(number_of_features, 3, 1) for _ in range(5)]

In [10]:
# Takes in an existing model and a list of Example()s
# Expected to run full training of the model using
# all examples, and then return the model.
# Each Example has .review and .votes
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from pybrain.datasets import SupervisedDataSet
from pybrain.supervised.trainers import BackpropTrainer

# Module-level slots for the fitted TfidfVectorizer and SelectKBest.
# train_model() assigns both (via `global`) and model_test() reads them, so
# they always reflect the most recent train_model() call.
vectorizer = None
selector = None

def train_model(model, examples):
    """Fit the text pipeline on ``examples`` and train ``model`` on the result.

    A new TfidfVectorizer and a new SelectKBest are fitted on every call and
    stored in the module-level ``vectorizer`` / ``selector`` so that
    ``model_test`` can transform test reviews the same way. Because these are
    shared globals, they always belong to the most recent call.

    Args:
        model: network handed to ``BackpropTrainer``.
        examples: iterable of objects exposing ``.review`` (the text) and
            ``.votes['useful']`` (a value compared against 0).

    Returns:
        The same ``model`` object, after training.

    Raises:
        ValueError: if ``examples`` is empty.
    """
    global vectorizer, selector

    examples = list(examples)
    if not examples:
        # Fail with a clear message instead of a tuple-unpacking error.
        raise ValueError("train_model requires at least one example")

    reviews = [e.review for e in examples]
    # Binary target: True when the review has a positive 'useful' value.
    labels = [e.votes['useful'] > 0 for e in examples]

    # Fit a fresh vectorizer; this replaces whatever a previous call stored.
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform(reviews)

    # Run feature selection on the sparse TF-IDF matrix and densify only the
    # k selected columns, rather than materialising the full vocabulary.
    selector = SelectKBest(f_classif, k=number_of_features)
    features = selector.fit_transform(tfidf, labels).toarray()

    ds = SupervisedDataSet(number_of_features, 1)
    for row, label in zip(features, labels):
        ds.addSample(row, label)

    trainer = BackpropTrainer(model, ds)
    trainer.trainUntilConvergence()

    return model

In [11]:
# Takes a model and a testing example, and
# returns whether the model will correctly predict
# the testing example
def model_test(model, example):
    """Report whether ``model`` classifies ``example`` correctly.

    The review text is passed through the module-level ``vectorizer`` and
    ``selector`` fitted by the latest ``train_model`` call. The network
    output is thresholded at 0.5 and compared with the true label
    (``example.votes['useful'] > 0``).

    Args:
        model: trained network exposing ``activate``.
        example: object exposing ``.review`` and ``.votes['useful']``.

    Returns:
        bool: True when the thresholded prediction equals the true label.

    Raises:
        RuntimeError: if ``train_model`` has not fitted the vectorizer and
            selector yet.
    """
    if vectorizer is None or selector is None:
        raise RuntimeError(
            "model_test called before train_model fitted the vectorizer/selector"
        )

    expected = example.votes['useful'] > 0

    tfidf = vectorizer.transform([example.review]).toarray()
    features = selector.transform(tfidf)

    # activate() returns the output layer as an array; the networks in this
    # notebook are built with a single output unit, so take element 0 and
    # convert to a plain bool instead of returning a one-element array.
    output = model.activate(features[0])
    predicted = bool(output[0] > 0.5)

    return predicted == expected

In [12]:
class Jole:
    """Bundle of the three notebook callbacks, exposed as instance attributes.

    Attributes:
        initialize_models: the module-level ``initialize_models`` function.
        train_model: the module-level ``train_model`` function.
        model_test: the module-level ``model_test`` function.
    """

    def __init__(self):
        # Stored on the instance (not the class), so calling e.g.
        # ``jole.train_model(m, ex)`` does not inject ``self``.
        (self.initialize_models,
         self.train_model,
         self.model_test) = (initialize_models, train_model, model_test)
        
# Single instance of the callback bundle; it is passed to main.engage below.
jole = Jole()

In [13]:
import main

In [14]:
# Run the harness from the local `main` module on the smaller review file.
# NOTE(review): the meaning of `stochastic` and `sample_size` is defined in
# main.engage, which is not shown in this notebook — confirm there.
main.engage(jole, filename='../data/smaller_reviews.json', stochastic=False, sample_size=100)

1. Results:
0/20 = 0.0%
2. Results:
5/20 = 0.25%
3. Results:
11/20 = 0.55%
4. Results:
15/20 = 0.75%
5. Results:
14/20 = 0.7%
Overall accuracy: 0.45
