### Tensorflow model outline
#### Definitions phrase
* Decide on an architecture
* Define all variables as tensors
* Define how to generate outputs from your inputs and variables
* Define a cost function with respect to your predictions and you labels
* Define an optimizer that minimizes your cost function
#### Execution phase
* create an execution session
* Initialize your variables
* over n epochs, run the optimizer, feeding it some data in batches

In [1]:

import tensorflow as tf
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import label_binarize
import numpy as np
from sklearn.model_selection import train_test_split


class MLP(object):
    def __init__(self, X, y, layer_size = 1000):
        
        """
        32-512 rows per batch
        """
        self.X = X
        self.y = y
        self.n_classes = len(np.unique(y))
        self.hidden_dim = layer_size
        self.input_dim = X_train.shape[1]
        self.model_path = 'model.chkpt'
        self.saver = None
        self.graph = tf.Graph()
        self.default_dtype = tf.float64
        
        with self.graph.as_default():
            with tf.variable_scope('mlp_model') as scope:
                self.learning_rate = tf.Variable(0.0, dtype=tf.float32, trainable=False)
                self.x_input = tf.placeholder(X.dtype, shape = (None, self.input_dim))
                self.y_output = tf.placeholder(X.dtype, shape = (None, self.n_classes))
                self.weights = {
                    'weights1':tf.get_variable('weights1', (self.input_dim,self.hidden_dim ), dtype=self.default_dtype), 
                    'bias1':tf.get_variable('bias1', (self.hidden_dim, ), dtype=self.default_dtype), 
                    'weights2':tf.get_variable('weights2', (self.hidden_dim, self.n_classes ), dtype=self.default_dtype), 
                    'bias2':tf.get_variable('bias2', (self.n_classes, ), dtype=self.default_dtype)}
                self.get_logit_op = self.feed_forward(self.x_input, self.weights)
                self.predict_proba_op = tf.sigmoid(self.get_logit_op)
                self.predict_op = tf.argmax(self.predict_proba_op, axis=1)
                self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.y_output, logits=self.get_logit_op))
                self.optimizer = tf.train.AdamOptimizer().minimize(self.loss)            
    
    def feed_forward(self, x_input, weights):
        hidden = tf.matmul(x_input, weights['weights1'])
        hidden = tf.add(hidden, weights['bias1'])
        hidden = tf.nn.relu(hidden)
        output = tf.matmul(hidden, weights['weights2'])
        output = tf.add(output, weights['bias2'])
        return output

    def predict(self, X):
        
        with tf.Session(graph=self.graph) as sess:
            self.saver.restore(sess, self.model_path)
            preds = sess.run(self.predict_op, feed_dict={self.x_input:X})
        return preds
            
    def predict_proba(self, X, session):
        with tf.Session(graph=self.graph) as sess:
            self.saver.restore(sess, self.model_path)
            preds = sess.run(self.predict_proba_op, feed_dict={self.x_input:X})
        return preds
    
    def fit(self, epochs=100):
        
        epochs = range(epochs)
  
        with tf.Session(graph=self.graph) as sess:
            self.saver = tf.train.Saver()
            for var in self.graph.get_collection('variables'):
                sess.run(var.initializer)
                
            for epoch in epochs:
                sess.run(self.optimizer, feed_dict={self.x_input: self.X, 
                                                    self.y_output: self.y})
            
            self.saver.save(sess, self.model_path)


In [2]:
from sklearn.metrics import classification_report
cancer = load_breast_cancer()
X = cancer.data
y = label_binarize(cancer.target, classes=[0,1,2])[:, :2]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3)            
c = MLP(X, y)
c.fit()
print(classification_report(np.argmax(y_test, axis=1), c.predict(X_test)))

INFO:tensorflow:Restoring parameters from model.chkpt
             precision    recall  f1-score   support

          0       0.66      0.99      0.79        70
          1       0.98      0.64      0.78       101

avg / total       0.85      0.78      0.78       171



In [3]:
from sklearn.neural_network import MLPClassifier
gb = MLPClassifier(hidden_layer_sizes=(1000, ))
gb.fit(X_train, np.argmax(y_train, axis=1))
print("Train: ", np.mean(gb.predict(X_train) == np.argmax(y_train, axis=1)))
print("Test: ", np.mean(gb.predict(X_test) == np.argmax(y_test, axis=1)))

Train:  0.927135678392
Test:  0.87134502924


In [7]:
#!pip install gensim >> gensim-log.txt
#!pip install spacy >> spacy-log.txt
#!python -m spacy download en >> spacy-download.txt
#!pip install keras
#!pip install tensorflow
from gensim.models import Word2Vec
from sklearn.datasets import fetch_20newsgroups
from spacy.tokens import Doc
import spacy
from spacy.matcher import Matcher
from spacy.attrs import ORTH, IS_PUNCT
from collections import OrderedDict
from functools import partial


class TextProcesser(object):
    def __init__(self, nlp=None, max_len=200, max_vocab_size=20000):
        
        self.max_vocab_size = max_vocab_size
        self.max_len = max_len
        self.nlp = nlp or spacy.load('en')
        self.PADDING_VAL = 0
        self.MISSING_VAL = 1
        self.START_VAL = 2
        self.END_VAL = 3
        self.INDEX_OFFSET = 4
        self.vocab = OrderedDict()
        
    def pad(self, obj):
        n_pads = max(self.max_len - len(obj), 0)
        result = obj[:self.max_len] + [self.PADDING_VAL] * n_pads
        result[-1] = self.END_VAL
        return result
        
    def get_current_vocab_size(self):
        return len(self.vocab)
        
    def check_word(self, word):
        current_vocab_size = self.get_current_vocab_size() # 0
        if current_vocab_size <= self.max_vocab_size:
            if word not in self.vocab:
                self.vocab.update({word: current_vocab_size + self.INDEX_OFFSET}) #{'apple': 0}
        try:
            return self.vocab[word]
        except KeyError:
            return self.MISSING_VAL
        
    def __call__(self, corpus, merge_ents=True):
        docs = []
        if merge_ents:
            for doc in self.nlp.pipe(corpus, parse=False):
                for ent in doc.ents:
                    ent.merge()
                tokens = list(map(self.process_token, doc[:self.max_len]))
                tokens = [self.START_VAL] + tokens + [self.END_VAL]
                docs.append(self.pad(tokens))
        else:
            for doc in self.nlp.pipe(corpus, parse=False, tag=False, entity=False):
                tokens = list(map(self.process_token, doc[:self.max_len]))
                docs.append(self.pad(tokens))
        
        return docs
  
    def process_token(self, token):
        if token.like_url:
            return self.check_word("URL")
        elif token.like_email:
            return self.check_word("EMAIL")
        elif token.like_num:
            return self.check_word("NUM")
        else:
            return self.check_word(token.lower_)




In [8]:
#nlp = spacy.load('en')
dataset = fetch_20newsgroups()
corpus = dataset.data
processor = TextProcesser(nlp=nlp, max_len=100)
processed_corpus = processor(corpus, merge_ents=False)

In [None]:
from tensorflow.contrib.rnn import BasicLSTMCell
import tensorflow as tf
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import label_binarize
import numpy as np
from sklearn.model_selection import train_test_split
cancer = load_breast_cancer()
X = cancer.data
y = label_binarize(cancer.target, classes=[0,1,2])[:, :2]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3)

class RNN(object):
    def __init__(self, X, y, max_seq_len, word_embedding_size=300):
        
        """
        32-512 rows per batch
        """
        self.X = X
        self.y = y
        self.n_classes = len(np.unique(y))
        self.hidden_dim = layer_size
        self.max_seq_len = max_seq_len or () #len of X
        self.model_path = 'model.chkpt'
        self.saver = None
        self.graph = tf.Graph()
        self.default_dtype = tf.float64
        self.vocab_size
        with self.graph.as_default():
            with tf.variable_scope('rnn_model') as scope:
                self.learning_rate = tf.Variable(0.0, dtype=tf.float32, trainable=False)
                self.x_input = tf.placeholder(X.dtype, shape = (None, self.max_seq_len))
                self.y_output = tf.placeholder(X.dtype, shape = (None, self.n_classes))
                
                self.hidden_state = self.initial_state = tf.Variable(shape=(self.initial_state))
                
                #grab the vector associated with a given word
                self.U = tf.Variable(shape=(self.vocab_size, word_embedding_size))
                
                # tranform the previous state
                self.W = tf.Variable(shape=(word_embedding_size, word_embedding_size))
                
                #the bias
                self.bias1 = tf.Variable(shape= (word_embedding_size, ))
                
                
                self.get_logit_op = self.feed_forward(self.x_input)
                self.predict_proba_op = tf.sigmoid(self.get_logit_op)
                self.predict_op = tf.argmax(self.predict_proba_op, axis=1)
                self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.y_output, logits=self.get_logit_op))
                self.optimizer = tf.train.AdamOptimizer().minimize(self.loss)            
    
    def feed_forward(self, word_input):
        current_word_vector = tf.nn.embedding_lookup(self.U, word_inputs)
        previous_state_vector = 
        output, self.lstm_state = self.lstm(hidden, self.lstm_state)
        return output

    def predict(self, X):
        
        with tf.Session(graph=self.graph) as sess:
            self.saver.restore(sess, self.model_path)
            preds = sess.run(self.predict_op, feed_dict={self.x_input:X})
        return preds
            
    def predict_proba(self, X, session):
        with tf.Session(graph=self.graph) as sess:
            self.saver.restore(sess, self.model_path)
            preds = sess.run(self.predict_proba_op, feed_dict={self.x_input:X})
        return preds
    
    def fit(self, epochs=100):
        
        epochs = range(epochs)
  
        with tf.Session(graph=self.graph) as sess:
            self.saver = tf.train.Saver()
            for var in self.graph.get_collection('variables'):
                sess.run(var.initializer)
                
            for epoch in epochs:
                sess.run(self.optimizer, feed_dict={self.x_input: self.X, 
                                                    self.y_output: self.y})
            
            self.saver.save(sess, self.model_path)

In [None]:
tf.contrib.learn.run_n?