In [None]:
#!/usr/bin/python

from argparse import ArgumentParser
from pan import ProfilingDataset
from tictacs import from_recipe
from sklearn.grid_search import GridSearchCV
#reload(pan.features)



log = []


def cross_val(dataset, task, model, num_folds=4):
    """ train and cross validate a model

    :lang: the language
    :task: the task we want to classify for , ex: age

    """

    #X, y = dataset.get_data(task)
    docs = createDocProfiles(dataset)
    X, y = create_target_prof_trainset(docs, task)
    del docs
    #return X
    # get parameters for grid search if it exists - else pass empty dict
    params = model.grid_params if hasattr(model, 'grid_params') else dict()
    print '\nCreating model for %s - %s' % (dataset.lang, task)
    print 'Using %s fold validation' % (num_folds)
    # get data
    #log.append('\nResults for %s - %s with classifier %s' %
    #           (dataset.lang, task, model.__class__.__name__))
    if task in dataset.config.classifier_list:
        grid_cv = GridSearchCV(model, params, cv=num_folds, verbose=1,
                               n_jobs=-1)
        grid_cv.fit(X, y)
        accuracy = grid_cv.best_score_
        log.append('best params: %s' % grid_cv.best_params_)
        log.append('Accuracy mean : %s' % accuracy)
    else:
        # if it's not, we measure mean square root error (regression)
        grid_cv = GridSearchCV(model, params, scoring='mean_squared_error',
                               cv=num_folds, verbose=1, n_jobs=-1)
        grid_cv.fit(X, y)
        accuracy = grid_cv.best_score_
        log.append('root mean squared error : %s' % accuracy)

if __name__ == '__main__':
    parser = ArgumentParser(description='Train a model with crossvalidation'
                            ' on pan dataset - used for testing purposes ')
    parser.add_argument('-i', '--input', type=str,
                        required=True, dest='infolder',
                        help='path to folder with pan dataset for a language')
    parser.add_argument('-n', '--numfolds', type=int,
                        dest='num_folds', default=4,
                        help='Number of folds to use in cross validation')

num_folds = 2
infolder = "./pan15-author-profiling-training-dataset-2015-04-23/pan15-author-profiling-training-dataset-english-2015-04-23/"

print('Loading dataset...')
dataset = ProfilingDataset(infolder)
print('Loaded %s users...\n' % len(dataset.entries))
config = dataset.config
tasks = config.tasks
print('\n--------------- Thy time of Running ---------------')
for task in tasks:
    if task == "age":
        tictac = from_recipe(config.recipes[task])
        z = cross_val(dataset, task, tictac, num_folds)
        # print results at end
        print('\n--------------- Thy time of Judgement ---------------')
    for message in log:
        print(message)

In [None]:
import pan
reload(pan.preprocess)
dataset = ProfilingDataset(infolder)
X, y = dataset.get_data(task)
b = [X[0][0:100]]
b.append(X[1][0:100])
print b

In [None]:
tictac.get_params

In [None]:
class DocProfile(object):
    
    """ Per Document Representation. Returns an instance of a document profile.
    
    """
    def __init__(self, entry, prof_id, doc_id):
        """ Initialization.
            -entry : contains most information. Comes from ProfilingDataset Class.
            -prof_id: index for intra-profile document position
            -doc_id: index for global documend indexing
        
        """
        
        self.userid = entry.userid
        self.lang = entry.lang
        self.media = entry.media
        self.gender = entry.gender
        self.age = entry.age
        self.prof_id = prof_id
        self.doc_id = doc_id
        self.text = entry.texts[prof_id]
        
    def __repr__(self):
        """ IPython friendly output
        :returns: str

        """
        # automatically capture all non iterables
        # (we want custom formatting for text list)
        attr_string = '\n'.join(['%s : %s' % (key, value)
                                 for key, value in self.__dict__.items()
                                 if not hasattr(value, '__iter__')])
        # print a snippet
        return attr_string
    
    def datafy(self, feature='none'):
        """Return a tuple of data - training and label if feature is not none

        :feature: the feature we want the label for
        :returns: tuple of data, label

        """
        if feature == 'none':
            return self.text
        else:
            return [self.text, self.__dict__[feature]]

def createDocProfiles(dataset):
    """ Create a list of the DocProfiles classes.
        -dataset: ProfilingDataset Object
        
        returns:
        -a list of DocProfile Objects
    """
    docs = []       
    doc_id = 0
    for entry in dataset.entries:
        for prof_id in range(0, len(entry.texts)):
            docs.append(DocProfile(entry, prof_id, doc_id))
            doc_id += 1
    return docs
    
def create_target_prof_trainset(docs, target_feature):
    """ Create a dataset according to train a specifici model regardin a certain feature.
        Like get_data() method from ProfilingDataset class.
        -docs: list of documents. Expects instances of class DocProfile. 
        -target_feature: filter feature
        
        returns:
        (X,y) : returns tuple - list of texts, list of labels 
        
    """
    wanted = []
    for doc in docs:
        if target_feature in doc.__dict__:
            wanted.append(doc.datafy(feature=target_feature))
        else:
            raise KeyError("task doesn't exist in DocProfile dic()")
    # zip produces tuples, we want to be able to modify
    # the contents in preprocessing in place
    # therefore we create we replace tuples with lists using map
    # returns tuple - list of texts, list of labels
    return map(list, zip(*wanted))

        
docs = createDocProfiles(dataset)

In [None]:
import numpy
a = [[0.25,0.25,0.25,0.25], [0.5,0,0.2,0.25], [0.2,0.3,0,0.5]]
b = [[1,0], [0,1],[1,0]]
numpy.dot(numpy.array(a).T,numpy.array(b))

In [None]:
task = 'age'
docs = createDocProfiles(dataset)
X, y = create_target_prof_trainset(docs, task)

In [None]:
from pan.misc import *
class SOA_Model2(object):


    """ Models that extracts Second Order Attributes (SOA) base on PAN 2013-2015 Winners"""

    def __init__(self):
        from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
        
        #stop_list = []
        #with open(stopwords_path, 'r') as stop_inp:
       # for w in stop_inp:
       # stop_list.append(w.replace("\n", ""))
        self.term_table = None
        self.labels = None
        #self.counter = CountVectorizer()
        self.counter = TfidfVectorizer(use_idf=False)

    def fit(self, X, y=None):
        import numpy
        from math import log
        
        if y:
            #tokens = [_twokenize.tokenizeRawTweetText(text) for text in X]
            #voc = set()
            #for token in tokens:
            #    voc = voc.union(token)
            #print len(voc)
            #print list(voc)[:100]
            parameters = {
                'input':'content', 
                'encoding':'utf-8', 
                'decode_error':'ignore', 
                #'vocabulary':list(voc),
                'tokenizer':lambda text:_twokenize.tokenizeRawTweetText(text)
                #'max_df':0.9,
                #'min_df':5
                #'max_features':20000
               }
            self.counter.set_params(**parameters) 
            #print "Oleeeeeeeeeeeeeeeeeeeeeeeeee"
            #print texts
            #print tokens
            #print list(voc)
            target_profiles = sorted(list(set(y)))
            print len(target_profiles)
            #return
            doc_term = self.counter.fit_transform(X)
            print "Doc_Terms"
            print doc_term.shape
            #return 
            #X1 = X.toarray()
            #X1 = X1.astype('float', casting='unsafe')
            target_profiles = sorted(list(set(y)))
            self.labels = target_profiles
            doc_prof = numpy.zeros([doc_term.shape[0], len(target_profiles)])
            for i in range(0, doc_term.shape[0]):
                tmp = numpy.zeros([1,len(target_profiles)])
                tmp[0, target_profiles.index(y[i])] = 1
                doc_prof[i,:] = tmp
            print "Doc_Prof"
            print doc_prof.shape
            term_prof = numpy.zeros([doc_term.shape[1], len(target_profiles)])
            term_prof = numpy.dot(numpy.log2(doc_term.toarray().astype('float', casting='unsafe').T + 1), doc_prof)
            print "Term_Prof"
            print term_prof.shape
            term_prof = term_prof / numpy.reshape(term_prof.sum(axis=1), (term_prof.sum(axis=1).shape[0], 1))
            #term_prof = term_prof / term_prof.sum(axis=0)
            self.term_table = term_prof
            print "GG"
            return self
    
    def transform(self, X):
        
        import numpy
        
        if self.labels==None:
            raise AttributeError('term_table was no found! Probably model was not fitted first. Run model.fit(X,y)!')
        else:
            doc_term = self.counter.transform(X)
            doc_prof = numpy.zeros([doc_term.shape[0], self.term_table.shape[1]])
            doc_prof = numpy.dot(doc_term.toarray().astype('float', casting='unsafe'), self.term_table)
            return doc_prof
            
    def predict(self, X):
        
        import numpy
        
        doc_prof = self.transform(X)
        y_pred = []
        for i in range(0, doc_prof.shape[0]):
            y_pred.append(self.labels[numpy.argmax(doc_prof[i])])
        return y_pred
        

In [None]:
from pan.misc import _twokenize
import pan
reload(pan)
c = SOA_Model2()
c.fit(X,y)

In [None]:
y_pred = c.predict(X)


In [None]:
for k, v in c.counter.vocabulary_.iteritems():
    if v>8000 and v< 9000:
        pass
        #print k, v
top_words = [[] for i in range(0, c.term_table.shape[1])]
cc= 0
for i in range(0, c.term_table.shape[0]):
    if max(c.term_table[i]) > 0.7:
        #print c.term_table[i], c.counter.vocabulary_.keys()[c.counter.vocabulary_.values().index(i)]
        top_words[list(c.term_table[i]).index(max(c.term_table[i]))].append(c.counter.vocabulary_.keys()[c.counter.vocabulary_.values().index(i)])
        cc += 1
top_words
        #c.term_table /= c.term_table[8411].sum(axis= 0)
#c.term_table[8411]

In [None]:
import numpy
tmp = numpy.zeros([2,4])
tmp[0,1]=1
tmp[0,3]= 1
tmp[1,2] = 1
tmp / numpy.reshape(tmp.sum(axis=1), (tmp.sum(axis=1).shape[0], 1))

In [None]:
import pan
reload(pan.features)
c = pan.features.SOA_Model2()
c.fit_transform(X, y)
a = ["I am very good!"]
#c.transform(X, y)
#from pprint import pprint
#pprint(dataset.get_data()[0])
#pprint(dataset.entries[0].texts[0])

In [None]:
X[0]

In [None]:
import numpy
aaa = numpy.asarray([[1, 2], [3, 4]], dtype=float)
bb = aaa.sum(axis=1)
print numpy.reshape(bb, (1,2))
print aaa/numpy.reshape(bb, (bb.shape[0],1))
print aaa/aaa.sum(axis=0)

In [None]:
import numpy
from sklearn.preprocessing import normalize
aaa = numpy.asarray([[1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=float)
print "sci-kit"
cc = normalize(aaa, axis=1, norm='l1')
print normalize(aaa, axis=1, norm='l1')
print normalize(cc, axis=0, norm='l1')
print numpy.sum(aaa,axis=1, keepdims=True)
print numpy.linalg.norm(aaa, axis=1)
aaa = numpy.true_divide(aaa, numpy.sum(aaa,axis=1, keepdims=True), dtype=float)
print numpy.sum(aaa,axis=0, keepdims=True)
aaa = numpy.true_divide(aaa, numpy.sum(aaa,axis=0, keepdims=True), dtype=float)
print aaa

In [None]:
#import pan
reload(pan.features)
log = []
#import logging
#log = logging.getLogger()
#log.setLevel(logging.INFO)
#log.addHandler(logging.StreamHandler())
infolder = "./pan15-author-profiling-training-dataset-2015-04-23/pan15-author-profiling-training-dataset-english-2015-04-23/"
modelfile
print('Loading dataset...')
dataset = ProfilingDataset(infolder)
print('Loaded %s users...\n' % len(dataset.entries))
config = dataset.config
tasks = config.tasks
all_models = {}
for task in tasks:
    print('Learning to judge %s..' % task)
    # load data
    if task == "age":
        X, y = dataset.get_data(task)
        tictac = from_recipe(config.recipes[task])
        all_models[task] = tictac.fit(X, y)
print('Writing model to {}'.format(modelfile))

In [None]:
import tictacs
tictacs.__file__

In [None]:
a = set()
a.add(1)
a.add(2)
a.add(3)
print(a[0:2])

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
a = CountVectorizer(input=)

In [None]:
reload(pan.features)
c = pan.features.SOA_Model()
a = ["I am very good!"]
#aa = c.fit_transform(a)
print b
c.fit([b])
print aa
print c.counter.vocabulary_
kk = c.transform([b])
print kk

In [None]:
import gensim

documents = ["Human machine interface for lab abc computer applications",
              "A survey of user opinion of computer system response time",
             "The EPS user interface management system",
              "System and human system engineering testing of EPS",
              "Relation of user perceived response time to error measurement",
            "The generation of random binary unordered trees",
              "The intersection graph of paths in trees",
             "Graph minors IV Widths of trees and well quasi ordering",
              "Graph minors A survey"]
 # remove common words and tokenize
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
          for document in documents]

# remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
        
texts = [[token for token in text if frequency[token] > 1]
         for text in texts]
from pprint import pprint   # pretty-printer
pprint(texts)
dictionary = gensim.corpora.Dictionary(texts)

In [None]:
corpus = [dictionary.doc2bow(text) for text in texts]
model = gensim.models.LdaModel(corpus, id2word=dictionary, num_topics=5)
model.print_topics()

In [None]:
model.log_perplexity(corpus)

In [None]:
pow(2,11)

In [None]:
#!/usr/bin/python

from argparse import ArgumentParser
from pan import ProfilingDataset
from tictacs import from_recipe
from sklearn.grid_search import GridSearchCV
#reload(pan.features)



log = []


def cross_val(dataset, task, model, num_folds=4):
    """ train and cross validate a model

    :lang: the language
    :task: the task we want to classify for , ex: age

    """

    #X, y = dataset.get_data(task)
    docs = createDocProfiles(dataset)
    X, y = create_target_prof_trainset(docs, task)
    del docs
    #return X
    # get parameters for grid search if it exists - else pass empty dict
    params = model.grid_params if hasattr(model, 'grid_params') else dict()
    print '\nCreating model for %s - %s' % (dataset.lang, task)
    print 'Using %s fold validation' % (num_folds)
    # get data
    #log.append('\nResults for %s - %s with classifier %s' %
    #           (dataset.lang, task, model.__class__.__name__))
    if task in dataset.config.classifier_list:
        grid_cv = GridSearchCV(model, params, cv=num_folds, verbose=1,
                               n_jobs=-1)
        grid_cv.fit(X, y)
        accuracy = grid_cv.best_score_
        log.append('best params: %s' % grid_cv.best_params_)
        log.append('Accuracy mean : %s' % accuracy)
    else:
        # if it's not, we measure mean square root error (regression)
        grid_cv = GridSearchCV(model, params, scoring='mean_squared_error',
                               cv=num_folds, verbose=1, n_jobs=-1)
        grid_cv.fit(X, y)
        accuracy = grid_cv.best_score_
        log.append('root mean squared error : %s' % accuracy)

if __name__ == '__main__':
    parser = ArgumentParser(description='Train a model with crossvalidation'
                            ' on pan dataset - used for testing purposes ')
    parser.add_argument('-i', '--input', type=str,
                        required=True, dest='infolder',
                        help='path to folder with pan dataset for a language')
    parser.add_argument('-n', '--numfolds', type=int,
                        dest='num_folds', default=4,
                        help='Number of folds to use in cross validation')

num_folds = 2
infolder = "./pan15-author-profiling-training-dataset-2015-04-23/pan15-author-profiling-training-dataset-english-2015-04-23/"

print('Loading dataset...')
dataset = ProfilingDataset(infolder)
print('Loaded %s users...\n' % len(dataset.entries))
config = dataset.config
tasks = config.tasks
print('\n--------------- Thy time of Running ---------------')
for task in tasks:
    if task == "age":
        tictac = from_recipe(config.recipes[task])
        z = cross_val(dataset, task, tictac, num_folds)
        # print results at end
        print('\n--------------- Thy time of Judgement ---------------')
    for message in log:
        print(message)


all_models = {}
docs = createDocProfiles(dataset)
for task in tasks:
    if task =='age':
        print('Learning to judge %s..' % task)
        # load data
        X, y = create_target_prof_trainset(docs, task)
        #X, y = dataset.get_data(task)
        tictac = from_recipe(config.recipes[task])
        all_models[task] = tictac.fit(X, y)

In [None]:

from gensim import corpora, models, similarities

documents = ["Human machine interface for lab abc computer applications",
              "A survey of user opinion of computer system response time",
              "The EPS user interface management system",
             "System and human system engineering testing of EPS",
             "Relation of user perceived response time to error measurement",
              "The generation of random binary unordered trees",
             "The intersection graph of paths in trees",
              "Graph minors IV Widths of trees and well quasi ordering",
              "Graph minors A survey"]
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
          for document in documents]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [None]:
import gensim

model =gensim.models.LdaModel(corpus, id2word=dictionary, num_topics=100, minimum_probability=0)
a = model[corpus[0]]

In [None]:
lsi = gensim.models.LsiModel(corpus, id2word=dictionary, num_topics=2)
lsi[corpus[2]]
import numpy
#len(dictionary)

In [None]:
c = lsi[corpus]
l = [list(zip(*cc)[1]) for cc in c]
#l = []
#for cc in c:
    #print cc
#    l.append(list(zip(*cc)[1]))
print numpy.array(l)
    #print list(zip(*cc)[1])
    #for k in cc:
    #    print k

In [None]:
import pan.features
reload(pan.features)
pan.features.TWCNB.__dict__

In [None]:
import time
from argparse import ArgumentParser
from pan import ProfilingDataset, createDocProfiles, create_target_prof_trainset
from tictacs import from_recipe
from json import dumps
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score

log = []


def cross_val(dataset, task, model, num_folds=4):
    """ train and cross validate a model

    :lang: the language
    :task: the task we want to classify for , ex: age

    """

    # if (task != "age") and (task !="gender"):
    #    X, y = dataset.get_data(task)
    # else:
    #    docs = createDocProfiles(dataset)
    #    X, y = create_target_prof_trainset(docs, task)
    X, y = dataset.get_data(task)
    # y = [yy.lower() for yy in y]
    # get parameters for grid search if it exists - else pass empty dict
    params = model.grid_params if hasattr(model, 'grid_params') else dict()
    # from collections import Counter
    # import pprint
    # pprint.pprint(Counter(y))
    print '\nCreating model for %s - %s' % (dataset.lang, task)
    print 'Trainining instances: %s\n' % (len(X))
    print 'Using %s fold validation' % (num_folds)
    # get data
    log.append('\nResults for %s - %s with classifier %s' %
               (dataset.lang, task, model.__class__.__name__))
    if task in dataset.config.classifier_list:
        grid_cv = GridSearchCV(model, params, cv=num_folds, verbose=1,
                               n_jobs=-1)
        grid_cv.fit(X, y)
        # y_pred = grid_cv.best_estimator_.predict(X)
        # pprint.pprint(y_pred)
        # pprint.pprint(y)
        # conf = confusion_matrix(y, y_pred, labels=list(set(y)))
        accuracy = grid_cv.best_score_
        # accuracy2 = accuracy_score(y, y_pred)
        log.append('best params: %s' % grid_cv.best_params_)
        log.append('Accuracy mean : %s' % accuracy)
        import pprint
        pprint.pprint(grid_cv.grid_scores_)
        with open('./comb_res/res.txt', 'a') as out:
            out.write(' Results: %s - %s, params: %s ,Accuracy_Mean: %s\n' %
                      (dataset.lang, task,
                       dumps(grid_cv.best_params_), grid_cv.best_score_))
        # log.append('Best accuracy: {} '.format(accuracy2))
        # log.append('Best Confusion matrix :\n {}'.format(conf))
    else:
        # if it's not, we measure mean square root error (regression)
        raise KeyError('task %s was not found in task list!' % task)



infolder = '../DATA/pan16-author-profiling-training-dataset-2016-02-29/pan16-author-profiling-training-dataset-english-2016-02-29/'
num_folds = 3
time_start = time.time()
print('Loading dataset...')
dataset = ProfilingDataset(infolder)
print('Loaded %s users...\n' % len(dataset.entries))
config = dataset.config
tasks = config.tasks
print('\n--------------- Thy time of Running ---------------')
for task in tasks:
    tictac = from_recipe(config.recipes[task])
    import pprint
    #pprint.pprint(tictac.__dict__)
    #exit(1)
    steps = tictac.steps
    #print type(steps)
    outline = ""
    for step in steps:
        if step[0]=="features":
            # print type(step[1])
            for tf in step[1].transformer_list:
                #print type(tf[1])
                #print type(tf[1].get_params())
                outline += tf[0] + " with Params:[" + str(tf[1].get_params()) + "]+"
        else:
#            if hasattr(step[1], 'get_params'):
#                outline += step[0] + " with Params:[" + str(step[1].get_params()) + "]+"
#            else:
#                outline += step[0]+ "+"
            outline += step[0]+ "+"
    outline = outline[:-1] + "\n"
    print('Task:{}, Pipeline:{}'.format(task, outline))
    with open('./comb_res/res.txt', 'a') as out:
        out.write('Task:{}, Pipeline:{}'.format(task, outline))
    cross_val(dataset, task, tictac, num_folds)
# print results at end
print('\n--------------- Thy time of Judgement ---------------')
print ('Time: {} seconds.\n'.format(str(time.time()-time_start)))
with open('./comb_res/res.txt', 'a') as out:
    out.write('Time: {} seconds.\n'.format(str(time.time()-time_start)))
for message in log:
    print(message)


In [None]:
import dill
dill.pickles(tictac)
dill.detect.badtypes(tictac).__dict__.keys()

In [None]:
##### TRAIN ############


#!/usr/bin/python

import os
from argparse import ArgumentParser
from sklearn.externals import joblib
from tictacs import from_recipe
from pan import ProfilingDataset
import dill
import cPickle as pickle
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.metrics import accuracy_score, confusion_matrix


infolder = "../pan16-author-profiling-training-dataset-2016-02-29/pan16-author-profiling-training-dataset-english-2016-02-29/"
outfolder = "models/"
print('Loading dataset->Grouping User texts.\n')
dataset = ProfilingDataset(infolder)
print('Loaded {} users...\n'.format(len(dataset.entries)))
# get config
config = dataset.config
tasks = config.tasks
print('\n--------------- Thy time of Running ---------------')
all_models = {}
for task in tasks:
    print('Learning to judge %s..' % task)
    # load data
    X, y = dataset.get_data(task)
    tictac = from_recipe(config.recipes[task])
    all_models[task] = tictac.fit(X, y)
modelfile = os.path.join(outfolder, '%s2.bin' % dataset.lang)
print('Writing model to {}'.format(modelfile))
#fo = open(modelfile,  'wb')
#import pprint
#print type(all_models)
#print modelfile
#dill.dump(all_models, fo, protocol=pickle.HIGHEST_PROTOCOL)
#fo.close()
# pickle.dump(all_models, modelfile)
# dill.dump(all_models, modelfile)
joblib.dump(all_models, modelfile, compress=3)

In [None]:
import numpy

a = numpy.array([[1,2],[3,4]], dtype=float)
b = numpy.array([[0.1,0.2],[0.3,0.4]], dtype=float)
type(a[0,0])

In [None]:
a=1

In [None]:
a = numpy.array([1,2,3,4])
print a.shape
b = numpy.tile(a, (5, 1))
b

In [None]:
c = b.sum(axis=1)
print c.shape, type(c)

In [None]:
from sklearn.preprocessing import normalize
import pprint
pprint.pprint(a)
normalize(a, norm='l1', axis=1, copy=False)
pprint.pprint(a)

In [1]:
import os
from argparse import ArgumentParser
from sklearn.externals import joblib
from tictacs import from_recipe
from pan import ProfilingDataset
import dill
import cPickle as pickle
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.metrics import accuracy_score, confusion_matrix


infolder = "../DATA/pan16-author-profiling-training-dataset-2016-04-25/pan16-author-profiling-training-dataset-english-2016-02-29/"
outfolder = "models/"
print('Loading dataset->Grouping User texts.\n')
dataset = ProfilingDataset(infolder)
print('Loaded {} users...\n'.format(len(dataset.entries)))
# get config
config = dataset.config
tasks = config.tasks
print('\n--------------- Thy time of Running ---------------')
all_models = {}
for task in tasks:
    print('Learning to judge %s..' % task)
    # load data
    X, y = dataset.get_data(task)

Loading dataset->Grouping User texts.

Loaded 436 users...


--------------- Thy time of Running ---------------
Learning to judge age..
Learning to judge gender..


In [3]:
from sklearn.cross_validation import train_test_split
from collections import Counter
import pprint
print "Num of samples: " + str(len(y))
pprint.pprint(Counter(y))
X, y = dataset.get_data('age')
print len(X)

X, X_cv, X, y_cv = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)
X_cv, X_test, y_cv, y_test = train_test_split(X_cv, y_cv, test_size=0.5, random_state=42, stratify=y_cv)

print len(X_cv), len(X_test), len(X) , len(X)+ len(X_cv) + len(X_test)
pprint.pprint(Counter(y))
pprint.pprint(Counter(y_cv))
pprint.pprint(Counter(y_test))

Num of samples: 436
Counter({'35-49': 182, '25-34': 140, '50-64': 80, '18-24': 28, '65-xx': 6})
436
87 87 262 436
Counter({'35-49': 182, '25-34': 140, '50-64': 80, '18-24': 28, '65-xx': 6})
Counter({'35-49': 36, '25-34': 28, '50-64': 16, '18-24': 6, '65-xx': 1})
Counter({'35-49': 37, '25-34': 28, '50-64': 16, '18-24': 5, '65-xx': 1})


In [43]:
#reload(preprocess)
#reload(features)
from pan import features
from pan import preprocess
X, y = dataset.get_data('age')
#X, y = dataset.get_data('gender')
print len(X)
#print X[0]
X = preprocess.preprocess(X)
#print "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$"
#print X[0]

436
    -Cleaning html
    -Detwittifying
    -Removing Numbers
    -Removing Punctuation
    -Removing Links


### 3grams+soa+soac

In [4]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

grams3 = TfidfVectorizer(analyzer='word', ngram_range=[3,3], max_features=5000, stop_words='english')
soa = features.SOA_Model2(max_df=1.0, min_df=5, tokenizer_var='sklearn', max_features=None)
soac = features.SOAC_Model2(max_df=1.0, min_df=5, tokenizer_var='sklearn', max_features=5000)
countTokens = features.CountTokens()
countHash = features.CountHash()
countUrls = features.CountURLs()
countReplies = features.CountReplies()
svm = SVC(kernel='rbf', C=1, gamma=1, class_weight='balanced', probability=True)
#svm = DecisionTreeClassifier()
combined = FeatureUnion([('3grams', grams3), ('soa', soa)])
pipe = Pipeline([('combined',combined), ('svm', svm)])
pipe.steps

[('combined', FeatureUnion(n_jobs=1,
         transformer_list=[('3grams', TfidfVectorizer(analyzer='word', binary=False, decode_error=u'strict',
          dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
          lowercase=True, max_df=1.0, max_features=5000, min_df=1,
          ngram_range=[3, 3], norm=u'l2', preprocessor=None, smo...ulary=None)), ('soa', SOA_Model2(max_df=1.0, max_features=None, min_df=5, tokenizer_var='sklearn'))],
         transformer_weights=None)),
 ('svm', SVC(C=1, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape=None, degree=3, gamma=1, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True,
    tol=0.001, verbose=False))]

### Counts + SOA+SOAC. Ommit preprocess!!


In [71]:
reload(features)
features.SOAC_Model2.__doc__

' Complementary of SOA model 22'

In [5]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


grams3 = TfidfVectorizer(analyzer='word', ngram_range=[3,3], max_features=5000, stop_words='english')
countTokens = features.CountTokens()
countHash = features.CountHash()
countUrls = features.CountURLs()
countReplies = features.CountReplies()
soa = features.SOA_Model2(max_df=1.0, min_df=5, tokenizer_var='sklearn', max_features=None)
soac = features.SOAC_Model2(max_df=1.0, min_df=5, tokenizer_var='sklearn', max_features=5000)
scaler = StandardScaler()#MinMaxScaler()#StandardScaler()
#svm = DecisionTreeClassifier()
svm = SVC(kernel='rbf', C=0.1, gamma=1, class_weight='balanced', probability=True)
#combined = FeatureUnion([('soa', soa), ('soac', soac)])
#combined = FeatureUnion([('count_tokens', countTokens), ('count_hash', countHash),
#                         ('count_urls', countUrls), ('count_replies', countReplies), 
#                          ('soa', soa), ('soac', soac)])
combined = FeatureUnion([('count_tokens', countTokens), ('count_hash', countHash),
                         ('count_urls', countUrls), ('count_replies', countReplies)])
pipe = Pipeline([('3grams', grams3), ('svm', svm)])
#pipe = Pipeline([('soac',soac), ('svm', svm)])
#pipe = Pipeline([('combined',combined), ('svm', svm)])
pipe.steps

[('3grams',
  TfidfVectorizer(analyzer='word', binary=False, decode_error=u'strict',
          dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
          lowercase=True, max_df=1.0, max_features=5000, min_df=1,
          ngram_range=[3, 3], norm=u'l2', preprocessor=None, smooth_idf=True,
          stop_words='english', strip_accents=None, sublinear_tf=False,
          token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
          vocabulary=None)),
 ('svm', SVC(C=0.1, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape=None, degree=3, gamma=1, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True,
    tol=0.001, verbose=False))]

In [None]:
### LDA

In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from pan.features import LDA

LDAmodel = LDA(num_topics=30, lib='sklearn')
soa = features.SOA_Model2(max_df=1.0, min_df=5, tokenizer_var='sklearn', max_features=None)
soac = features.SOAC_Model2(max_df=1.0, min_df=5, tokenizer_var='sklearn', max_features=5000)
countTokens = features.CountTokens()
countHash = features.CountHash()
countUrls = features.CountURLs()
countReplies = features.CountReplies()
#svm = SVC(kernel='rbf', C=1, gamma=1, class_weight='balanced')
svm = DecisionTreeClassifier()
combined = FeatureUnion([('LDA', LDAmodel)])#, ('soa', soa), ('soac', soac)])
pipe = Pipeline([('combined',combined), ('svm', svm)])
pipe.steps

# Soft Voting

### AGE

In [45]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from pan.features import SOA_Model2

grams3 = TfidfVectorizer(analyzer='word', ngram_range=[3,3], max_features=5000, stop_words='english')
soa = features.SOA_Model2(max_df=1.0, min_df=1, tokenizer_var='sklearn', max_features=None)
combined = FeatureUnion([('3grams', grams3), ('soa', soa)])
svm = SVC(kernel='rbf', C=0.1, gamma=1, class_weight='balanced', probability=True)
pipe = Pipeline([('3grams+soa',combined), ('svm', svm)])
pipe.steps

[('3grams+soa', FeatureUnion(n_jobs=1,
         transformer_list=[('3grams', TfidfVectorizer(analyzer='word', binary=False, decode_error=u'strict',
          dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
          lowercase=True, max_df=1.0, max_features=5000, min_df=1,
          ngram_range=[3, 3], norm=u'l2', preprocessor=None, smo...ulary=None)), ('soa', SOA_Model2(max_df=1.0, max_features=None, min_df=1, tokenizer_var='sklearn'))],
         transformer_weights=None)),
 ('svm', SVC(C=0.1, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape=None, degree=3, gamma=1, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True,
    tol=0.001, verbose=False))]

In [46]:
soac = features.SOAC_Model2(max_df=1.0, min_df=1, tokenizer_var='sklearn', max_features=None)
svm = SVC(kernel='rbf', C=10, gamma=1, class_weight='balanced', probability=True)
#combined = FeatureUnion([('count_tokens', countTokens), ('count_hash', countHash),
#                         ('count_urls', countUrls), ('count_replies', countReplies), 
#                          ('soa', soa), ('soac', soac)])
#combined = FeatureUnion([('count_tokens', countTokens), ('count_hash', countHash),
#                         ('count_urls', countUrls), ('count_replies', countReplies)])
pipe1 = Pipeline([('soac',soac), ('svm', svm)])
pipe1.steps

[('soac', SOAC_Model2(max_df=1.0, max_features=None, min_df=1, thres=0.1,
        tokenizer_var='sklearn')),
 ('svm', SVC(C=10, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape=None, degree=3, gamma=1, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True,
    tol=0.001, verbose=False))]

In [47]:
from pan.features import LDA
LDAmodel = LDA(num_topics=30, lib='sklearn')
countTokens = features.CountTokens()
countHash = features.CountHash()
countUrls = features.CountURLs()
countReplies = features.CountReplies()
combined = FeatureUnion([('count_tokens', countTokens), ('count_hash', countHash),
                         ('count_urls', countUrls), ('count_replies', countReplies)])
svm = SVC(kernel='rbf', C=10, gamma=1, class_weight='balanced', probability=True)
#pipe2 = Pipeline([('counts',combined), ('svm', svm)])
pipe2 = Pipeline([('LDAmodel',LDAmodel), ('svm', svm)])
pipe2.steps

sklearn


[('LDAmodel', LDA(lib='sklearn', num_topics=30)),
 ('svm', SVC(C=10, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape=None, degree=3, gamma=1, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True,
    tol=0.001, verbose=False))]

### Gender

In [11]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from pan.features import SOA_Model2

grams3 = TfidfVectorizer(analyzer='word', ngram_range=[3,3], max_features=5000, stop_words='english')
soa = features.SOA_Model2(max_df=1.0, min_df=1, tokenizer_var='sklearn', max_features=None)
#combined = FeatureUnion([('3grams', grams3), ('soa', soa)])
svm = SVC(kernel='rbf', C=10, gamma=1, class_weight='balanced', probability=True)
pipe = Pipeline([('3grams',grams3), ('svm', svm)])
pipe.steps

[('3grams',
  TfidfVectorizer(analyzer='word', binary=False, decode_error=u'strict',
          dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
          lowercase=True, max_df=1.0, max_features=5000, min_df=1,
          ngram_range=[3, 3], norm=u'l2', preprocessor=None, smooth_idf=True,
          stop_words='english', strip_accents=None, sublinear_tf=False,
          token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
          vocabulary=None)),
 ('svm', SVC(C=10, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape=None, degree=3, gamma=1, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True,
    tol=0.001, verbose=False))]

In [10]:
from pan.features import SOAC_Model2
soac = features.SOAC_Model2(max_df=1.0, min_df=1, tokenizer_var='sklearn', max_features=None)
svm = SVC(kernel='rbf', C=0.1, gamma=1, class_weight='balanced', probability=True)
#combined = FeatureUnion([('count_tokens', countTokens), ('count_hash', countHash),
#                         ('count_urls', countUrls), ('count_replies', countReplies), 
#                          ('soa', soa), ('soac', soac)])
#combined = FeatureUnion([('count_tokens', countTokens), ('count_hash', countHash),
#                         ('count_urls', countUrls), ('count_replies', countReplies)])
pipe1 = Pipeline([('soac',soac), ('svm', svm)])
pipe1.steps

[('soac', SOAC_Model2(max_df=1.0, max_features=None, min_df=1, thres=0.1,
        tokenizer_var='sklearn')),
 ('svm', SVC(C=0.1, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape=None, degree=3, gamma=1, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True,
    tol=0.001, verbose=False))]

In [9]:
from pan.features import LDA
LDAmodel = LDA(num_topics=120, lib='sklearn')
soa = features.SOA_Model2(max_df=1.0, min_df=1, tokenizer_var='sklearn', max_features=None)
countTokens = features.CountTokens()
countHash = features.CountHash()
countUrls = features.CountURLs()
countReplies = features.CountReplies()
combined = FeatureUnion([('count_tokens', countTokens), ('count_hash', countHash),
                         ('count_urls', countUrls), ('count_replies', countReplies)])
svm = SVC(kernel='rbf', C=0.1, gamma=1, class_weight='balanced', probability=True)
#pipe2 = Pipeline([('counts',combined), ('svm', svm)])
#pipe2 = Pipeline([('LDAmodel',LDAmodel), ('svm', svm)])
pipe2 = Pipeline([('soa',soa), ('svm', svm)])
pipe2.steps                             

sklearn


[('soa',
  SOA_Model2(max_df=1.0, max_features=None, min_df=1, tokenizer_var='sklearn')),
 ('svm', SVC(C=0.1, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape=None, degree=3, gamma=1, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True,
    tol=0.001, verbose=False))]

In [48]:
import numpy, copy

def print_overlaps(predictions, names, verbose=True):
    N = len(names)
    res = numpy.zeros([N,N])
    for i in range(0, N):
        for j in range(i+1, N):
            #print i,j
            #predictions[i]
            #predictions[j]
            res[i,j] = len([(k,v) for k,v in zip(predictions[i], predictions[j]) if k==v])/float(len(predictions[0]))
            #print res[i,j]
            if verbose:
                print "%s - %s : %0.3f  overlap" % (names[i],  names[j], 100*res[i,j])
    return  res

In [None]:
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import VotingClassifier
from sklearn.grid_search import GridSearchCV
from pan.features import Metaclassifier
import time

#pipe = Pipeline([('3grams',grams3), ('svm', svm)])
#pipe1 = Pipeline([('soac',soac), ('svm', svm)])
#pipe2 = Pipeline([('soa',soa), ('svm', svm)])

X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=split, random_state=42, stratify=y)
X_meta, X_cv, y_meta, y_cv = train_test_split(X_cv, y_cv, test_size=0.5, stratify=y_cv)

print len(y_train), len(X_train)
for i, x in enumerate(X_train):
    if len(x)==0:
        X_train.remove(x)
        y_train.remove(y_train[i])
print len(y_train), len(X_train)


print len(y_cv), len(X_cv)
for i, x in enumerate(X_cv):
    if len(x)==0:
        X_cv.remove(x)
        y_cv.remove(y_cv[i])
print len(y_cv), len(X_cv)

print len(y_meta), len(X_meta)
for i, x in enumerate(X_meta):
    if len(x)==0:
        X_meta.remove(x)
        y_meta.remove(y_meta[i])
print len(y_meta), len(X_meta)    

print len(X_train),len(X_meta),len(X_cv),len(X_train)+len(X_cv)+len(X_meta), len(X)

eclf = VotingClassifier(estimators=[("0", pipe), ('1', pipe1), ("2", pipe2)], voting='soft')
eclfh = VotingClassifier(estimators=[("0", pipe), ('1', pipe1), ("2", pipe2)], voting='hard')
models = [pipe,pipe1,pipe2,eclf, eclfh]
model_names = ['3grams+soa', 'soac', 'lda', 'voting', 'votingh']
results = {}
for name in model_names:
    results[name] = {'pred': [], 'acc': [], 'conf': [], 'over': []}
results['space'] = {'pred': [], 'acc': [], 'conf': [], 'over':[]}
results['meta'] = {'pred': [], 'acc': [], 'conf': [], 'over':[]}
params = {'svm__C': [0.001, 0.01, 0.1, 1, 10, 100]}
params = {}
num_folds = 3
splits = [0.3, 0.4]
N = 4
t0 = time.time()
for split in splits:
    print "Split: " + str(split)  
    for i in xrange(N):
        #X, y = dataset.get_data('age')
        #X, y = dataset.get_data('gender')
        X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=split, stratify=y)
        for i, x in enumerate(X_train):
            if len(x)==0:
                X_train.remove(x)
                y_train.remove(y_train[i])
        for i, x in enumerate(X_cv):
            if len(x)==0:
                X_cv.remove(x)
                y_cv.remove(y_cv[i])
        if 'space' or 'meta' in results.keys():
            X_meta, X_cv, y_meta, y_cv = train_test_split(X_cv, y_cv, test_size=0.5, stratify=y_cv)
        print len(X_train), len(X_cv), len(X_cv) + len(X_train), len(X)
        trained_models = []
        for i, model in enumerate(models):
            if model_names[i] == 'voting' or model_names[i] == 'votingh':
                params = {}
            grid_search = GridSearchCV(estimator=model, param_grid=params, verbose=0, n_jobs=-1, cv=num_folds, refit=True)
            grid_search.fit(X_train,y_train)
            trained_models.append(grid_search.best_estimator_)
        predictions = []
        for i, model in enumerate(trained_models):
            predict = model.predict(X_cv)
            predictions.append(predict)
            results[model_names[i]]['pred'].append(predict)
            results[model_names[i]]['acc'].append(accuracy_score(y_cv, predict))
            results[model_names[i]]['conf'].append(confusion_matrix(y_cv, predict, labels=list(set(y))))
        # Space model ###
        models_for_space = {}
        cv_scores = []
        for name, model in zip(model_names, trained_models):
            if name!='voting' and name!='votingh':
                models_for_space[name] = model
                cv_scores.append(model.score(X_meta, y_meta))
        space = SubSpaceEnsemble3(models_for_space, cv_scores)
        space.fit(X_train + X_meta, y_train+y_meta)
        predict = space.predict(X_cv)
        #grid_search = GridSearchCV(space, param_grid={}, verbose=0, n_jobs=-1, cv=num_folds, refit=True)
        #grid_search.fit(X_meta+X_train, y_meta+y_train)
        #predict = grid_search.best_estimator_.predict(X_cv)
        results['space']['pred'].append(predict)
        results['space']['acc'].append(accuracy_score(y_cv, predict))
        results['space']['conf'].append(confusion_matrix(y_cv, predict, labels=list(set(y))))
        predictions.append(predict)
        # Space model end ###
        # Meta ###
        model_dic = {}
        for i, model in enumerate(trained_models):
            if model_names[i] != 'voting' and model_names[i] !='votingh': 
                model_dic[model_names[i]] = model
        Meta = Metaclassifier(models=model_dic, C=1.0, weights='balanced')
        Meta.fit(X_meta, y_meta)
        predict = Meta.predict(X_cv)
        results['meta']['pred'].append(predict)
        results['meta']['acc'].append(accuracy_score(y_cv, predict))
        results['meta']['conf'].append(confusion_matrix(y_cv, predict, labels=list(set(y))))
        predictions.append(predict)
        # Meta model END ###
        predictions.append(y_cv)
        results['3grams+soa']['over'].append(print_overlaps(predictions, model_names+['space', 'meta', 'true'], False))
    print('Split %0.1f.: %0.3f seconds') % (split, time.time()-t0)

262 262
259 259
87 87
86 86
87 87
87 87
259 87 86 432 436
Split: 0.3
301 66 367 436
sklearn
sklearn
sklearn
sklearn
sklearn
sklearn
sklearn
sklearn
sklearn
sklearnsklearn
sklearn
sklearn

sklearn
sklearn
sklearn
sklearn
sklearn
sklearnsklearn
sklearn
sklearn

sklearn
Fit took: 20.095 seconds
Predict took: 99.007 seconds
['18-24', '35-49', '50-64', '25-34', '65-xx']
['3grams+soa', 'lda', 'soac']
3grams+soa
lda
soac
(65, 15) (65,)
fit true
3grams+soa
lda
soac
Predict
array([1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1,
       1, 2, 1, 1, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1,
       1, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1])
array(['25-34', '25-34', '25-34', '25-34', '25-34', '25-34', '35-49',
       '35-49', '25-34', '25-34', '25-34', '35-49', '25-34', '25-34',
       '25-34', '25-34', '35-49', '35-49', '25-34', '35-49', '25-34',
       '25-34', '25-34', '25-34', '35-49', '25-34', '25-34', '35-49',
       '25-34', '25-34', '35-4

In [188]:
results['3grams']['acc']

[0.37878787878787878, 0.34482758620689657]

In [158]:
import statistics
for i, split in enumerate(splits):
    print 'Split: %0.1f' % split
    print '----------- Scores-----------'
    for name in model_names + ['space'] + ['meta']:
        tmp = results[name]['acc'][N*i:(N*i+N)]
        print 
        print 'Model: %s Accuracy: %0.3f Std: %0.3f' % (name, statistics.mean(tmp), 
                                                          statistics.stdev(tmp))
        #tmp_conf = copy.deepcopy(results[name]['conf'][N*i])
        #for j in xrange(N*i+1, N*i+N):
        #    tmp_conf += results[name]['conf'][j]
        #tmp_conf /= N
        #print('Confusion matrix :\n {}'.format(tmp_conf))
    print '----------- Overlaps-----------'
    tmp_overlaps = copy.deepcopy(results['3grams+soa']['over'][N*i])
    for j in xrange(N*i+1, N*i+N):
            tmp_overlaps += results['3grams+soa']['over'][j]
    tmp_overlaps /= N
    print_names = model_names+['space', 'meta','true']
    for k in xrange(tmp_overlaps.shape[0]):
        for v in xrange(k+1, tmp_overlaps.shape[0]):
            print "%s - %s : %0.3f  overlap" % (print_names[k],  print_names[v], 100*tmp_overlaps[k, v])
    print '%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n'

Split: 0.3
----------- Scores-----------

Model: 3grams+soa Accuracy: 0.269 Std: 0.207

Model: soac Accuracy: 0.331 Std: 0.054

Model: lda Accuracy: 0.415 Std: 0.000

Model: voting Accuracy: 0.369 Std: 0.000

Model: votingh Accuracy: 0.338 Std: 0.109

Model: space Accuracy: 0.369 Std: 0.044

Model: meta Accuracy: 0.315 Std: 0.120
----------- Overlaps-----------
3grams+soa - soac : 24.615  overlap
3grams+soa - lda : 50.000  overlap
3grams+soa - voting : 36.154  overlap
3grams+soa - votingh : 79.231  overlap
3grams+soa - space : 33.846  overlap
3grams+soa - meta : 16.154  overlap
3grams+soa - true : 26.923  overlap
soac - lda : 40.000  overlap
soac - voting : 80.769  overlap
soac - votingh : 45.385  overlap
soac - space : 74.615  overlap
soac - meta : 68.462  overlap
soac - true : 33.077  overlap
lda - voting : 59.231  overlap
lda - votingh : 70.769  overlap
lda - space : 61.538  overlap
lda - meta : 30.769  overlap
lda - true : 41.538  overlap
voting - votingh : 56.923  overlap
voting -

In [None]:
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import VotingClassifier
from sklearn.grid_search import GridSearchCV


num_folds = 4
split = 0.2
#X, y = dataset.get_data('age')
#X, y = dataset.get_data('gender')
X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=split, random_state=42, stratify=y)
print len(X_train), len(X_cv), len(X_cv) + len(X_train), len(X)
eclf = VotingClassifier(estimators=[("0", pipe), ('1', pipe1), ("2", pipe2)], voting='soft')
models = [pipe,pipe1,pipe2,eclf]
model_names = ['3grams', 'soac', 'lda', 'voting']
trained_models = []
for i, model in enumerate(models):
    grid_search = GridSearchCV(estimator=model, param_grid={}, verbose=1, n_jobs=-1, cv=num_folds, refit=True)
    grid_search.fit(X_train,y_train)
    print(grid_search.best_score_)
    print(grid_search.best_estimator_) 
    trained_models.append(grid_search.best_estimator_)

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin

class SubSpaceEnsemble(BaseEstimator, TransformerMixin):
    
    """ A Linear Weights Metaclassifier """

    def __init__(self, models, cv_scores):
        from sklearn.feature_extraction.text import CountVectorizer
        
        if (not models) or (not cv_scores):
            raise AttributeError('Models expexts a dictonary of models \
              containg the predictions of y_true for each classifier.\
              cv_score expects a list len(models.keys()) with the\
              cross validation scores of each model')
        else:
            self.models = models
            self.cv_scores = cv_scores
            self.ind2names = {}
            for i, name in enumerate(models.keys()):
                self.ind2names[i] = name
            self.counter = CountVectorizer()
            self.doc_terms = None
            self.experts = []
        

    def fit(self, X_cv, y_true=None, weights=None):
        
        import random

        if y_true is None:
            raise ValueError('we need y labels to supervise-fit!')
        else:
            parameters = {
                    'input': 'content',
                    'encoding': 'utf-8',
                    'decode_error': 'ignore',
                    'analyzer': 'word',
                    'stop_words': 'english',
                    # 'vocabulary':list(voc),
                    #'tokenizer': tokenization,
                    #'tokenizer': _twokenize.tokenizeRawTweetText,  # self.tokenization,
                    #'tokenizer': lambda text: _twokenize.tokenizeRawTweetText(nonan.sub(po_re.sub('', text))),
                    'max_df': 1.0,
                    'min_df': 1,
                    'max_features':None
                }
            self.counter.set_params(**parameters)
            self.doc_terms = self.counter.fit_transform(X_cv).toarray()
            predictions = []
            for name, model in self.models.iteritems():
                predictions.append(model.predict(X_cv))
            count = 0
            for i, y in enumerate(y_true):
                possible_experts = []
                for j, pred in enumerate(predictions):
                    if pred[i] == y:
                        possible_experts.append(j)
                if possible_experts:
                    possible_scores = [self.cv_scores[poss] for poss in possible_experts]
                    self.experts.append(possible_experts[possible_scores.index(max(possible_scores))])
                    count += 1
                else:
                    self.experts.append(self.cv_scores.index(max(self.cv_scores)))
            print "Chosen through expert: %0.2f" % (100*count/float(len(y_true))) 
            #print self.expert_scores
            #print self.experts
            return self

    def predict(self, X):

        # print "PRedict"
        # print X.shape
        X_transformed = self.counter.transform(X).toarray()
        #print type((X_transformed)[0])
        #print X_transformed.shape
        #return 0
        y_pred = []
        for i in range(0, X_transformed.shape[0]):
            #print X_transformed[i,:].shape
            best_model_ind = self.find_sim_projection(X_transformed[i,:])
            #print best_model_ind
            #print self.models[self.ind2names[best_model_ind]].predict([X[i]])[0]
            y_pred.append(self.models[self.ind2names[best_model_ind]].predict([X[i]])[0])
        #print y_pred
        return y_pred

    def score(self, X, y, sample_weight=None):

        from sklearn.metrics import accuracy_score
        return accuracy_score(y, self.predict(X), normalize=True)
        #return self.svc.score(self.transform_to_y(X), y, sample_weight)


    def find_sim_projection(self, x_sample):

        from sklearn.metrics.pairwise import cosine_similarity
        
        cos = []
        j = None
        min_s = -10000
        for i in range(0, self.doc_terms.shape[0]):
            #print x_sample.reshape(1,-1).shape
            #print self.doc_terms[i,:].reshape(1,-1).shape
            temp = cosine_similarity(x_sample.reshape(1,-1), self.doc_terms[i,:].reshape(1,-1))[0][0]
            if min_s < 0 or  temp > min_s:
                min_s = temp
                j = i
        return self.experts[j]

In [18]:
from sklearn.base import BaseEstimator, TransformerMixin

class SubSpaceEnsemble2(BaseEstimator, TransformerMixin):
    
    """ A Linear Weights Metaclassifier """

    def __init__(self, models, cv_scores, k=10):
        from sklearn.feature_extraction.text import CountVectorizer
        
        if (not models) or (not cv_scores):
            raise AttributeError('Models expexts a dictonary of models \
              containg the predictions of y_true for each classifier.\
              cv_score expects a list len(models.keys()) with the\
              cross validation scores of each model')
        else:
            self.models = models
            self.cv_scores = cv_scores
            self.k = k
            self.ind2names = {}
            for i, name in enumerate(models.keys()):
                self.ind2names[i] = name
            self.counter = CountVectorizer()
            self.predictions = []
            self.true = []
            self.doc_terms = None
            self.tree = None
            self.experts = []
        

    def fit(self, X_cv, y_true=None, weights=None):
        
        from sklearn.neighbors import BallTree
        import random

        if y_true is None:
            raise ValueError('we need y labels to supervise-fit!')
        else:
            parameters = {
                    'input': 'content',
                    'encoding': 'utf-8',
                    'decode_error': 'ignore',
                    'analyzer': 'word',
                    'stop_words': 'english',
                    # 'vocabulary':list(voc),
                    #'tokenizer': tokenization,
                    #'tokenizer': _twokenize.tokenizeRawTweetText,  # self.tokenization,
                    #'tokenizer': lambda text: _twokenize.tokenizeRawTweetText(nonan.sub(po_re.sub('', text))),
                    'max_df': 1.0,
                    'min_df': 1,
                    'max_features':None
                }
            self.counter.set_params(**parameters)
            self.doc_terms = self.counter.fit_transform(X_cv).toarray()
            self.tree = BallTree(self.doc_terms, leaf_size=20)
            predictions = []
            for name, model in self.models.iteritems():
                predictions.append(model.predict(X_cv))
            self.predictions = predictions
            self.true = y_true
            count = 0
            #print self.expert_scores
            #print self.experts
            return self

    def predict(self, X, y_real):
        

        # print "PRedict"
        # print X.shape
        X_transformed = self.counter.transform(X).toarray()
        #print type((X_transformed)[0])
        #print X_transformed.shape
        #return 0
        y_pred = []
        for i in range(0, X_transformed.shape[0]):
            #print X_transformed[i,:].shape
            dist, neigbors_indexes = self.tree.query(X_transformed[i,:].reshape(1,-1), self.k)  
            print 'Sample ' + y_real[i]
            #print neigbors_indexes[0]
            #print dist
            #best_model_ind = self.expert_decision(neigbors_indexes[0])
            y_pred.append(self.expert_decision(neigbors_indexes[0],  X[i]))
            
            #y_pred.append(self.models[self.ind2names[best_model_ind]].predict([X[i]])[0])
        #print y_pred
        return y_pred

    def score(self, X, y, sample_weight=None):

        from sklearn.metrics import accuracy_score
        return accuracy_score(y, self.predict(X), normalize=True)
        #return self.svc.score(self.transform_to_y(X), y, sample_weight)


    def expert_decision(self, neigbors_indexes, x_sample):

        from sklearn.metrics import accuracy_score
        from collections import Counter
        
        models_pred = []
        acc = []
        neigbors_true = [self.true[n_i] for n_i in neigbors_indexes]
        print 'True'
        print neigbors_true
        sample_predictions = []
        for model_i in xrange(len(self.models.values())):
            model_pred = []
            for n_i in neigbors_indexes:
                model_pred.append(self.predictions[model_i][n_i])
            models_pred.append(model_pred)
            acc.append(accuracy_score(neigbors_true, model_pred, normalize=True))
            sample_predictions.append(self.models[self.ind2names[model_i]].predict(x_sample)[0])
            print 'Model: ' + self.ind2names[model_i] + ' Accuracy: ' + str(accuracy_score(neigbors_true, model_pred, normalize=True))
            print 'Predictions'
            print model_pred
            print 'Sample prediction: ' + str(sample_predictions[model_i])
        total_pred = []
        weights = {}
        weights['true'] = 3
        weights['models_n'] = [int(2/float((1-acc_m)+0.01)) for acc_m in acc]
        weights['models'] = [int(6/float((1-acc_m)+0.01)) for acc_m in acc]
        for i, model in enumerate(models_pred):
            if acc[i]>0.35:
                for k in model:
                    #print weights['models_n'][i]
                    total_pred.extend([k for j in xrange(weights['models_n'][i])])
                total_pred.extend([sample_predictions[model_i] for j in xrange(weights['models'][i])])
        for n in neigbors_true:
            total_pred.extend([n for j in xrange(weights['true'])])                     
        data = Counter(total_pred)
        #data = Counter([k for pred in models_pred for k in pred])
        print data
        best_model_ind = acc.index(max(acc))
        print 'Total pred: ' + str(data.most_common(1)[0][0])
        print len(total_pred)
        #return best_model_ind
        return data.most_common(1)[0][0]

In [57]:
a = [0,1,2,3]
b= [3,1]
a[b]

TypeError: list indices must be integers, not list

In [160]:
from sklearn.base import BaseEstimator, TransformerMixin
import time

class SubSpaceEnsemble3(BaseEstimator, TransformerMixin):
    
    """ A Linear Weights Metaclassifier """

    def __init__(self, models, cv_scores, k=3, weights= [6,3,2,0.7]):
        from sklearn.feature_extraction.text import CountVectorizer
        
        if (not models) or (not cv_scores):
            raise AttributeError('Models expexts a dictonary of models \
              containg the predictions of y_true for each classifier.\
              cv_score expects a list len(models.keys()) with the\
              cross validation scores of each model')
        else:
            self.models = models
            self.cv_scores = cv_scores
            self.k = k
            self.weights = weights
            self.ind2names = {}
            for i, name in enumerate(models.keys()):
                self.ind2names[i] = name
            self.counter = CountVectorizer()
            self.representations = []
            self.meta = None
            self.predictions = []
            self.true = []
            self.doc_terms = None
            self.tree = None
            self.experts = []
        

    def fit(self, X_cv, y_true=None, weights=None):
        
        from sklearn.neighbors import BallTree
        import random

        if y_true is None:
            raise ValueError('we need y labels to supervise-fit!')
        else:
            parameters = {
                    'input': 'content',
                    'encoding': 'utf-8',
                    'decode_error': 'ignore',
                    'analyzer': 'word',
                    'stop_words': 'english',
                    # 'vocabulary':list(voc),
                    #'tokenizer': tokenization,
                    #'tokenizer': _twokenize.tokenizeRawTweetText,  # self.tokenization,
                    #'tokenizer': lambda text: _twokenize.tokenizeRawTweetText(nonan.sub(po_re.sub('', text))),
                    'max_df': 1.0,
                    'min_df': 1,
                    'max_features':None
                }
            t0 = time.time()
            self.counter.set_params(**parameters)
            self.doc_terms = self.counter.fit_transform(X_cv).toarray()
            self.tree = BallTree(self.doc_terms, leaf_size=20)
            predictions = []
            for name, model in self.models.iteritems():
                predictions.append(model.predict(X_cv))
                #print len(predictions[-1])
                transf = model.steps[0][1].transform(X_cv)
                if hasattr(transf, "toarray"):
                    #print 'Exei'
                    self.representations.append(transf.toarray())
                else:
                    self.representations.append(transf)
            self.predictions = predictions
            self.true = y_true
            count = 0
            #print self.expert_scores
            #print self.experts
            print('Fit took: %0.3f seconds') % (time.time()-t0)
            return self

    def predict(self, X):
        

        # print "PRedict"
        # print X.shape
        X_transformed = self.counter.transform(X).toarray()
        #print type((X_transformed)[0])
        #print X_transformed.shape
        #return 0
        y_pred = []
        t0 = time.time()
        for i in range(0, X_transformed.shape[0]):
            #print X_transformed[i,:].shape
            dist, neigbors_indexes = self.tree.query(X_transformed[i,:].reshape(1,-1), self.k)  
            #print 'Sample ' + y_real[i]
            #print neigbors_indexes[0]
            #print dist
            #best_model_ind = self.expert_decision(neigbors_indexes[0])
            y_pred.append(self.expert_decision(neigbors_indexes[0],  X[i]))
            
            #y_pred.append(self.models[self.ind2names[best_model_ind]].predict([X[i]])[0])
        #print y_pred
        print('Predict took: %0.3f seconds') % (time.time()-t0)
        return y_pred

    def score(self, X, y, sample_weight=None):

        from sklearn.metrics import accuracy_score
        return accuracy_score(y, self.predict(X), normalize=True)
        #return self.svc.score(self.transform_to_y(X), y, sample_weight)


    def expert_decision(self, neigbors_indexes, x_sample):

        from sklearn.metrics import accuracy_score
        from collections import Counter
        from sklearn.neighbors import BallTree
        
        models_pred = []
        models_neig_pred = []
        acc = []
        t0 = time.time()
        neigbors_true = [self.true[n_i] for n_i in neigbors_indexes]
        #print('Neighbors per sample: %0.4f seconds') % (time.time()-t0)
        #print 'True'
        #print neigbors_true
        sample_predictions = []
        total_pred = []
        weights = {}
        weights['true'] = self.weights[1]
        weights['models_n'] = []
        weights['models'] = []
        for model_i in xrange(len(self.models.values())):
            ModelTree = BallTree(self.representations[model_i])
            temp_trans = self.models[self.ind2names[model_i]].steps[0][1].transform([x_sample])
            if hasattr(temp_trans, 'toarray'):
                temp_trans = temp_trans.toarray()
            _, model_neig = ModelTree.query(temp_trans, self.k)
            model_neig_pred = []
            for model_n_i in model_neig[0].tolist():
                model_neig_pred.append(self.predictions[model_i][model_n_i])
            models_neig_pred.append(model_neig_pred)
            model_pred = []
            for n_i in neigbors_indexes:
                model_pred.append(self.predictions[model_i][n_i])
            models_pred.append(model_pred)
            acc.append(accuracy_score(neigbors_true, model_pred, normalize=True))
            if acc[-1] >self.weights[3]:
                # Adding neighbors predictions
                weights['models_n'].append(int(self.weights[2]/float((1-acc[-1])+0.01)))
                total_pred.extend([pred for j in xrange(weights['models_n'][-1]) for pred in model_pred])
                #print('Predicting Neighbors per sample: %0.4f seconds') % (time.time()-t0)
                # Adding sample prediction
                sample_predictions.append(self.models[self.ind2names[model_i]].predict(x_sample)[0])
                weights['models'].append(int(self.weights[0]/float((1-acc[-1])+0.01))) 
                total_pred.extend([sample_predictions[-1] for j in xrange(weights['models'][-1])])
                total_pred.extend([pred for j in xrange(weights['models'][-1]) for pred in model_neig_pred])
            #print len(x_sample)
            #print self.ind2names[model_i]
            
                #print 'Model: ' + self.ind2names[model_i] + ' Accuracy: ' + str(accuracy_score(neigbors_true, model_pred, normalize=True))
                #print 'Predictions'
                #print model_pred
                #print 'Representations'
                #print model_neig_pred
                #print 'Sample prediction: ' + str(sample_predictions[-1])
        total_pred.extend([n for j in xrange(int(weights['true'])) for n in neigbors_true])
        #print('creating votes: %0.4f seconds') % (time.time()-t0)
        data = Counter(total_pred)
        #data = Counter([k for pred in models_pred for k in pred])
        #print data
        best_model_ind = acc.index(max(acc))
        #print 'Total pred: ' + str(data.most_common(1)[0][0])
        #print len(total_pred)
        #return best_model_ind
        return data.most_common(1)[0][0]

In [73]:
a = models['lda']
a.steps[0][1].transform(X[0])

array([[ 0.03333333,  0.03333333,  0.03333333, ...,  0.03333333,
         0.03333333,  0.03333333],
       [ 0.03333333,  0.03333333,  0.03333333, ...,  0.03333333,
         0.03333333,  0.03333333],
       [ 0.03333333,  0.03333333,  0.03333333, ...,  0.03333333,
         0.03333333,  0.03333333],
       ..., 
       [ 0.03333333,  0.03333333,  0.03333333, ...,  0.03333333,
         0.03333333,  0.03333333],
       [ 0.03333333,  0.03333333,  0.03333333, ...,  0.03333333,
         0.03333333,  0.03333333],
       [ 0.03333333,  0.03333333,  0.03333333, ...,  0.03333333,
         0.03333333,  0.03333333]])

In [98]:
hasattr(soa, 'transform')

True

In [149]:
models = {}
cv_scores = []
print len(y_cv), len(X_cv)
for i, x in enumerate(X_cv):
    if len(x)==0:
        X_cv.remove(x)
        y_cv.remove(y_cv[i])
print len(y_cv), len(X_cv)

print len(y_meta), len(X_meta)
for i, x in enumerate(X_meta):
    if len(x)==0:
        X_meta.remove(x)
        y_meta.remove(y_meta[i])
print len(y_meta), len(X_meta)        
print len(X_train), len(X_cv), len(X_cv) + len(X_train), len(X)

for name, model in zip(model_names, trained_models):
    if name!='voting' and name!='votingh' and name!='space' and name!='meta':
        models[name] = model
        cv_scores.append(model.score(X_meta, y_meta))
        
w = [1,1,1,0.35]
space = SubSpaceEnsemble3(models,cv_scores,k=10, weights=w)
space.fit(X_meta+X_train, y_meta+y_train)
predict = space.predict(X_cv)
acc = accuracy_score(y_cv, predict)
conf = confusion_matrix(y_cv, predict, labels=list(set(y_cv)))
print('Accuracy : {}'.format(acc))
print('Confusion matrix :\n {}'.format(conf))

87 87
87 87
86 86
86 86
262 87 349 436
Fit took: 17.170 seconds
Predict took: 197.758 seconds
Accuracy : 0.51724137931
Confusion matrix :
 [[ 1  2  0  2  0]
 [ 0 30  0  7  0]
 [ 0 13  0  3  0]
 [ 0 14  0 14  0]
 [ 0  1  0  0  0]]


In [None]:
a = numpy.array([[  0, 207,  65, 161,  11,  61, 152,  37, 302,  25]])
a[0].tolist()

In [41]:
import numpy as np
from scipy.optimize import minimize
from sklearn.metrics import accuracy_score
import time

global X_train, X_meta, x_cv, y_train, y_meta, y_cv

def f(w):
    print "Weights"
    print w
    space = SubSpaceEnsemble3(models,cv_scores,k=10, weights=w)
    space.fit(X_train + X_cv, y_train + y_cv)
    score = 1- accuracy_score(y_meta, space.predict(X_meta))
    print 'Score: ' + str(score)
    return score
    
X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=split, random_state=42, stratify=y)
X_meta, X_cv, y_meta, y_cv = train_test_split(X_cv, y_cv, test_size=0.5, stratify=y_cv)

models = {}
cv_scores = []
print len(y_cv), len(X_cv)
for i, x in enumerate(X_cv):
    if len(x)==0:
        X_cv.remove(x)
        y_cv.remove(y_cv[i])
print len(y_cv), len(X_cv)

print len(y_meta), len(X_meta)
for i, x in enumerate(X_meta):
    if len(x)==0:
        X_meta.remove(x)
        y_meta.remove(y_meta[i])
print len(y_meta), len(X_meta)        
print len(X_train), len(X_cv), len(X_cv) + len(X_train), len(X)

for name, model in zip(model_names, trained_models):
    if name!='voting' and name!='space' and name!='meta':
        models[name] = model
        cv_scores.append(model.score(X_cv, y_cv))

print models.keys()
print cv_scores
#space = SubSpaceEnsemble(models, cv_scores)
w = [6,2,1,0.35]
bnds = ((0, None), (0, None), (0, None), (0, 1))
a = minimize(f, w, bounds=bnds)



88 88
88 88
88 88
87 87
260 88 348 436
['lda', 'soac', '3grams']
[0.79545454545454541, 0.88636363636363635, 0.88636363636363635]
Weights
[ 6.    2.    1.    0.35]
Fit took: 7.689 seconds
Predict took: 127.131 seconds
Score: 0.402298850575
Weights
[ 6.00000001  2.          1.          0.35      ]
Fit took: 7.803 seconds
Predict took: 128.827 seconds
Score: 0.402298850575
Weights
[ 6.          2.00000001  1.          0.35      ]
Fit took: 7.926 seconds
Predict took: 127.187 seconds
Score: 0.402298850575
Weights
[ 6.          2.          1.00000001  0.35      ]
Fit took: 7.761 seconds
Predict took: 127.905 seconds
Score: 0.402298850575
Weights
[ 6.          2.          1.          0.35000001]
Fit took: 8.134 seconds
Predict took: 128.011 seconds
Score: 0.402298850575


In [None]:
models = {}
cv_scores = []
for name, model in zip(model_names, trained_models):
    if name!='voting' and name!='space' and name!='meta':
        models[name] = model
        cv_scores.append(model.score(X_meta, y_meta))

print models.keys()
print cv_scores
space = SubSpaceEnsemble2(models, cv_scores)
space.fit(X_meta+X_train,y_meta+y_train)
predict = space.predict(X_cv, y_cv)
acc = accuracy_score(y_cv, predict)
conf = confusion_matrix(y_cv, predict, labels=list(set(y_cv)))
print('Accuracy : {}'.format(acc))
print('Confusion matrix :\n {}'.format(conf))

In [132]:
models['voting']

VotingClassifier(estimators=[('0', Pipeline(steps=[('3grams', TfidfVectorizer(analyzer='word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=[3, 3], norm=u'l2', prepro...  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]))],
         voting='soft', weights=None)

In [183]:
models = {}
cv_scores = []
for name, model in zip(model_names, trained_models):
    if name!='voting' and name!='space' and name!='meta':
        models[name] = model
        cv_scores.append(model.score(X_cv, y_cv))

print models.keys()
print cv_scores
space = SubSpaceEnsemble(models, cv_scores)
grid_search = GridSearchCV(SubSpaceEnsemble(models, cv_scores), param_grid={}, verbose=1, n_jobs=-1, cv=num_folds, refit=True)
grid_search.fit(X_cv, y_cv)
space.fit(X_cv, y_cv)
y_space = grid_search.best_estimator_.predict(X_cv)

['lda', 'soac', '3grams']
[0.43678160919540232, 0.47126436781609193, 0.51724137931034486]
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Chosen through expert: 66.67
Chosen through expert: 66.67Chosen through expert: 63.79
Chosen through expert: 70.69
Chosen through expert: 65.52



[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   26.6s finished


In [171]:
len(y_space[0])

5

In [177]:
predict = y_space
acc = accuracy_score(y_cv, predict)
conf = confusion_matrix(y_cv, predict, labels=list(set(y)))
print('Accuracy : {}'.format(acc))
print('Confusion matrix :\n {}'.format(conf))

Accuracy : 0.563218390805
Confusion matrix :
 [[14 14  0  0  0]
 [ 3 34  0  0  0]
 [ 1 14  1  0  0]
 [ 1  4  0  0  0]
 [ 0  1  0  0  0]]


In [79]:
from sklearn.metrics.pairwise import cosine_similarity

a = numpy.array([1,2])
b = numpy.array([3,2])
cosine_similarity(a,b)[0][0]



0.86824314212445919

In [178]:
predictions = []
for i, model in enumerate(trained_models):
    predict = model.predict(X_cv)
    predictions.append(predict)
    acc = accuracy_score(y_cv, predict)
    conf = confusion_matrix(y_cv, predict, labels=list(set(y)))
    print('Accuracy : {}'.format(acc))
    print('Confusion matrix :\n {}'.format(conf))

Accuracy : 0.333333333333
Confusion matrix :
 [[ 0 17  0  8  3]
 [ 0 29  0  6  2]
 [ 0 11  0  4  1]
 [ 0  5  0  0  0]
 [ 0  1  0  0  0]]
Accuracy : 0.448275862069
Confusion matrix :
 [[11 17  0  0  0]
 [10 27  0  0  0]
 [ 1 14  1  0  0]
 [ 1  4  0  0  0]
 [ 0  1  0  0  0]]
Accuracy : 0.448275862069
Confusion matrix :
 [[12 16  0  0  0]
 [10 26  1  0  0]
 [ 1 14  1  0  0]
 [ 1  4  0  0  0]
 [ 0  1  0  0  0]]
Accuracy : 0.494252873563
Confusion matrix :
 [[15 13  0  0  0]
 [10 27  0  0  0]
 [ 2 13  1  0  0]
 [ 2  3  0  0  0]
 [ 0  1  0  0  0]]


In [15]:
import numpy, copy

def print_overlaps(predictions, names, verbose=True):
    N = len(names)
    res = numpy.zeros([N,N])
    for i in range(0, N):
        for j in range(i+1, N):
            #print i,j
            #predictions[i]
            #predictions[j]
            res[i,j] = len([(k,v) for k,v in zip(predictions[i], predictions[j]) if k==v])/float(len(predictions[0]))
            #print res[i,j]
            if verbose:
                print "%s - %s : %0.3f  overlap" % (names[i],  names[j], 100*res[i,j])
    return  res

#pred2 = copy.deepcopy(predictions)
#pred2.append(y_space)

#pred2.append(y_cv)
#model_names = ['3grams', 'soac', 'lda', 'voting']
#model_names += ['space']
#model_names += ['True']
#print len([(i, j) for i,j in zip(predictions[0], predictions[1]) if i==j])/float(len(predictions[0]))
#print_overlaps(pred2, model_names)

In [48]:
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
svm = SVC(kernel='rbf', C=10, gamma=1, class_weight='balanced', probability=True)

#clf = AdaBoostClassifier(base_estimator=svm, n_estimators=100, learning_rate=1.0, algorithm='SAMME.R', random_state=42)

clf = BaggingClassifier(base_estimator = svm, n_estimators=100, verbose=1, random_state=42)

#X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=split, random_state=42, stratify=y)
print len(X_train), len(X_cv), len(X_cv) + len(X_train), len(X)
X_train_new = soac.transform(X_train)
#for i, x in enumerate(X_train):
#    if len(x)<=1 or y_train[i]<=1:
#        print 'y'
#        X_train.remove(x)
#        y_train.remove(y_train[i])
print len(X_train), len(y_train)
clf.fit(X_train_new,y_train)
predict= clf.predict(soac.transform(X_cv))
acc = accuracy_score(y_cv, predict)
conf = confusion_matrix(y_cv, predict, labels=list(set(y)))
print('Accuracy : {}'.format(acc))
print('Confusion matrix :\n {}'.format(conf))

344 88 432 436
We are transforming!
Doc_prof
(344, 5) <type 'numpy.ndarray'>
344 344
We are transforming!
Doc_prof
(88, 5) <type 'numpy.ndarray'>


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s finished


ValueError: operands could not be broadcast together with shapes (88,5) (88,4) (88,5) 

In [28]:
import pandas, copy
import matplotlib.pyplot as plt
pred2 = copy.deepcopy(predictions)
pred2.append(y_cv)
pred2 = map(list, zip(*pred2))
df = pandas.DataFrame(pred2, columns=model_names)
df.describe()

Unnamed: 0,3grams,soac,lda,voting,True
count,88,88,88,88,88
unique,5,4,1,4,5
top,35-49,35-49,35-49,35-49,35-49
freq,51,52,88,56,37


In [110]:
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import plotly.plotly as py
import plotly.tools as tls
from plotly.graph_objs import *

tls.set_credentials_file(username="Bogas",
                             api_key="9s60rarm2w")

py.sign_in(username="Bogas", api_key="9s60rarm2w")

pred2 = copy.deepcopy(predictions)
pred2.append(y_cv)
traces = []
model_names += ['True']
for i, pred in enumerate(pred2):
    traces.append(Scatter(
        x=range(0,len(y_cv)),
        y=pred,
        mode='markers+line',
        type= 'scatter',
        name= model_names[i]
        )
                )

title1 = "Results on test set for Ensemble Scheme"
layout = Layout(
        width= 1200,
        height= 800,
        title= title1,
        xaxis = {"title": 'Samples'},
        yaxis = {"title": 'Classes', "type":'category'}
)

data = Data(traces)
fig = Figure(data=data, layout=layout)
#py.plot(fig, filename='Grey_70_cosine_vector_list bow')
py.iplot(fig, filename=title1)

In [109]:
help(YAxis)

Help on class YAxis in module plotly.graph_objs.graph_objs:

class YAxis(PlotlyDict)
 |  A dictionary-like object for representing a y-axis in plotly.
 |  
 |  Online examples:
 |  
 |      https://plot.ly/python/axes/
 |      https://plot.ly/python/multiple-axes/
 |      https://plot.ly/python/subplots/
 |      https://plot.ly/python/insets/
 |  
 |  Parent key:
 |  
 |      yaxis
 |  
 |  Quick method reference:
 |  
 |      YAxis.update(changes)
 |      YAxis.strip_style()
 |      YAxis.get_data()
 |      YAxis.to_graph_objs()
 |      YAxis.validate()
 |      YAxis.to_string()
 |      YAxis.force_clean()
 |  
 |  Valid keys:
 |  
 |      title [required=False] (value=a string):
 |          The y-axis title.
 |  
 |      titlefont [required=False] (value=Font object | dictionary-like object):
 |          Links a dictionary-like object describing the font settings of the
 |          y-axis title.
 |  
 |          For more, run `help(plotly.graph_objs.Font)`
 |  
 |      range [require

In [62]:
# predict class probabilities for all classifiers
probas = [c.predict_proba(X_cv) for c in trained_models]

We are transforming!
Doc_prof
(88, 5) <type 'numpy.ndarray'>
We are transforming!
Doc_prof
(88, 5) <type 'numpy.ndarray'>


In [74]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from scipy import interp
import numpy as np

y_cv2 = label_binarize(y_cv, list(set(y)))
pred2 = []
for pred in predictions:
    pred2.append(label_binarize(pred, list(set(y))))

n_classes = len(list(set(y)))
plt.figure()    
for j, model in enumerate(trained_models):
    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(len(list(set(y)))):
        fpr[i], tpr[i], _ = roc_curve(y_cv2[:, i], pred2[j][:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    # Compute macro-average ROC curve and ROC area
    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    # Then interpolate all ROC curves at this points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves

    plt.plot(fpr["macro"], tpr["macro"],
         label=model_names[j]+' macro-area = {0:0.2f})'
               ''.format(roc_auc["macro"]),
         linewidth=2)

plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristics to multi-class for ensemble methods')
plt.legend(loc="lower right")
plt.show()

In [None]:
LDAA = grid_search.best_estimator_.steps[0][1].__dict__['transformer_list'][0][1]
def print_top_words(model, feature_names, n_top_words):

    for topic_idx, topic in enumerate(model.components_):
        topic_words = " ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]])
        print("#%d: " % topic_idx + topic_words)
        #print(" ".join([feature_names[i]
        #                for i in topic.argsort()[:-n_top_words - 1:-1]]))
#print_top_words(LDAA.LDA, LDAA.counter.get_feature_names(), 10)

def get_top_words(model, feature_names, n_top_words):
     
    feat = []
    for topic_idx, topic in enumerate(model.components_):
        topic_words = " ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]])
        feat.append("#%d: " % topic_idx + topic_words)
        #print("#%d: " % topic_idx + topic_words)
    return feat
get_top_words(LDAA.LDA, LDAA.counter.get_feature_names(), 10)

In [None]:
feature_names = get_top_words(LDAA.LDA, LDAA.counter.get_feature_names(), 10)
print len(feature_names)
#soa_feat_names = ["soa_prob_"+str(i) for i in range(0, len(set(y)))]
#soac_feat_names = ["soac_prob_"+str(i) for i in range(0, len(set(y)))]
#feature_names += soa_feat_names
#feature_names += soac_feat_names
feature_names = [feat.encode('utf-8') for feat in feature_names]
print len(feature_names)

### Counts + soa + Soac Features

In [None]:
len(X)

In [None]:

import copy
feature_names = copy.deepcopy(countTokens.l)
feature_names += ['numHash', 'numUrl', 'numRep']
#soa_feat_names = ["soa_prob_"+str(i) for i in range(0, len(set(y)))]
#soac_feat_names = ["soac_prob_"+str(i) for i in range(0, len(set(y)))]
#feature_names += soa_feat_names
#feature_names += soac_feat_names
feature_names = [feat.encode('utf-8') for feat in feature_names]
print len(countTokens.l), len(feature_names)

In [None]:
reload(features)
#features.SOAC_Model2.__doc__
soac = features.SOAC_Model2(max_df=1.0, min_df=5, tokenizer_var='sklearn', max_features=5000)
#y[0:10]

In [None]:
XX = [#"I like playing video games very much :).", 
     #"Football games are the best!",
     #"Being young forever is very funny and entertaining",
     # "Football games are the best!",
      "best games",
      "best games",
     #"World leaders should gather and decide for todays meeting!",
     #"Problems nowadays seem to thrive everywhere",
     #"Just got off from work today! Weekend is coming though, so it's alright...",
     #"This weekend we are going of for 3 days..",
     " Weekend alright...",
     " Weekend alright...",
     " Weekend alright...",
     "Awful weather",
     "Awful weather",
     "Awful weather",
     "Awful weather",
     "Awful weather"]
yy = ["18-24",
     "18-24",
     "25-34",
     "25-34",
     "25-34",
     "35-49",
     "35-49",
     "35-49",
     "35-49",
     "35-49",
    ]
#reload(preprocess)
#reload(features)
from pan import features
from pan import preprocess
from sklearn.grid_search import GridSearchCV
XX = preprocess.preprocess(XX)
num_folds = 2
grid_search = GridSearchCV(estimator=pipe, param_grid={}, verbose=1, n_jobs=-1, cv=num_folds, refit=True)
grid_search.fit(XX,yy)
print(grid_search.best_estimator_)
print(grid_search.best_score_)

In [None]:
xx = soac.fit_transform(X[0:10], y[0:10])
print xx
print y[0:10]

In [6]:
soac = features.SOAC_Model2(max_df=1.0, min_df=1, tokenizer_var='sklearn', max_features=None)
combined = FeatureUnion([('soac', soac)])
svm = SVC(kernel='rbf', C=0.1, gamma=1, class_weight='balanced', probability=True)
#combined = FeatureUnion([('count_tokens', countTokens), ('count_hash', countHash),
#                         ('count_urls', countUrls), ('count_replies', countReplies), 
#                          ('soa', soa), ('soac', soac)])
#combined = FeatureUnion([('count_tokens', countTokens), ('count_hash', countHash),
#                         ('count_urls', countUrls), ('count_replies', countReplies)])
pipe1 = Pipeline([('combined',combined), ('svm', svm)])
pipe1.steps

[('combined', FeatureUnion(n_jobs=1,
         transformer_list=[('soac', SOAC_Model2(max_df=1.0, max_features=None, min_df=1, thres=0.1,
        tokenizer_var='sklearn'))],
         transformer_weights=None)),
 ('svm', SVC(C=0.1, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape=None, degree=3, gamma=1, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True,
    tol=0.001, verbose=False))]

In [35]:
svm = SVC(kernel='rbf', C=0.1, gamma=1, class_weight='balanced', probability=True)
soa = features.SOA_Model2(max_df=1.0, min_df=5, tokenizer_var='sklearn', max_features=None)
combined = FeatureUnion([('soa', soa)])
#combined = FeatureUnion([('count_tokens', countTokens), ('count_hash', countHash),
#                         ('count_urls', countUrls), ('count_replies', countReplies), 
#                          ('soa', soa), ('soac', soac)])
#combined = FeatureUnion([('count_tokens', countTokens), ('count_hash', countHash),
#                         ('count_urls', countUrls), ('count_replies', countReplies)])
pipe2= Pipeline([('combined',combined), ('svm', svm)])
pipe2.steps

[('combined', FeatureUnion(n_jobs=1,
         transformer_list=[('soa', SOA_Model2(max_df=1.0, max_features=None, min_df=5, tokenizer_var='sklearn'))],
         transformer_weights=None)),
 ('svm', SVC(C=0.1, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape=None, degree=3, gamma=1, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True,
    tol=0.001, verbose=False))]

In [None]:
from sklearn.grid_search import GridSearchCV

num_folds = 4
grid_search1 = GridSearchCV(estimator=pipe1, param_grid={}, verbose=1, n_jobs=-1, cv=num_folds, refit=True)
grid_search1.fit(X,y)
print(grid_search1.best_estimator_)
print(grid_search1.best_score_)
grid_search2 = GridSearchCV(estimator=pipe2, param_grid={}, verbose=1, n_jobs=-1, cv=num_folds, refit=True)
grid_search2.fit(X,y)
print(grid_search2.best_estimator_)
print(grid_search2.best_score_)

In [59]:
from sklearn.grid_search import GridSearchCV

num_folds = 4
grid_search = GridSearchCV(estimator=pipe1, param_grid={}, verbose=1, n_jobs=-1, cv=num_folds, refit=True)
grid_search.fit(X,y)
print(grid_search.best_estimator_)
print(grid_search.best_score_)

Fitting 4 folds for each of 1 candidates, totalling 4 fits
We are fitting!
[[ 15.57142857   3.11428571   2.3956044    5.45        72.66666667]]We are fitting!
We are fitting!
We are fitting!
We are fitting!
[[ 15.52380952   3.1047619    2.39705882   5.43333333  81.5       ]][[ 15.52380952   3.1047619    2.39705882   5.43333333  81.5       ]][[ 15.61904762   3.12380952   2.39416058   5.46666667  65.6       ]][[ 15.61904762   3.12380952   2.39416058   5.46666667  65.6       ]]



We are transforming!We are transforming!
We are transforming!
We are transforming!

Doc_profDoc_profDoc_profDoc_prof



(326, 5) <type 'numpy.ndarray'>
(328, 5) <type 'numpy.ndarray'>
(328, 5) <type 'numpy.ndarray'>
(326, 5) <type 'numpy.ndarray'>
We are transforming!
We are transforming!
We are transforming!
We are transforming!
Doc_profDoc_profDoc_profDoc_prof



(110, 5) <type 'numpy.ndarray'>
(108, 5) <type 'numpy.ndarray'>
(108, 5) <type 'numpy.ndarray'>
(110, 5) <type 'numpy.ndarray'>

We are transforming!

[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    7.0s finished


In [8]:
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import VotingClassifier
from sklearn.grid_search import GridSearchCV


num_folds = 4
split = 0.2
X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=split, random_state=42, stratify=y)
print len(X_train), len(X_cv), len(X_cv) + len(X_train), len(X)
eclf = VotingClassifier(estimators=[("0", pipe), ('1', pipe1)], voting='soft')
trained_models = []
for model in [pipe, pipe1, eclf]:
    grid_search = GridSearchCV(estimator=model, param_grid={}, verbose=1, n_jobs=-1, cv=num_folds, refit=True)
    grid_search.fit(X_train,y_train)
    print(grid_search.best_score_)
    print(grid_search.best_estimator_) 
    trained_models.append(grid_search.best_estimator_)

348 88 436 436
Fitting 4 folds for each of 1 candidates, totalling 4 fits
We are fitting!
Doc_TermsWe are fitting!
We are fitting!
We are fitting!
We are fitting!
Doc_TermsDoc_TermsDoc_TermsDoc_Terms



(259, 13537)
(261, 14138)
(262, 14058)
(262, 13356)
Doc_Prof
Doc_Prof
Doc_Prof
Doc_Prof
(259, 5) <type 'numpy.ndarray'>
(261, 5) <type 'numpy.ndarray'>
(262, 5) <type 'numpy.ndarray'>
(262, 5) <type 'numpy.ndarray'>
Doc_Term
Doc_Term
Doc_Term
Doc_Term
(259, 13537) <class 'scipy.sparse.csr.csr_matrix'>
(261, 14138) <class 'scipy.sparse.csr.csr_matrix'>
(262, 14058) <class 'scipy.sparse.csr.csr_matrix'>
(262, 13356) <class 'scipy.sparse.csr.csr_matrix'>
Term_Prof
Term_Prof
Term_Prof
Term_Prof
(13537, 5) <type 'numpy.ndarray'>
(14138, 5) <type 'numpy.ndarray'>
(14058, 5) <type 'numpy.ndarray'>
(13356, 5) <type 'numpy.ndarray'>
Random Term_Prof
Random Term_Prof
Random Term_Prof
Random Term_Prof
[ 0.          0.44151325  0.06550793  0.49297881  0.        ]
[ 0.          0.72245332  0.2775466

[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   13.3s finished


We are fitting!
[[ 15.81818182   3.10714286   2.4          5.4375      69.6       ]]We are fitting!
We are fitting!
We are fitting!
We are fitting!
[[ 16.1875       3.08333333   2.39814815   5.39583333  86.33333333]][[ 16.3125       3.10714286   2.39449541   5.4375      65.25      ]][[ 15.41176471   3.11904762   2.40366972   5.45833333  65.5       ]][[ 15.41176471   3.11904762   2.40366972   5.45833333  65.5       ]]



We are transforming!
We are transforming!
We are transforming!
We are transforming!
Doc_profDoc_profDoc_profDoc_prof



(259, 5) <type 'numpy.ndarray'>
(261, 5) <type 'numpy.ndarray'>
(262, 5) <type 'numpy.ndarray'>
(262, 5) <type 'numpy.ndarray'>
We are transforming!
We are transforming!
We are transforming!
We are transforming!
Doc_profDoc_profDoc_profDoc_prof



(89, 5) <type 'numpy.ndarray'>
(87, 5) <type 'numpy.ndarray'>
(86, 5) <type 'numpy.ndarray'>
(86, 5) <type 'numpy.ndarray'>

We are transforming!
Doc_prof
(348, 5) <type 'numpy.ndarray'>
0.454022988506
Pipeli

[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    4.0s finished


We are fitting!
Doc_TermsWe are fitting!
We are fitting!
We are fitting!
We are fitting!
Doc_TermsDoc_TermsDoc_TermsDoc_Terms



(259, 13537)
(261, 14138)
(262, 14058)
(262, 13356)
Doc_Prof
Doc_Prof
Doc_Prof
Doc_Prof
(259, 5) <type 'numpy.ndarray'>
(261, 5) <type 'numpy.ndarray'>
(262, 5) <type 'numpy.ndarray'>
(262, 5) <type 'numpy.ndarray'>
Doc_Term
Doc_Term
Doc_Term
Doc_Term
(259, 13537) <class 'scipy.sparse.csr.csr_matrix'>
(261, 14138) <class 'scipy.sparse.csr.csr_matrix'>
(262, 14058) <class 'scipy.sparse.csr.csr_matrix'>
(262, 13356) <class 'scipy.sparse.csr.csr_matrix'>
Term_Prof
Term_Prof
Term_Prof
Term_Prof
(13537, 5) <type 'numpy.ndarray'>
(14138, 5) <type 'numpy.ndarray'>
(14058, 5) <type 'numpy.ndarray'>
(13356, 5) <type 'numpy.ndarray'>
Random Term_Prof
Random Term_Prof
Random Term_Prof
Random Term_Prof
[ 0.          0.44151325  0.06550793  0.49297881  0.        ]
[ 0.          0.72245332  0.27754668  0.          0.        ]
[ 0.          0.34643333  0.17271636  0.4808503

[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   16.8s finished


In [9]:
predictions = []
for model in trained_models:
    predict = model.predict(X_cv)
    predictions.append(predict)
    acc = accuracy_score(y_cv, predict)
    conf = confusion_matrix(y_cv, predict, labels=list(set(y)))
    print('Accuracy : {}'.format(acc))
    print('Confusion matrix :\n {}'.format(conf))

We are transforming!
Doc_Terms
(88, 16756) <class 'scipy.sparse.csr.csr_matrix'>
SOA Transform:
Doc_prof
(88, 5) <type 'numpy.ndarray'>
[ 3.1563402   3.32837876  3.16486118  2.96139769  1.89792746]
Len Voc: 16756
Accuracy : 0.534090909091
Confusion matrix :
 [[15 12  1  0  0]
 [ 6 29  2  0  0]
 [ 4 10  2  0  0]
 [ 3  2  0  1  0]
 [ 0  1  0  0  0]]
We are transforming!
Doc_prof
(88, 5) <type 'numpy.ndarray'>
Accuracy : 0.511363636364
Confusion matrix :
 [[ 9 17  0  2  0]
 [ 2 33  0  2  0]
 [ 1 14  1  0  0]
 [ 2  2  0  2  0]
 [ 0  1  0  0  0]]
We are transforming!
Doc_Terms
(88, 16756) <class 'scipy.sparse.csr.csr_matrix'>
SOA Transform:
Doc_prof
(88, 5) <type 'numpy.ndarray'>
[ 3.1563402   3.32837876  3.16486118  2.96139769  1.89792746]
Len Voc: 16756
We are transforming!
Doc_prof
(88, 5) <type 'numpy.ndarray'>
Accuracy : 0.568181818182
Confusion matrix :
 [[15 12  1  0  0]
 [ 5 32  0  0  0]
 [ 5  9  2  0  0]
 [ 3  2  0  1  0]
 [ 0  1  0  0  0]]


In [30]:
predictions[2]

array(['35-49', '25-34', '25-34', '25-34', '35-49', '50-64', '35-49',
       '35-49', '35-49', '25-34', '35-49', '25-34', '50-64', '35-49',
       '35-49', '25-34', '25-34', '35-49', '35-49', '35-49', '25-34',
       '25-34', '35-49', '25-34', '25-34', '35-49', '35-49', '35-49',
       '35-49', '35-49', '35-49', '25-34', '35-49', '25-34', '25-34',
       '25-34', '35-49', '25-34', '35-49', '35-49', '25-34', '25-34',
       '35-49', '35-49', '25-34', '25-34', '35-49', '35-49', '35-49',
       '25-34', '50-64', '25-34', '35-49', '25-34', '35-49', '25-34',
       '25-34', '25-34', '25-34', '35-49', '35-49', '25-34', '25-34',
       '18-24', '35-49', '35-49', '25-34', '35-49', '50-64', '25-34',
       '35-49', '35-49', '25-34', '50-64', '35-49', '35-49', '35-49',
       '35-49', '18-24', '35-49', '35-49', '35-49', '25-34', '25-34',
       '25-34', '18-24', '50-64', '50-64', '35-49', '25-34', '35-49',
       '35-49', '25-34', '25-34', '35-49', '35-49', '25-34', '25-34',
       '25-34', '25-

### 3grams + soa + Soac Features

In [62]:
#feature_names = grid_search.best_estimator_.steps[0][1].__dict__['transformer_list'][0][1].get_feature_names()
#print len(set(y))
feature_names = []
soa_feat_names = ["soa_prob_"+str(i) for i in range(0, len(set(y)))]
soac_feat_names = ["soac_prob_"+str(i) for i in range(0, len(set(y)))]
#feature_names += soa_feat_names
feature_names += soac_feat_names
print len(feature_names)
feature_names = [feat.encode('utf-8') for feat in feature_names]

5


In [63]:
a = grid_search.best_estimator_.steps[0][1]
#print a.transform(X).shape

In [64]:
import pandas as pd
data = pd.DataFrame(a.transform(X), columns=feature_names)
data["class"] = y
print(data.describe())

We are transforming!
Doc_prof
(436, 5) <type 'numpy.ndarray'>
       soac_prob_0  soac_prob_1  soac_prob_2  soac_prob_3  soac_prob_4
count   436.000000   436.000000   436.000000   436.000000   436.000000
mean      0.334800     0.333926     0.323147     0.335459     0.333121
std       0.245069     0.370890     0.390482     0.307089     0.231141
min       0.000000     0.000000     0.000000     0.000000     0.000000
25%       0.189950     0.000000     0.000000     0.141166     0.190086
50%       0.289293     0.261512     0.263432     0.289500     0.284627
75%       0.428288     0.512928     0.523927     0.448032     0.394424
max       1.692613     2.068010     3.497002     2.356757     1.905468


In [None]:
soacc = a.transformer_list[0][1]
voc = soacc.counter.vocabulary_
print 'Voc: ' + str(len(voc))
print soacc.term_table.shape
#terms= ['marriage', 'pension']
#graph_matrix = numpy.zeros([len(terms), soacc.term_table.shape[1]])
j = 0 
for term, index in voc.iteritems():
    l = list(soacc.term_table[index,:])
    if l.index(min(l))==3 and  min(l)<0.02 and min(l)!=0:
        print term
        print l
        j += 1
    if j==1000:
        break

In [113]:
import plotly.plotly as py
import plotly.graph_objs as go
import numpy
py.sign_in('Bogas', '9s60rarm2w')
soacc = a.transformer_list[0][1]
voc = soacc.counter.vocabulary_
print 'Voc: ' + str(len(voc))
print soacc.term_table.shape
terms= ['dreamjob','lol', 'mortgage', 'booksellers', 'juvenile']
graph_matrix = numpy.zeros([len(terms), soacc.term_table.shape[1]])
j = 0
for term in terms:
    idx = voc[term]
    print term
    print soacc.term_table[idx,:]
    graph_matrix[j, :] = soacc.term_table[idx,:]
    j += 1
    #plt.bar(numpy.arange(soacc.term_table.shape[1]), soacc.term_table[idx,:], color='r')
    #plt.show()

data = []
names = sorted(list(set(y)))
for i in range(0, soacc.term_table.shape[1]):
    data.append(
        go.Bar(
        x=terms,
        y=graph_matrix[:, i],
        name=names[i]
    )
    )
layout = go.Layout(
    barmode='group'
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)
#plot_url = py.plot(fig, filename='grouped-bar')

Voc: 125532
(125532, 5)
dreamjob
[ 0.01265381  0.26222892  0.31307008  0.22406249  0.18798469]
lol
[ 0.19276541  0.11180198  0.27522844  0.2184271   0.20177707]
mortgage
[ 0.2011231   0.2456243   0.14738652  0.20176227  0.2041038 ]
booksellers
[ 0.20404603  0.2865569   0.3084714   0.00853856  0.1923871 ]
juvenile
[ 0.19876243  0.27913675  0.27862587  0.22337202  0.02010292]


In [10]:
import pandas as pd
from numpy.random import randint
import matplotlib.pyplot as plt

grouped = data.groupby('class')
rowlength = grouped.ngroups/2                         # fix up if odd number of groups
fig, axs = plt.subplots(figsize=(9,4), 
                        nrows=2, ncols=rowlength,     # fix as above
                        gridspec_kw=dict(hspace=0.4)) # Much control of gridspec

targets = zip(grouped.groups.keys(), axs.flatten())
print targets
grouped.get_group('18-24').hist(alpha=0.4)
#for i, (key, ax) in enumerate(targets):
#    ax.plot(grouped.get_group(key))
#    ax.set_title('a=%s'%str(key))
#ax.legend()
#plt.show()

[('25-34', <matplotlib.axes._subplots.AxesSubplot object at 0x7f62e40e63d0>), ('35-49', <matplotlib.axes._subplots.AxesSubplot object at 0x7f62c6f0c290>), ('50-64', <matplotlib.axes._subplots.AxesSubplot object at 0x7f62c6eac590>), ('18-24', <matplotlib.axes._subplots.AxesSubplot object at 0x7f62c6ebddd0>)]


array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f62c6e33390>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f62c6e05c10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f62c6d8c310>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f62c6d02290>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f62c6cd9790>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f62c6c4f810>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f62c6bad110>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f62c6b26350>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f62c6bd6990>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f62c6a799d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f62c69f4950>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f62c69cfe50>]], dtype=object)

In [None]:
grouped = data.groupby('class')
grouped.mean().T

In [77]:
### BAR PLOTS OF MEAN VALUE OF FEATURES FOR EACH CLASS ######

grouped = data.groupby('class')
plt.figure()
grouped.mean().T.plot(kind='bar', figsize=(60,10))
plt.savefig('test1.png')
plt.show()

In [None]:
##### Distribution over a feature for each class #####

In [None]:
import matplotlib.pyplot as plt
grouped = data.groupby('class')
import numpy
ncol = 4 # pick one dimension
nrow = (len(feature_names)+ ncol-1) / ncol # make sure enough subplots
#fig, ax = plt.subplots(nrows=nrow, ncols=ncol, figsize=(6,6)) # create the axes
j = 0
for key in list(data.columns.values):
#    ix = numpy.unravel_index(j, ax.shape)
#    print ix
    print key
    if key!='class':
        j += 1
        plt.figure(j, figsize=(10,10))
        grouped[key].plot(kind='kde', alpha=0.8, legend=grouped.groups.keys(), title=key)
    #g = grouped[key]
    #print grouped[key].mean()
    #if j==1:
    #    tmp = g.mean()
    #else:
    #    print g.mean()
    #    tmp.append(g.mean())
    #print tmp
        plt.show()
    #if j==2:
    #    break
#tmp
    #break
    #ax[ix] = grouped[key].plot(kind='kde', alpha=0.4, legend=grouped.groups.keys())
    #break
#for key in grouped.keys:
#    grouped[key].plot(kind='kde', alpha=0.4, legend=grouped.groups.keys())
#for key in grouped.groups.keys():
#    b = grouped.get_group(key)
#    b.plot('kin')

In [None]:
import numpy
ncol = 4 # pick one dimension
nrow = (len(feature_names)+ ncol-1) / ncol # make sure enough subplots
fig, ax = plt.subplots(nrows=nrow, ncols=ncol) # create the axes
j = 0
for i in feature_names: 
    ix = numpy.unravel_index(j, ax.shape)
    #print ix
    j += 1
    ax[ix] = data.groupby('class').i.hist(alpha=0.4)   # go over a linear list of data # compute an appropriate index (1d or 2d)
    #feat = feature_names[i]
    #data.groupby('class').feat.hist(alpha=0.4, ax=ax[i])
import matplotlib.pyplot as plt
#%matplotlib outline
plt.savefig('CameraEvolution.png', bbox_inches='tight')
plt.show()

In [None]:
clf = grid_search.best_estimator_.steps[1][1]
#import pydot
import pyparsing

#reload(pydot)

In [None]:
import pprint, numpy
from operator import itemgetter

feat_importance = zip(list(numpy.array(feature_names)[numpy.nonzero(clf.feature_importances_)]), list(clf.feature_importances_[numpy.nonzero(clf.feature_importances_)]))
feat_importance = sorted(feat_importance, key=itemgetter(1))[::-1]
feat_importance
#for i in zip(list(numpy.array(feature_names)[numpy.nonzero(clf.feature_importances_)]), list(clf.feature_importances_([numpy.nonzero(clf.feature_importances_)]))):
#    i

In [None]:
>>> with open("iris.dot", 'w') as f:
    f = tree.export_graphviz(clf, out_file=f, feature_names=feature_names,
                         filled=True, rounded=True,  
                         special_characters=True)
#>>> import os
#>>> os.unlink('iris.dot')

In [None]:
>>> from sklearn.externals.six import StringIO
from sklearn import tree
import pydot
>>> from IPython.display import Image  
>>> dot_data = StringIO()  
>>> tree.export_graphviz(clf,  out_file=dot_data,
                         feature_names=feature_names,
                         filled=True, rounded=True,  
                         special_characters=True)  
>>> graph = pydot.graph_from_dot_data(dot_data.getvalue())  
#>>> Image(graph.create_png())   
>>> graph.write_pdf("iris.pdf") 