In [None]:
#!/usr/bin/python

from argparse import ArgumentParser
from pan import ProfilingDataset
from tictacs import from_recipe
from sklearn.grid_search import GridSearchCV
#reload(pan.features)



log = []


def cross_val(dataset, task, model, num_folds=4):
    """ train and cross validate a model

    :lang: the language
    :task: the task we want to classify for , ex: age

    """

    #X, y = dataset.get_data(task)
    docs = createDocProfiles(dataset)
    X, y = create_target_prof_trainset(docs, task)
    del docs
    #return X
    # get parameters for grid search if it exists - else pass empty dict
    params = model.grid_params if hasattr(model, 'grid_params') else dict()
    print '\nCreating model for %s - %s' % (dataset.lang, task)
    print 'Using %s fold validation' % (num_folds)
    # get data
    #log.append('\nResults for %s - %s with classifier %s' %
    #           (dataset.lang, task, model.__class__.__name__))
    if task in dataset.config.classifier_list:
        grid_cv = GridSearchCV(model, params, cv=num_folds, verbose=1,
                               n_jobs=-1)
        grid_cv.fit(X, y)
        accuracy = grid_cv.best_score_
        log.append('best params: %s' % grid_cv.best_params_)
        log.append('Accuracy mean : %s' % accuracy)
    else:
        # if it's not, we measure mean square root error (regression)
        grid_cv = GridSearchCV(model, params, scoring='mean_squared_error',
                               cv=num_folds, verbose=1, n_jobs=-1)
        grid_cv.fit(X, y)
        accuracy = grid_cv.best_score_
        log.append('root mean squared error : %s' % accuracy)

if __name__ == '__main__':
    parser = ArgumentParser(description='Train a model with crossvalidation'
                            ' on pan dataset - used for testing purposes ')
    parser.add_argument('-i', '--input', type=str,
                        required=True, dest='infolder',
                        help='path to folder with pan dataset for a language')
    parser.add_argument('-n', '--numfolds', type=int,
                        dest='num_folds', default=4,
                        help='Number of folds to use in cross validation')

num_folds = 2
infolder = "./pan15-author-profiling-training-dataset-2015-04-23/pan15-author-profiling-training-dataset-english-2015-04-23/"

print('Loading dataset...')
dataset = ProfilingDataset(infolder)
print('Loaded %s users...\n' % len(dataset.entries))
config = dataset.config
tasks = config.tasks
print('\n--------------- Thy time of Running ---------------')
for task in tasks:
    if task == "age":
        tictac = from_recipe(config.recipes[task])
        z = cross_val(dataset, task, tictac, num_folds)
        # print results at end
        print('\n--------------- Thy time of Judgement ---------------')
    for message in log:
        print(message)

In [None]:
import pan
reload(pan.preprocess)
dataset = ProfilingDataset(infolder)
X, y = dataset.get_data(task)
b = [X[0][0:100]]
b.append(X[1][0:100])
print b

In [None]:
tictac.get_params

In [None]:
class DocProfile(object):
    
    """ Per Document Representation. Returns an instance of a document profile.
    
    """
    def __init__(self, entry, prof_id, doc_id):
        """ Initialization.
            -entry : contains most information. Comes from ProfilingDataset Class.
            -prof_id: index for intra-profile document position
            -doc_id: index for global documend indexing
        
        """
        
        self.userid = entry.userid
        self.lang = entry.lang
        self.media = entry.media
        self.gender = entry.gender
        self.age = entry.age
        self.prof_id = prof_id
        self.doc_id = doc_id
        self.text = entry.texts[prof_id]
        
    def __repr__(self):
        """ IPython friendly output
        :returns: str

        """
        # automatically capture all non iterables
        # (we want custom formatting for text list)
        attr_string = '\n'.join(['%s : %s' % (key, value)
                                 for key, value in self.__dict__.items()
                                 if not hasattr(value, '__iter__')])
        # print a snippet
        return attr_string
    
    def datafy(self, feature='none'):
        """Return a tuple of data - training and label if feature is not none

        :feature: the feature we want the label for
        :returns: tuple of data, label

        """
        if feature == 'none':
            return self.text
        else:
            return [self.text, self.__dict__[feature]]

def createDocProfiles(dataset):
    """ Create a list of the DocProfiles classes.
        -dataset: ProfilingDataset Object
        
        returns:
        -a list of DocProfile Objects
    """
    docs = []       
    doc_id = 0
    for entry in dataset.entries:
        for prof_id in range(0, len(entry.texts)):
            docs.append(DocProfile(entry, prof_id, doc_id))
            doc_id += 1
    return docs
    
def create_target_prof_trainset(docs, target_feature):
    """ Create a dataset according to train a specifici model regardin a certain feature.
        Like get_data() method from ProfilingDataset class.
        -docs: list of documents. Expects instances of class DocProfile. 
        -target_feature: filter feature
        
        returns:
        (X,y) : returns tuple - list of texts, list of labels 
        
    """
    wanted = []
    for doc in docs:
        if target_feature in doc.__dict__:
            wanted.append(doc.datafy(feature=target_feature))
        else:
            raise KeyError("task doesn't exist in DocProfile dic()")
    # zip produces tuples, we want to be able to modify
    # the contents in preprocessing in place
    # therefore we create we replace tuples with lists using map
    # returns tuple - list of texts, list of labels
    return map(list, zip(*wanted))

        
docs = createDocProfiles(dataset)

In [None]:
import numpy
a = [[0.25,0.25,0.25,0.25], [0.5,0,0.2,0.25], [0.2,0.3,0,0.5]]
b = [[1,0], [0,1],[1,0]]
numpy.dot(numpy.array(a).T,numpy.array(b))

In [None]:
task = 'age'
docs = createDocProfiles(dataset)
X, y = create_target_prof_trainset(docs, task)

In [None]:
from pan.misc import *
class SOA_Model2(object):


    """ Models that extracts Second Order Attributes (SOA) base on PAN 2013-2015 Winners"""

    def __init__(self):
        from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
        
        #stop_list = []
        #with open(stopwords_path, 'r') as stop_inp:
       # for w in stop_inp:
       # stop_list.append(w.replace("\n", ""))
        self.term_table = None
        self.labels = None
        #self.counter = CountVectorizer()
        self.counter = TfidfVectorizer(use_idf=False)

    def fit(self, X, y=None):
        import numpy
        from math import log
        
        if y:
            #tokens = [_twokenize.tokenizeRawTweetText(text) for text in X]
            #voc = set()
            #for token in tokens:
            #    voc = voc.union(token)
            #print len(voc)
            #print list(voc)[:100]
            parameters = {
                'input':'content', 
                'encoding':'utf-8', 
                'decode_error':'ignore', 
                #'vocabulary':list(voc),
                'tokenizer':lambda text:_twokenize.tokenizeRawTweetText(text)
                #'max_df':0.9,
                #'min_df':5
                #'max_features':20000
               }
            self.counter.set_params(**parameters) 
            #print "Oleeeeeeeeeeeeeeeeeeeeeeeeee"
            #print texts
            #print tokens
            #print list(voc)
            target_profiles = sorted(list(set(y)))
            print len(target_profiles)
            #return
            doc_term = self.counter.fit_transform(X)
            print "Doc_Terms"
            print doc_term.shape
            #return 
            #X1 = X.toarray()
            #X1 = X1.astype('float', casting='unsafe')
            target_profiles = sorted(list(set(y)))
            self.labels = target_profiles
            doc_prof = numpy.zeros([doc_term.shape[0], len(target_profiles)])
            for i in range(0, doc_term.shape[0]):
                tmp = numpy.zeros([1,len(target_profiles)])
                tmp[0, target_profiles.index(y[i])] = 1
                doc_prof[i,:] = tmp
            print "Doc_Prof"
            print doc_prof.shape
            term_prof = numpy.zeros([doc_term.shape[1], len(target_profiles)])
            term_prof = numpy.dot(numpy.log2(doc_term.toarray().astype('float', casting='unsafe').T + 1), doc_prof)
            print "Term_Prof"
            print term_prof.shape
            term_prof = term_prof / numpy.reshape(term_prof.sum(axis=1), (term_prof.sum(axis=1).shape[0], 1))
            #term_prof = term_prof / term_prof.sum(axis=0)
            self.term_table = term_prof
            print "GG"
            return self
    
    def transform(self, X):
        
        import numpy
        
        if self.labels==None:
            raise AttributeError('term_table was no found! Probably model was not fitted first. Run model.fit(X,y)!')
        else:
            doc_term = self.counter.transform(X)
            doc_prof = numpy.zeros([doc_term.shape[0], self.term_table.shape[1]])
            doc_prof = numpy.dot(doc_term.toarray().astype('float', casting='unsafe'), self.term_table)
            return doc_prof
            
    def predict(self, X):
        
        import numpy
        
        doc_prof = self.transform(X)
        y_pred = []
        for i in range(0, doc_prof.shape[0]):
            y_pred.append(self.labels[numpy.argmax(doc_prof[i])])
        return y_pred
        

In [None]:
from pan.misc import _twokenize
import pan
reload(pan)
c = SOA_Model2()
c.fit(X,y)

In [None]:
y_pred = c.predict(X)


In [None]:
for k, v in c.counter.vocabulary_.iteritems():
    if v>8000 and v< 9000:
        pass
        #print k, v
top_words = [[] for i in range(0, c.term_table.shape[1])]
cc= 0
for i in range(0, c.term_table.shape[0]):
    if max(c.term_table[i]) > 0.7:
        #print c.term_table[i], c.counter.vocabulary_.keys()[c.counter.vocabulary_.values().index(i)]
        top_words[list(c.term_table[i]).index(max(c.term_table[i]))].append(c.counter.vocabulary_.keys()[c.counter.vocabulary_.values().index(i)])
        cc += 1
top_words
        #c.term_table /= c.term_table[8411].sum(axis= 0)
#c.term_table[8411]

In [None]:
import numpy
tmp = numpy.zeros([2,4])
tmp[0,1]=1
tmp[0,3]= 1
tmp[1,2] = 1
tmp / numpy.reshape(tmp.sum(axis=1), (tmp.sum(axis=1).shape[0], 1))

In [None]:
import pan
reload(pan.features)
c = pan.features.SOA_Model2()
c.fit_transform(X, y)
a = ["I am very good!"]
#c.transform(X, y)
#from pprint import pprint
#pprint(dataset.get_data()[0])
#pprint(dataset.entries[0].texts[0])

In [None]:
X[0]

In [None]:
import numpy
aaa = numpy.asarray([[1, 2], [3, 4]], dtype=float)
bb = aaa.sum(axis=1)
print numpy.reshape(bb, (1,2))
print aaa/numpy.reshape(bb, (bb.shape[0],1))
print aaa/aaa.sum(axis=0)

In [None]:
import numpy
from sklearn.preprocessing import normalize
aaa = numpy.asarray([[1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=float)
print "sci-kit"
cc = normalize(aaa, axis=1, norm='l1')
print normalize(aaa, axis=1, norm='l1')
print normalize(cc, axis=0, norm='l1')
print numpy.sum(aaa,axis=1, keepdims=True)
print numpy.linalg.norm(aaa, axis=1)
aaa = numpy.true_divide(aaa, numpy.sum(aaa,axis=1, keepdims=True), dtype=float)
print numpy.sum(aaa,axis=0, keepdims=True)
aaa = numpy.true_divide(aaa, numpy.sum(aaa,axis=0, keepdims=True), dtype=float)
print aaa

In [None]:
#import pan
reload(pan.features)
log = []
#import logging
#log = logging.getLogger()
#log.setLevel(logging.INFO)
#log.addHandler(logging.StreamHandler())
infolder = "./pan15-author-profiling-training-dataset-2015-04-23/pan15-author-profiling-training-dataset-english-2015-04-23/"
modelfile
print('Loading dataset...')
dataset = ProfilingDataset(infolder)
print('Loaded %s users...\n' % len(dataset.entries))
config = dataset.config
tasks = config.tasks
all_models = {}
for task in tasks:
    print('Learning to judge %s..' % task)
    # load data
    if task == "age":
        X, y = dataset.get_data(task)
        tictac = from_recipe(config.recipes[task])
        all_models[task] = tictac.fit(X, y)
print('Writing model to {}'.format(modelfile))

In [None]:
import tictacs
tictacs.__file__

In [None]:
a = set()
a.add(1)
a.add(2)
a.add(3)
print(a[0:2])

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
a = CountVectorizer(input=)

In [None]:
reload(pan.features)
c = pan.features.SOA_Model()
a = ["I am very good!"]
#aa = c.fit_transform(a)
print b
c.fit([b])
print aa
print c.counter.vocabulary_
kk = c.transform([b])
print kk

In [None]:
import gensim

documents = ["Human machine interface for lab abc computer applications",
              "A survey of user opinion of computer system response time",
             "The EPS user interface management system",
              "System and human system engineering testing of EPS",
              "Relation of user perceived response time to error measurement",
            "The generation of random binary unordered trees",
              "The intersection graph of paths in trees",
             "Graph minors IV Widths of trees and well quasi ordering",
              "Graph minors A survey"]
 # remove common words and tokenize
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
          for document in documents]

# remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
        
texts = [[token for token in text if frequency[token] > 1]
         for text in texts]
from pprint import pprint   # pretty-printer
pprint(texts)
dictionary = gensim.corpora.Dictionary(texts)

In [None]:
corpus = [dictionary.doc2bow(text) for text in texts]
model = gensim.models.LdaModel(corpus, id2word=dictionary, num_topics=5)
model.print_topics()

In [None]:
model.log_perplexity(corpus)

In [None]:
pow(2,11)

In [None]:
#!/usr/bin/python

from argparse import ArgumentParser
from pan import ProfilingDataset
from tictacs import from_recipe
from sklearn.grid_search import GridSearchCV
#reload(pan.features)



log = []


def cross_val(dataset, task, model, num_folds=4):
    """ train and cross validate a model

    :lang: the language
    :task: the task we want to classify for , ex: age

    """

    #X, y = dataset.get_data(task)
    docs = createDocProfiles(dataset)
    X, y = create_target_prof_trainset(docs, task)
    del docs
    #return X
    # get parameters for grid search if it exists - else pass empty dict
    params = model.grid_params if hasattr(model, 'grid_params') else dict()
    print '\nCreating model for %s - %s' % (dataset.lang, task)
    print 'Using %s fold validation' % (num_folds)
    # get data
    #log.append('\nResults for %s - %s with classifier %s' %
    #           (dataset.lang, task, model.__class__.__name__))
    if task in dataset.config.classifier_list:
        grid_cv = GridSearchCV(model, params, cv=num_folds, verbose=1,
                               n_jobs=-1)
        grid_cv.fit(X, y)
        accuracy = grid_cv.best_score_
        log.append('best params: %s' % grid_cv.best_params_)
        log.append('Accuracy mean : %s' % accuracy)
    else:
        # if it's not, we measure mean square root error (regression)
        grid_cv = GridSearchCV(model, params, scoring='mean_squared_error',
                               cv=num_folds, verbose=1, n_jobs=-1)
        grid_cv.fit(X, y)
        accuracy = grid_cv.best_score_
        log.append('root mean squared error : %s' % accuracy)

if __name__ == '__main__':
    parser = ArgumentParser(description='Train a model with crossvalidation'
                            ' on pan dataset - used for testing purposes ')
    parser.add_argument('-i', '--input', type=str,
                        required=True, dest='infolder',
                        help='path to folder with pan dataset for a language')
    parser.add_argument('-n', '--numfolds', type=int,
                        dest='num_folds', default=4,
                        help='Number of folds to use in cross validation')

num_folds = 2
infolder = "./pan15-author-profiling-training-dataset-2015-04-23/pan15-author-profiling-training-dataset-english-2015-04-23/"

print('Loading dataset...')
dataset = ProfilingDataset(infolder)
print('Loaded %s users...\n' % len(dataset.entries))
config = dataset.config
tasks = config.tasks
print('\n--------------- Thy time of Running ---------------')
for task in tasks:
    if task == "age":
        tictac = from_recipe(config.recipes[task])
        z = cross_val(dataset, task, tictac, num_folds)
        # print results at end
        print('\n--------------- Thy time of Judgement ---------------')
    for message in log:
        print(message)


all_models = {}
docs = createDocProfiles(dataset)
for task in tasks:
    if task =='age':
        print('Learning to judge %s..' % task)
        # load data
        X, y = create_target_prof_trainset(docs, task)
        #X, y = dataset.get_data(task)
        tictac = from_recipe(config.recipes[task])
        all_models[task] = tictac.fit(X, y)

In [None]:

from gensim import corpora, models, similarities

documents = ["Human machine interface for lab abc computer applications",
              "A survey of user opinion of computer system response time",
              "The EPS user interface management system",
             "System and human system engineering testing of EPS",
             "Relation of user perceived response time to error measurement",
              "The generation of random binary unordered trees",
             "The intersection graph of paths in trees",
              "Graph minors IV Widths of trees and well quasi ordering",
              "Graph minors A survey"]
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
          for document in documents]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [None]:
import gensim

model =gensim.models.LdaModel(corpus, id2word=dictionary, num_topics=100, minimum_probability=0)
a = model[corpus[0]]

In [None]:
lsi = gensim.models.LsiModel(corpus, id2word=dictionary, num_topics=2)
lsi[corpus[2]]
import numpy
#len(dictionary)

In [None]:
c = lsi[corpus]
l = [list(zip(*cc)[1]) for cc in c]
#l = []
#for cc in c:
    #print cc
#    l.append(list(zip(*cc)[1]))
print numpy.array(l)
    #print list(zip(*cc)[1])
    #for k in cc:
    #    print k

In [None]:
import pan.features
reload(pan.features)
pan.features.TWCNB.__dict__

In [None]:
import time
from argparse import ArgumentParser
from pan import ProfilingDataset, createDocProfiles, create_target_prof_trainset
from tictacs import from_recipe
from json import dumps
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score

log = []


def cross_val(dataset, task, model, num_folds=4):
    """ train and cross validate a model

    :lang: the language
    :task: the task we want to classify for , ex: age

    """

    # if (task != "age") and (task !="gender"):
    #    X, y = dataset.get_data(task)
    # else:
    #    docs = createDocProfiles(dataset)
    #    X, y = create_target_prof_trainset(docs, task)
    X, y = dataset.get_data(task)
    # y = [yy.lower() for yy in y]
    # get parameters for grid search if it exists - else pass empty dict
    params = model.grid_params if hasattr(model, 'grid_params') else dict()
    # from collections import Counter
    # import pprint
    # pprint.pprint(Counter(y))
    print '\nCreating model for %s - %s' % (dataset.lang, task)
    print 'Trainining instances: %s\n' % (len(X))
    print 'Using %s fold validation' % (num_folds)
    # get data
    log.append('\nResults for %s - %s with classifier %s' %
               (dataset.lang, task, model.__class__.__name__))
    if task in dataset.config.classifier_list:
        grid_cv = GridSearchCV(model, params, cv=num_folds, verbose=1,
                               n_jobs=-1)
        grid_cv.fit(X, y)
        # y_pred = grid_cv.best_estimator_.predict(X)
        # pprint.pprint(y_pred)
        # pprint.pprint(y)
        # conf = confusion_matrix(y, y_pred, labels=list(set(y)))
        accuracy = grid_cv.best_score_
        # accuracy2 = accuracy_score(y, y_pred)
        log.append('best params: %s' % grid_cv.best_params_)
        log.append('Accuracy mean : %s' % accuracy)
        import pprint
        pprint.pprint(grid_cv.grid_scores_)
        with open('./comb_res/res.txt', 'a') as out:
            out.write(' Results: %s - %s, params: %s ,Accuracy_Mean: %s\n' %
                      (dataset.lang, task,
                       dumps(grid_cv.best_params_), grid_cv.best_score_))
        # log.append('Best accuracy: {} '.format(accuracy2))
        # log.append('Best Confusion matrix :\n {}'.format(conf))
    else:
        # if it's not, we measure mean square root error (regression)
        raise KeyError('task %s was not found in task list!' % task)



infolder = '../DATA/pan16-author-profiling-training-dataset-2016-02-29/pan16-author-profiling-training-dataset-english-2016-02-29/'
num_folds = 3
time_start = time.time()
print('Loading dataset...')
dataset = ProfilingDataset(infolder)
print('Loaded %s users...\n' % len(dataset.entries))
config = dataset.config
tasks = config.tasks
print('\n--------------- Thy time of Running ---------------')
for task in tasks:
    tictac = from_recipe(config.recipes[task])
    import pprint
    #pprint.pprint(tictac.__dict__)
    #exit(1)
    steps = tictac.steps
    #print type(steps)
    outline = ""
    for step in steps:
        if step[0]=="features":
            # print type(step[1])
            for tf in step[1].transformer_list:
                #print type(tf[1])
                #print type(tf[1].get_params())
                outline += tf[0] + " with Params:[" + str(tf[1].get_params()) + "]+"
        else:
#            if hasattr(step[1], 'get_params'):
#                outline += step[0] + " with Params:[" + str(step[1].get_params()) + "]+"
#            else:
#                outline += step[0]+ "+"
            outline += step[0]+ "+"
    outline = outline[:-1] + "\n"
    print('Task:{}, Pipeline:{}'.format(task, outline))
    with open('./comb_res/res.txt', 'a') as out:
        out.write('Task:{}, Pipeline:{}'.format(task, outline))
    cross_val(dataset, task, tictac, num_folds)
# print results at end
print('\n--------------- Thy time of Judgement ---------------')
print ('Time: {} seconds.\n'.format(str(time.time()-time_start)))
with open('./comb_res/res.txt', 'a') as out:
    out.write('Time: {} seconds.\n'.format(str(time.time()-time_start)))
for message in log:
    print(message)


In [None]:
import dill
dill.pickles(tictac)
dill.detect.badtypes(tictac).__dict__.keys()

In [None]:
##### TRAIN ############


#!/usr/bin/python

import os
from argparse import ArgumentParser
from sklearn.externals import joblib
from tictacs import from_recipe
from pan import ProfilingDataset
import dill
import cPickle as pickle
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.metrics import accuracy_score, confusion_matrix


infolder = "../pan16-author-profiling-training-dataset-2016-02-29/pan16-author-profiling-training-dataset-english-2016-02-29/"
outfolder = "models/"
print('Loading dataset->Grouping User texts.\n')
dataset = ProfilingDataset(infolder)
print('Loaded {} users...\n'.format(len(dataset.entries)))
# get config
config = dataset.config
tasks = config.tasks
print('\n--------------- Thy time of Running ---------------')
all_models = {}
for task in tasks:
    print('Learning to judge %s..' % task)
    # load data
    X, y = dataset.get_data(task)
    tictac = from_recipe(config.recipes[task])
    all_models[task] = tictac.fit(X, y)
modelfile = os.path.join(outfolder, '%s2.bin' % dataset.lang)
print('Writing model to {}'.format(modelfile))
#fo = open(modelfile,  'wb')
#import pprint
#print type(all_models)
#print modelfile
#dill.dump(all_models, fo, protocol=pickle.HIGHEST_PROTOCOL)
#fo.close()
# pickle.dump(all_models, modelfile)
# dill.dump(all_models, modelfile)
joblib.dump(all_models, modelfile, compress=3)

In [None]:
import numpy

a = numpy.array([[1,2],[3,4]], dtype=float)
b = numpy.array([[0.1,0.2],[0.3,0.4]], dtype=float)
type(a[0,0])

In [None]:
a=1

In [None]:
a = numpy.array([1,2,3,4])
print a.shape
b = numpy.tile(a, (5, 1))
b

In [None]:
c = b.sum(axis=1)
print c.shape, type(c)

In [None]:
from sklearn.preprocessing import normalize
import pprint
pprint.pprint(a)
normalize(a, norm='l1', axis=1, copy=False)
pprint.pprint(a)

In [57]:
import os
from argparse import ArgumentParser
from sklearn.externals import joblib
from tictacs import from_recipe
from pan import ProfilingDataset
import dill
import cPickle as pickle
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.metrics import accuracy_score, confusion_matrix


infolder = "../DATA/pan16-author-profiling-training-dataset-2016-04-25/pan16-author-profiling-training-dataset-english-2016-02-29/"
outfolder = "models/"
print('Loading dataset->Grouping User texts.\n')
dataset = ProfilingDataset(infolder)
print('Loaded {} users...\n'.format(len(dataset.entries)))
# get config
config = dataset.config
tasks = config.tasks
print('\n--------------- Thy time of Running ---------------')
all_models = {}
for task in tasks:
    print('Learning to judge %s..' % task)
    # load data
    X, y = dataset.get_data(task)

Loading dataset->Grouping User texts.

Loaded 436 users...


--------------- Thy time of Running ---------------
Learning to judge age..
Learning to judge gender..


In [3]:
from sklearn.cross_validation import train_test_split
from collections import Counter
import pprint
print "Num of samples: " + str(len(y))
pprint.pprint(Counter(y))
X, y = dataset.get_data('age')
print len(X)

X, X_cv, X, y_cv = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)
X_cv, X_test, y_cv, y_test = train_test_split(X_cv, y_cv, test_size=0.5, random_state=42, stratify=y_cv)

print len(X_cv), len(X_test), len(X) , len(X)+ len(X_cv) + len(X_test)
pprint.pprint(Counter(y))
pprint.pprint(Counter(y_cv))
pprint.pprint(Counter(y_test))

Num of samples: 436
Counter({'35-49': 182, '25-34': 140, '50-64': 80, '18-24': 28, '65-xx': 6})
436
87 87 262 436
Counter({'35-49': 182, '25-34': 140, '50-64': 80, '18-24': 28, '65-xx': 6})
Counter({'35-49': 36, '25-34': 28, '50-64': 16, '18-24': 6, '65-xx': 1})
Counter({'35-49': 37, '25-34': 28, '50-64': 16, '18-24': 5, '65-xx': 1})


In [58]:
#reload(preprocess)
#reload(features)
from pan import features
from pan import preprocess
X, y = dataset.get_data('age')
#X, y = dataset.get_data('gender')
print len(X)
#print X[0]
X = preprocess.preprocess(X)
#print "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$"
#print X[0]

436
    -Cleaning html
    -Detwittifying
    -Removing Numbers
    -Removing Punctuation
    -Removing Links


### 3grams+soa+soac

In [4]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

grams3 = TfidfVectorizer(analyzer='word', ngram_range=[3,3], max_features=5000, stop_words='english')
soa = features.SOA_Model2(max_df=1.0, min_df=5, tokenizer_var='sklearn', max_features=None)
soac = features.SOAC_Model2(max_df=1.0, min_df=5, tokenizer_var='sklearn', max_features=5000)
countTokens = features.CountTokens()
countHash = features.CountHash()
countUrls = features.CountURLs()
countReplies = features.CountReplies()
svm = SVC(kernel='rbf', C=1, gamma=1, class_weight='balanced', probability=True)
#svm = DecisionTreeClassifier()
combined = FeatureUnion([('3grams', grams3), ('soa', soa)])
pipe = Pipeline([('combined',combined), ('svm', svm)])
pipe.steps

[('combined', FeatureUnion(n_jobs=1,
         transformer_list=[('3grams', TfidfVectorizer(analyzer='word', binary=False, decode_error=u'strict',
          dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
          lowercase=True, max_df=1.0, max_features=5000, min_df=1,
          ngram_range=[3, 3], norm=u'l2', preprocessor=None, smo...ulary=None)), ('soa', SOA_Model2(max_df=1.0, max_features=None, min_df=5, tokenizer_var='sklearn'))],
         transformer_weights=None)),
 ('svm', SVC(C=1, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape=None, degree=3, gamma=1, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True,
    tol=0.001, verbose=False))]

### Counts + SOA+SOAC. Ommit preprocess!!


In [71]:
reload(features)
features.SOAC_Model2.__doc__

' Complementary of SOA model 22'

In [5]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


grams3 = TfidfVectorizer(analyzer='word', ngram_range=[3,3], max_features=5000, stop_words='english')
countTokens = features.CountTokens()
countHash = features.CountHash()
countUrls = features.CountURLs()
countReplies = features.CountReplies()
soa = features.SOA_Model2(max_df=1.0, min_df=5, tokenizer_var='sklearn', max_features=None)
soac = features.SOAC_Model2(max_df=1.0, min_df=5, tokenizer_var='sklearn', max_features=5000)
scaler = StandardScaler()#MinMaxScaler()#StandardScaler()
#svm = DecisionTreeClassifier()
svm = SVC(kernel='rbf', C=0.1, gamma=1, class_weight='balanced', probability=True)
#combined = FeatureUnion([('soa', soa), ('soac', soac)])
#combined = FeatureUnion([('count_tokens', countTokens), ('count_hash', countHash),
#                         ('count_urls', countUrls), ('count_replies', countReplies), 
#                          ('soa', soa), ('soac', soac)])
combined = FeatureUnion([('count_tokens', countTokens), ('count_hash', countHash),
                         ('count_urls', countUrls), ('count_replies', countReplies)])
pipe = Pipeline([('3grams', grams3), ('svm', svm)])
#pipe = Pipeline([('soac',soac), ('svm', svm)])
#pipe = Pipeline([('combined',combined), ('svm', svm)])
pipe.steps

[('3grams',
  TfidfVectorizer(analyzer='word', binary=False, decode_error=u'strict',
          dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
          lowercase=True, max_df=1.0, max_features=5000, min_df=1,
          ngram_range=[3, 3], norm=u'l2', preprocessor=None, smooth_idf=True,
          stop_words='english', strip_accents=None, sublinear_tf=False,
          token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
          vocabulary=None)),
 ('svm', SVC(C=0.1, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape=None, degree=3, gamma=1, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True,
    tol=0.001, verbose=False))]

In [None]:
### LDA

In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from pan.features import LDA

LDAmodel = LDA(num_topics=30, lib='sklearn')
soa = features.SOA_Model2(max_df=1.0, min_df=5, tokenizer_var='sklearn', max_features=None)
soac = features.SOAC_Model2(max_df=1.0, min_df=5, tokenizer_var='sklearn', max_features=5000)
countTokens = features.CountTokens()
countHash = features.CountHash()
countUrls = features.CountURLs()
countReplies = features.CountReplies()
#svm = SVC(kernel='rbf', C=1, gamma=1, class_weight='balanced')
svm = DecisionTreeClassifier()
combined = FeatureUnion([('LDA', LDAmodel)])#, ('soa', soa), ('soac', soac)])
pipe = Pipeline([('combined',combined), ('svm', svm)])
pipe.steps

In [None]:
LDAA = grid_search.best_estimator_.steps[0][1].__dict__['transformer_list'][0][1]
def print_top_words(model, feature_names, n_top_words):

    for topic_idx, topic in enumerate(model.components_):
        topic_words = " ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]])
        print("#%d: " % topic_idx + topic_words)
        #print(" ".join([feature_names[i]
        #                for i in topic.argsort()[:-n_top_words - 1:-1]]))
#print_top_words(LDAA.LDA, LDAA.counter.get_feature_names(), 10)

def get_top_words(model, feature_names, n_top_words):
     
    feat = []
    for topic_idx, topic in enumerate(model.components_):
        topic_words = " ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]])
        feat.append("#%d: " % topic_idx + topic_words)
        #print("#%d: " % topic_idx + topic_words)
    return feat
get_top_words(LDAA.LDA, LDAA.counter.get_feature_names(), 10)

In [None]:
feature_names = get_top_words(LDAA.LDA, LDAA.counter.get_feature_names(), 10)
print len(feature_names)
#soa_feat_names = ["soa_prob_"+str(i) for i in range(0, len(set(y)))]
#soac_feat_names = ["soac_prob_"+str(i) for i in range(0, len(set(y)))]
#feature_names += soa_feat_names
#feature_names += soac_feat_names
feature_names = [feat.encode('utf-8') for feat in feature_names]
print len(feature_names)

### Counts + soa + Soac Features

In [None]:
len(X)

In [None]:

import copy
feature_names = copy.deepcopy(countTokens.l)
feature_names += ['numHash', 'numUrl', 'numRep']
#soa_feat_names = ["soa_prob_"+str(i) for i in range(0, len(set(y)))]
#soac_feat_names = ["soac_prob_"+str(i) for i in range(0, len(set(y)))]
#feature_names += soa_feat_names
#feature_names += soac_feat_names
feature_names = [feat.encode('utf-8') for feat in feature_names]
print len(countTokens.l), len(feature_names)

In [None]:
reload(features)
#features.SOAC_Model2.__doc__
soac = features.SOAC_Model2(max_df=1.0, min_df=5, tokenizer_var='sklearn', max_features=5000)
#y[0:10]

In [None]:
XX = [#"I like playing video games very much :).", 
     #"Football games are the best!",
     #"Being young forever is very funny and entertaining",
     # "Football games are the best!",
      "best games",
      "best games",
     #"World leaders should gather and decide for todays meeting!",
     #"Problems nowadays seem to thrive everywhere",
     #"Just got off from work today! Weekend is coming though, so it's alright...",
     #"This weekend we are going of for 3 days..",
     " Weekend alright...",
     " Weekend alright...",
     " Weekend alright...",
     "Awful weather",
     "Awful weather",
     "Awful weather",
     "Awful weather",
     "Awful weather"]
yy = ["18-24",
     "18-24",
     "25-34",
     "25-34",
     "25-34",
     "35-49",
     "35-49",
     "35-49",
     "35-49",
     "35-49",
    ]
#reload(preprocess)
#reload(features)
from pan import features
from pan import preprocess
from sklearn.grid_search import GridSearchCV
XX = preprocess.preprocess(XX)
num_folds = 2
grid_search = GridSearchCV(estimator=pipe, param_grid={}, verbose=1, n_jobs=-1, cv=num_folds, refit=True)
grid_search.fit(XX,yy)
print(grid_search.best_estimator_)
print(grid_search.best_score_)

In [None]:
xx = soac.fit_transform(X[0:10], y[0:10])
print xx
print y[0:10]

In [6]:
soac = features.SOAC_Model2(max_df=1.0, min_df=1, tokenizer_var='sklearn', max_features=None)
combined = FeatureUnion([('soac', soac)])
svm = SVC(kernel='rbf', C=0.1, gamma=1, class_weight='balanced', probability=True)
#combined = FeatureUnion([('count_tokens', countTokens), ('count_hash', countHash),
#                         ('count_urls', countUrls), ('count_replies', countReplies), 
#                          ('soa', soa), ('soac', soac)])
#combined = FeatureUnion([('count_tokens', countTokens), ('count_hash', countHash),
#                         ('count_urls', countUrls), ('count_replies', countReplies)])
pipe1 = Pipeline([('combined',combined), ('svm', svm)])
pipe1.steps

[('combined', FeatureUnion(n_jobs=1,
         transformer_list=[('soac', SOAC_Model2(max_df=1.0, max_features=None, min_df=1, thres=0.1,
        tokenizer_var='sklearn'))],
         transformer_weights=None)),
 ('svm', SVC(C=0.1, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape=None, degree=3, gamma=1, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True,
    tol=0.001, verbose=False))]

In [35]:
svm = SVC(kernel='rbf', C=0.1, gamma=1, class_weight='balanced', probability=True)
soa = features.SOA_Model2(max_df=1.0, min_df=5, tokenizer_var='sklearn', max_features=None)
combined = FeatureUnion([('soa', soa)])
#combined = FeatureUnion([('count_tokens', countTokens), ('count_hash', countHash),
#                         ('count_urls', countUrls), ('count_replies', countReplies), 
#                          ('soa', soa), ('soac', soac)])
#combined = FeatureUnion([('count_tokens', countTokens), ('count_hash', countHash),
#                         ('count_urls', countUrls), ('count_replies', countReplies)])
pipe2= Pipeline([('combined',combined), ('svm', svm)])
pipe2.steps

[('combined', FeatureUnion(n_jobs=1,
         transformer_list=[('soa', SOA_Model2(max_df=1.0, max_features=None, min_df=5, tokenizer_var='sklearn'))],
         transformer_weights=None)),
 ('svm', SVC(C=0.1, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape=None, degree=3, gamma=1, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True,
    tol=0.001, verbose=False))]

In [None]:
from sklearn.grid_search import GridSearchCV

num_folds = 4
grid_search1 = GridSearchCV(estimator=pipe1, param_grid={}, verbose=1, n_jobs=-1, cv=num_folds, refit=True)
grid_search1.fit(X,y)
print(grid_search1.best_estimator_)
print(grid_search1.best_score_)
grid_search2 = GridSearchCV(estimator=pipe2, param_grid={}, verbose=1, n_jobs=-1, cv=num_folds, refit=True)
grid_search2.fit(X,y)
print(grid_search2.best_estimator_)
print(grid_search2.best_score_)

In [59]:
from sklearn.grid_search import GridSearchCV

num_folds = 4
grid_search = GridSearchCV(estimator=pipe1, param_grid={}, verbose=1, n_jobs=-1, cv=num_folds, refit=True)
grid_search.fit(X,y)
print(grid_search.best_estimator_)
print(grid_search.best_score_)

Fitting 4 folds for each of 1 candidates, totalling 4 fits
We are fitting!
[[ 15.57142857   3.11428571   2.3956044    5.45        72.66666667]]We are fitting!
We are fitting!
We are fitting!
We are fitting!
[[ 15.52380952   3.1047619    2.39705882   5.43333333  81.5       ]][[ 15.52380952   3.1047619    2.39705882   5.43333333  81.5       ]][[ 15.61904762   3.12380952   2.39416058   5.46666667  65.6       ]][[ 15.61904762   3.12380952   2.39416058   5.46666667  65.6       ]]



We are transforming!We are transforming!
We are transforming!
We are transforming!

Doc_profDoc_profDoc_profDoc_prof



(326, 5) <type 'numpy.ndarray'>
(328, 5) <type 'numpy.ndarray'>
(328, 5) <type 'numpy.ndarray'>
(326, 5) <type 'numpy.ndarray'>
We are transforming!
We are transforming!
We are transforming!
We are transforming!
Doc_profDoc_profDoc_profDoc_prof



(110, 5) <type 'numpy.ndarray'>
(108, 5) <type 'numpy.ndarray'>
(108, 5) <type 'numpy.ndarray'>
(110, 5) <type 'numpy.ndarray'>

We are transforming!

[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    7.0s finished


In [8]:
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import VotingClassifier
from sklearn.grid_search import GridSearchCV


num_folds = 4
split = 0.2
X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=split, random_state=42, stratify=y)
print len(X_train), len(X_cv), len(X_cv) + len(X_train), len(X)
eclf = VotingClassifier(estimators=[("0", pipe), ('1', pipe1)], voting='soft')
trained_models = []
for model in [pipe, pipe1, eclf]:
    grid_search = GridSearchCV(estimator=model, param_grid={}, verbose=1, n_jobs=-1, cv=num_folds, refit=True)
    grid_search.fit(X_train,y_train)
    print(grid_search.best_score_)
    print(grid_search.best_estimator_) 
    trained_models.append(grid_search.best_estimator_)

348 88 436 436
Fitting 4 folds for each of 1 candidates, totalling 4 fits
We are fitting!
Doc_TermsWe are fitting!
We are fitting!
We are fitting!
We are fitting!
Doc_TermsDoc_TermsDoc_TermsDoc_Terms



(259, 13537)
(261, 14138)
(262, 14058)
(262, 13356)
Doc_Prof
Doc_Prof
Doc_Prof
Doc_Prof
(259, 5) <type 'numpy.ndarray'>
(261, 5) <type 'numpy.ndarray'>
(262, 5) <type 'numpy.ndarray'>
(262, 5) <type 'numpy.ndarray'>
Doc_Term
Doc_Term
Doc_Term
Doc_Term
(259, 13537) <class 'scipy.sparse.csr.csr_matrix'>
(261, 14138) <class 'scipy.sparse.csr.csr_matrix'>
(262, 14058) <class 'scipy.sparse.csr.csr_matrix'>
(262, 13356) <class 'scipy.sparse.csr.csr_matrix'>
Term_Prof
Term_Prof
Term_Prof
Term_Prof
(13537, 5) <type 'numpy.ndarray'>
(14138, 5) <type 'numpy.ndarray'>
(14058, 5) <type 'numpy.ndarray'>
(13356, 5) <type 'numpy.ndarray'>
Random Term_Prof
Random Term_Prof
Random Term_Prof
Random Term_Prof
[ 0.          0.44151325  0.06550793  0.49297881  0.        ]
[ 0.          0.72245332  0.2775466

[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   13.3s finished


We are fitting!
[[ 15.81818182   3.10714286   2.4          5.4375      69.6       ]]We are fitting!
We are fitting!
We are fitting!
We are fitting!
[[ 16.1875       3.08333333   2.39814815   5.39583333  86.33333333]][[ 16.3125       3.10714286   2.39449541   5.4375      65.25      ]][[ 15.41176471   3.11904762   2.40366972   5.45833333  65.5       ]][[ 15.41176471   3.11904762   2.40366972   5.45833333  65.5       ]]



We are transforming!
We are transforming!
We are transforming!
We are transforming!
Doc_profDoc_profDoc_profDoc_prof



(259, 5) <type 'numpy.ndarray'>
(261, 5) <type 'numpy.ndarray'>
(262, 5) <type 'numpy.ndarray'>
(262, 5) <type 'numpy.ndarray'>
We are transforming!
We are transforming!
We are transforming!
We are transforming!
Doc_profDoc_profDoc_profDoc_prof



(89, 5) <type 'numpy.ndarray'>
(87, 5) <type 'numpy.ndarray'>
(86, 5) <type 'numpy.ndarray'>
(86, 5) <type 'numpy.ndarray'>

We are transforming!
Doc_prof
(348, 5) <type 'numpy.ndarray'>
0.454022988506
Pipeli

[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    4.0s finished


We are fitting!
Doc_TermsWe are fitting!
We are fitting!
We are fitting!
We are fitting!
Doc_TermsDoc_TermsDoc_TermsDoc_Terms



(259, 13537)
(261, 14138)
(262, 14058)
(262, 13356)
Doc_Prof
Doc_Prof
Doc_Prof
Doc_Prof
(259, 5) <type 'numpy.ndarray'>
(261, 5) <type 'numpy.ndarray'>
(262, 5) <type 'numpy.ndarray'>
(262, 5) <type 'numpy.ndarray'>
Doc_Term
Doc_Term
Doc_Term
Doc_Term
(259, 13537) <class 'scipy.sparse.csr.csr_matrix'>
(261, 14138) <class 'scipy.sparse.csr.csr_matrix'>
(262, 14058) <class 'scipy.sparse.csr.csr_matrix'>
(262, 13356) <class 'scipy.sparse.csr.csr_matrix'>
Term_Prof
Term_Prof
Term_Prof
Term_Prof
(13537, 5) <type 'numpy.ndarray'>
(14138, 5) <type 'numpy.ndarray'>
(14058, 5) <type 'numpy.ndarray'>
(13356, 5) <type 'numpy.ndarray'>
Random Term_Prof
Random Term_Prof
Random Term_Prof
Random Term_Prof
[ 0.          0.44151325  0.06550793  0.49297881  0.        ]
[ 0.          0.72245332  0.27754668  0.          0.        ]
[ 0.          0.34643333  0.17271636  0.4808503

[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   16.8s finished


In [9]:
predictions = []
for model in trained_models:
    predict = model.predict(X_cv)
    predictions.append(predict)
    acc = accuracy_score(y_cv, predict)
    conf = confusion_matrix(y_cv, predict, labels=list(set(y)))
    print('Accuracy : {}'.format(acc))
    print('Confusion matrix :\n {}'.format(conf))

We are transforming!
Doc_Terms
(88, 16756) <class 'scipy.sparse.csr.csr_matrix'>
SOA Transform:
Doc_prof
(88, 5) <type 'numpy.ndarray'>
[ 3.1563402   3.32837876  3.16486118  2.96139769  1.89792746]
Len Voc: 16756
Accuracy : 0.534090909091
Confusion matrix :
 [[15 12  1  0  0]
 [ 6 29  2  0  0]
 [ 4 10  2  0  0]
 [ 3  2  0  1  0]
 [ 0  1  0  0  0]]
We are transforming!
Doc_prof
(88, 5) <type 'numpy.ndarray'>
Accuracy : 0.511363636364
Confusion matrix :
 [[ 9 17  0  2  0]
 [ 2 33  0  2  0]
 [ 1 14  1  0  0]
 [ 2  2  0  2  0]
 [ 0  1  0  0  0]]
We are transforming!
Doc_Terms
(88, 16756) <class 'scipy.sparse.csr.csr_matrix'>
SOA Transform:
Doc_prof
(88, 5) <type 'numpy.ndarray'>
[ 3.1563402   3.32837876  3.16486118  2.96139769  1.89792746]
Len Voc: 16756
We are transforming!
Doc_prof
(88, 5) <type 'numpy.ndarray'>
Accuracy : 0.568181818182
Confusion matrix :
 [[15 12  1  0  0]
 [ 5 32  0  0  0]
 [ 5  9  2  0  0]
 [ 3  2  0  1  0]
 [ 0  1  0  0  0]]


In [30]:
predictions[2]

array(['35-49', '25-34', '25-34', '25-34', '35-49', '50-64', '35-49',
       '35-49', '35-49', '25-34', '35-49', '25-34', '50-64', '35-49',
       '35-49', '25-34', '25-34', '35-49', '35-49', '35-49', '25-34',
       '25-34', '35-49', '25-34', '25-34', '35-49', '35-49', '35-49',
       '35-49', '35-49', '35-49', '25-34', '35-49', '25-34', '25-34',
       '25-34', '35-49', '25-34', '35-49', '35-49', '25-34', '25-34',
       '35-49', '35-49', '25-34', '25-34', '35-49', '35-49', '35-49',
       '25-34', '50-64', '25-34', '35-49', '25-34', '35-49', '25-34',
       '25-34', '25-34', '25-34', '35-49', '35-49', '25-34', '25-34',
       '18-24', '35-49', '35-49', '25-34', '35-49', '50-64', '25-34',
       '35-49', '35-49', '25-34', '50-64', '35-49', '35-49', '35-49',
       '35-49', '18-24', '35-49', '35-49', '35-49', '25-34', '25-34',
       '25-34', '18-24', '50-64', '50-64', '35-49', '25-34', '35-49',
       '35-49', '25-34', '25-34', '35-49', '35-49', '25-34', '25-34',
       '25-34', '25-

### 3grams + soa + Soac Features

In [62]:
#feature_names = grid_search.best_estimator_.steps[0][1].__dict__['transformer_list'][0][1].get_feature_names()
#print len(set(y))
feature_names = []
soa_feat_names = ["soa_prob_"+str(i) for i in range(0, len(set(y)))]
soac_feat_names = ["soac_prob_"+str(i) for i in range(0, len(set(y)))]
#feature_names += soa_feat_names
feature_names += soac_feat_names
print len(feature_names)
feature_names = [feat.encode('utf-8') for feat in feature_names]

5


In [63]:
a = grid_search.best_estimator_.steps[0][1]
#print a.transform(X).shape

In [64]:
import pandas as pd
data = pd.DataFrame(a.transform(X), columns=feature_names)
data["class"] = y
print(data.describe())

We are transforming!
Doc_prof
(436, 5) <type 'numpy.ndarray'>
       soac_prob_0  soac_prob_1  soac_prob_2  soac_prob_3  soac_prob_4
count   436.000000   436.000000   436.000000   436.000000   436.000000
mean      0.334800     0.333926     0.323147     0.335459     0.333121
std       0.245069     0.370890     0.390482     0.307089     0.231141
min       0.000000     0.000000     0.000000     0.000000     0.000000
25%       0.189950     0.000000     0.000000     0.141166     0.190086
50%       0.289293     0.261512     0.263432     0.289500     0.284627
75%       0.428288     0.512928     0.523927     0.448032     0.394424
max       1.692613     2.068010     3.497002     2.356757     1.905468


In [112]:
soacc = a.transformer_list[0][1]
voc = soacc.counter.vocabulary_
print 'Voc: ' + str(len(voc))
print soacc.term_table.shape
#terms= ['marriage', 'pension']
#graph_matrix = numpy.zeros([len(terms), soacc.term_table.shape[1]])
j = 0 
for term, index in voc.iteritems():
    l = list(soacc.term_table[index,:])
    if l.index(min(l))==3 and  min(l)<0.02 and min(l)!=0:
        print term
        print l
        j += 1
    if j==1000:
        break

Voc: 125532
(125532, 5)
fic
[0.2043907252514684, 0.28704097801964457, 0.30305073020109663, 0.012805472795373671, 0.19271209373241674]
pinoy
[0.20463188045707323, 0.28737965007037575, 0.29925827359601481, 0.015790726246659309, 0.19293946962987688]
hau
[0.20482151808954271, 0.28764597219153065, 0.29627599299366542, 0.018138245121083925, 0.19311827160417727]
directmarketing
[0.20405919797315172, 0.26927285675575857, 0.32042252787399861, 0.013845907887149334, 0.19239950950994181]
inspector
[0.2041459623102034, 0.2866972393595012, 0.30689992280281309, 0.009775559286662806, 0.19248131624081946]
revision
[0.20477137581790666, 0.28757555369933724, 0.29706454072077326, 0.017517535362481074, 0.19307099439950182]
producción
[0.20435280604767755, 0.27848828975660639, 0.30961943253292062, 0.01486313047808446, 0.19267634118471083]
bocetos
[0.20352640402345179, 0.28163933767385241, 0.31958591190228791, 0.003351187657352997, 0.19189715874305496]
abbe
[0.20416766387170535, 0.27359334048821976, 0.315787

In [113]:
import plotly.plotly as py
import plotly.graph_objs as go
import numpy
py.sign_in('Bogas', '9s60rarm2w')
soacc = a.transformer_list[0][1]
voc = soacc.counter.vocabulary_
print 'Voc: ' + str(len(voc))
print soacc.term_table.shape
terms= ['dreamjob','lol', 'mortgage', 'booksellers', 'juvenile']
graph_matrix = numpy.zeros([len(terms), soacc.term_table.shape[1]])
j = 0
for term in terms:
    idx = voc[term]
    print term
    print soacc.term_table[idx,:]
    graph_matrix[j, :] = soacc.term_table[idx,:]
    j += 1
    #plt.bar(numpy.arange(soacc.term_table.shape[1]), soacc.term_table[idx,:], color='r')
    #plt.show()

data = []
names = sorted(list(set(y)))
for i in range(0, soacc.term_table.shape[1]):
    data.append(
        go.Bar(
        x=terms,
        y=graph_matrix[:, i],
        name=names[i]
    )
    )
layout = go.Layout(
    barmode='group'
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)
#plot_url = py.plot(fig, filename='grouped-bar')

Voc: 125532
(125532, 5)
dreamjob
[ 0.01265381  0.26222892  0.31307008  0.22406249  0.18798469]
lol
[ 0.19276541  0.11180198  0.27522844  0.2184271   0.20177707]
mortgage
[ 0.2011231   0.2456243   0.14738652  0.20176227  0.2041038 ]
booksellers
[ 0.20404603  0.2865569   0.3084714   0.00853856  0.1923871 ]
juvenile
[ 0.19876243  0.27913675  0.27862587  0.22337202  0.02010292]


In [10]:
import pandas as pd
from numpy.random import randint
import matplotlib.pyplot as plt

grouped = data.groupby('class')
rowlength = grouped.ngroups/2                         # fix up if odd number of groups
fig, axs = plt.subplots(figsize=(9,4), 
                        nrows=2, ncols=rowlength,     # fix as above
                        gridspec_kw=dict(hspace=0.4)) # Much control of gridspec

targets = zip(grouped.groups.keys(), axs.flatten())
print targets
grouped.get_group('18-24').hist(alpha=0.4)
#for i, (key, ax) in enumerate(targets):
#    ax.plot(grouped.get_group(key))
#    ax.set_title('a=%s'%str(key))
#ax.legend()
#plt.show()

[('25-34', <matplotlib.axes._subplots.AxesSubplot object at 0x7f62e40e63d0>), ('35-49', <matplotlib.axes._subplots.AxesSubplot object at 0x7f62c6f0c290>), ('50-64', <matplotlib.axes._subplots.AxesSubplot object at 0x7f62c6eac590>), ('18-24', <matplotlib.axes._subplots.AxesSubplot object at 0x7f62c6ebddd0>)]


array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f62c6e33390>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f62c6e05c10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f62c6d8c310>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f62c6d02290>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f62c6cd9790>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f62c6c4f810>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f62c6bad110>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f62c6b26350>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f62c6bd6990>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f62c6a799d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f62c69f4950>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f62c69cfe50>]], dtype=object)

In [None]:
grouped = data.groupby('class')
grouped.mean().T

In [77]:
### BAR PLOTS OF MEAN VALUE OF FEATURES FOR EACH CLASS ######

grouped = data.groupby('class')
plt.figure()
grouped.mean().T.plot(kind='bar', figsize=(60,10))
plt.savefig('test1.png')
plt.show()

In [None]:
##### Distribution over a feature for each class #####

In [None]:
import matplotlib.pyplot as plt
grouped = data.groupby('class')
import numpy
ncol = 4 # pick one dimension
nrow = (len(feature_names)+ ncol-1) / ncol # make sure enough subplots
#fig, ax = plt.subplots(nrows=nrow, ncols=ncol, figsize=(6,6)) # create the axes
j = 0
for key in list(data.columns.values):
#    ix = numpy.unravel_index(j, ax.shape)
#    print ix
    print key
    if key!='class':
        j += 1
        plt.figure(j, figsize=(10,10))
        grouped[key].plot(kind='kde', alpha=0.8, legend=grouped.groups.keys(), title=key)
    #g = grouped[key]
    #print grouped[key].mean()
    #if j==1:
    #    tmp = g.mean()
    #else:
    #    print g.mean()
    #    tmp.append(g.mean())
    #print tmp
        plt.show()
    #if j==2:
    #    break
#tmp
    #break
    #ax[ix] = grouped[key].plot(kind='kde', alpha=0.4, legend=grouped.groups.keys())
    #break
#for key in grouped.keys:
#    grouped[key].plot(kind='kde', alpha=0.4, legend=grouped.groups.keys())
#for key in grouped.groups.keys():
#    b = grouped.get_group(key)
#    b.plot('kin')

In [None]:
import numpy
ncol = 4 # pick one dimension
nrow = (len(feature_names)+ ncol-1) / ncol # make sure enough subplots
fig, ax = plt.subplots(nrows=nrow, ncols=ncol) # create the axes
j = 0
for i in feature_names: 
    ix = numpy.unravel_index(j, ax.shape)
    #print ix
    j += 1
    ax[ix] = data.groupby('class').i.hist(alpha=0.4)   # go over a linear list of data # compute an appropriate index (1d or 2d)
    #feat = feature_names[i]
    #data.groupby('class').feat.hist(alpha=0.4, ax=ax[i])
import matplotlib.pyplot as plt
#%matplotlib outline
plt.savefig('CameraEvolution.png', bbox_inches='tight')
plt.show()

In [None]:
clf = grid_search.best_estimator_.steps[1][1]
#import pydot
import pyparsing

#reload(pydot)

In [None]:
import pprint, numpy
from operator import itemgetter

feat_importance = zip(list(numpy.array(feature_names)[numpy.nonzero(clf.feature_importances_)]), list(clf.feature_importances_[numpy.nonzero(clf.feature_importances_)]))
feat_importance = sorted(feat_importance, key=itemgetter(1))[::-1]
feat_importance
#for i in zip(list(numpy.array(feature_names)[numpy.nonzero(clf.feature_importances_)]), list(clf.feature_importances_([numpy.nonzero(clf.feature_importances_)]))):
#    i

In [None]:
>>> with open("iris.dot", 'w') as f:
    f = tree.export_graphviz(clf, out_file=f, feature_names=feature_names,
                         filled=True, rounded=True,  
                         special_characters=True)
#>>> import os
#>>> os.unlink('iris.dot')

In [None]:
>>> from sklearn.externals.six import StringIO
from sklearn import tree
import pydot
>>> from IPython.display import Image  
>>> dot_data = StringIO()  
>>> tree.export_graphviz(clf,  out_file=dot_data,
                         feature_names=feature_names,
                         filled=True, rounded=True,  
                         special_characters=True)  
>>> graph = pydot.graph_from_dot_data(dot_data.getvalue())  
#>>> Image(graph.create_png())   
>>> graph.write_pdf("iris.pdf") 