# Notebook para o PAN - Atribuição Autoral - 2018

In [1]:
%matplotlib inline
#python basic libs
from __future__ import print_function

from tempfile import mkdtemp
from shutil import rmtree
import os;
from os.path import join as pathjoin;

import re;
import glob;
import json;
import codecs;
from collections import defaultdict;
import pprint;


from pprint import pprint
from time import time
import logging


#data analysis libs
import numpy as np;
import pandas as pd;
import matplotlib.pyplot as plt;
import random;

#machine learning libs
#feature extraction
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#preprocessing and transformation
from sklearn.preprocessing import normalize, MaxAbsScaler, MinMaxScaler;
from sklearn.preprocessing import LabelBinarizer;
from sklearn.decomposition import PCA;
from sklearn.metrics.pairwise import cosine_similarity;


from sklearn.base import BaseEstimator, ClassifierMixin

#classifiers
from sklearn.svm import LinearSVC, SVC
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

from sklearn.feature_selection import RFE,SelectFpr,SelectPercentile, chi2;

#
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

#model evaluation
from sklearn.model_selection import train_test_split;
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score;


In [2]:
import seaborn as sns;
sns.set(color_codes=True);
from pandas.plotting import scatter_matrix

In [3]:
# Record the platform and the library versions this notebook was executed with.
import platform
import scipy
import sklearn

print(platform.platform())
for library_name, module in (("NumPy", np),
                             ("SciPy", scipy),
                             ("Scikit-Learn", sklearn),
                             ("seaborn", sns)):
    print(library_name, module.__version__)

Darwin-17.5.0-x86_64-i386-64bit
NumPy 1.14.2
SciPy 1.0.1
Scikit-Learn 0.19.1
seaborn 0.8.1


### paths configuration

In [4]:
# Root folder holding the corpus and the outputs. It defaults to the original
# location and can be overridden, without editing the notebook, through the
# PAN18_BASE_DIR environment variable.
baseDir = os.environ.get(
    'PAN18_BASE_DIR',
    '/Users/joseeleandrocustodio/Dropbox/mestrado/02 - Pesquisa/code')

inputDir = pathjoin(baseDir, 'pan18aa')
outputDir = pathjoin(baseDir, 'out', "oficial")
# outputDir sits two levels below baseDir: os.mkdir would raise when the
# intermediate 'out' folder is missing, so create the whole chain instead.
if not os.path.exists(outputDir):
    os.makedirs(outputDir)

## loading the dataset

In [5]:
def readCollectionsOfProblems(path):
    """Read the index of attribution problems of a collection.

    Parameters
    ----------
    path : str
        Folder that contains ``collection-info.json``.

    Returns
    -------
    list of dict
        One dict per entry of the JSON file, with the keys ``'problem'``,
        ``'language'`` and ``'encoding'`` copied from the entry's
        ``'problem-name'``, ``'language'`` and ``'encoding'`` fields.
    """
    # pathjoin copes with an empty path or one already ending in a separator
    infocollection = pathjoin(path, 'collection-info.json')
    # decode explicitly as UTF-8 instead of relying on the platform default
    with codecs.open(infocollection, 'r', encoding='utf-8') as f:
        entries = json.load(f)
    return [
        {
            'problem': attrib['problem-name'],
            'language': attrib['language'],
            'encoding': attrib['encoding'],
        }
        for attrib in entries
    ]

In [6]:
problems = readCollectionsOfProblems(inputDir);

In [7]:
problems[0]

{'encoding': u'UTF-8', 'language': u'en', 'problem': u'problem00001'}

In [8]:
def readProblem(path, problem):
    """Read the description of a single attribution problem.

    Parameters
    ----------
    path : str
        Folder of the collection.
    problem : str
        Name of the problem sub-folder, which contains ``problem-info.json``.

    Returns
    -------
    unk_folder : str
        Value of the ``'unknown-folder'`` field.
    candidates : list
        The ``'author-name'`` of every entry of ``'candidate-authors'``, in
        file order.
    """
    # pathjoin copes with an empty path or one already ending in a separator
    infoproblem = pathjoin(path, problem, 'problem-info.json')
    # decode explicitly as UTF-8 instead of relying on the platform default
    with codecs.open(infoproblem, 'r', encoding='utf-8') as f:
        fj = json.load(f)
    unk_folder = fj['unknown-folder']
    candidates = [attrib['author-name'] for attrib in fj['candidate-authors']]
    return unk_folder, candidates

In [9]:
def read_files(path, label):
    """Read every ``*.txt`` file of the ``label`` sub-folder of ``path``.

    Parameters
    ----------
    path : str
        Parent folder.
    label : str
        Sub-folder name; it is also used as the class label of its texts.

    Returns
    -------
    list of tuple
        ``(text, label, file_name)`` for each file, ordered by file path.
        The text is decoded as UTF-8 and ``file_name`` is the base name.
    """
    texts = []
    # glob returns the files in a filesystem-dependent order; sorting keeps the
    # document order from varying between machines.
    for filename in sorted(glob.glob(pathjoin(path, label, '*.txt'))):
        # the context manager closes the file even if reading/decoding fails
        with codecs.open(filename, 'r', encoding='utf-8') as f:
            texts.append((f.read(), label, os.path.basename(filename)))
    return texts

In [10]:
# Attach the candidate (known-author) and unknown texts to every problem dict.
for index, problem in enumerate(problems):
    problem_dir = pathjoin(inputDir, problem['problem'])
    unk_folder, candidates_folder = readProblem(inputDir, problem['problem'])

    problem['candidates_folder_count'] = len(candidates_folder)
    # one flat list of (text, label, filename) tuples covering all candidates
    problem['candidates'] = [
        document
        for candidate in candidates_folder
        for document in read_files(problem_dir, candidate)
    ]
    problem['unknown'] = read_files(problem_dir, unk_folder)

In [11]:
pd.DataFrame(problems)

Unnamed: 0,candidates,candidates_folder_count,encoding,language,problem,unknown
0,"[(graceful ones.\n\n""One more,"" Marvelous said...",20,UTF-8,en,problem00001,"[(after all, his best friends. And what in the..."
1,"[(a mission.""\n\nJensen just raises an eyebrow...",5,UTF-8,en,problem00002,"[(“Potter was attractive,” Draco thought, sigh..."
2,[(qui l'avait tué mais tout était de la faute ...,20,UTF-8,fr,problem00003,[(son réveil. Sa main pulse et Draco frotte l'...
3,[(. Le canapé est vide et lorsqu'il passe deva...,5,UTF-8,fr,problem00004,"[(abasourdie.\n\nTout d'abord, elle crut que s..."
4,"[(Eppure lui la mappa l’aveva stampata, dannaz...",20,UTF-8,it,problem00005,[(– Oh. Cazzo.\nSirius era così sconvolto che ...
5,[(Yato ha trovato una lettera sul suo comodino...,5,UTF-8,it,problem00006,"[(così la tua vista, Moony?\n– Cercavo di esse..."
6,[(zmienił zdanie. Niech się stworzonko pobawi....,20,UTF-8,pl,problem00007,"[(dawniej pełna radości i ciepła, a teraz wiec..."
7,"[(Słowem, które Sherlock najczęściej słyszał w...",5,UTF-8,pl,problem00008,"[(, uderzającego o żebra niczym dzwon- niemal ..."
8,[(pero no lo ama como ama a Guignol –explicó e...,20,UTF-8,sp,problem00009,[(–La nariz puntiaguda del elfo casi rozaba el...
9,"[(incapaz de señalar un momento exacto, un pun...",5,UTF-8,sp,problem00010,[(tan parecidas hizo que su trasero latiese de...


In [12]:
#*******************************************************************************************************
import warnings
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.preprocessing import LabelEncoder


def eval_measures(gt, pred):
    """Score predicted attributions against the ground truth.

    A text that has no entry in `pred`, or whose predicted label is not one
    of the ground-truth labels, is counted as the artificial author `<UNK>`.
    The macro-averaged scores only average over the labels that occur in the
    ground truth.

    Parameters
    ----------
    gt : dict
        Ground truth: text file name (e.g. `unknown00002.txt`) mapped to the
        author label (e.g. `candidate00003`).
    pred : dict
        Predictions, with the same layout as `gt`.

    Returns
    -------
    f1 : float
        Macro-averaged F1-score
    precision : float
        Macro-averaged precision
    recall : float
        Macro-averaged recall
    accuracy : float
        Accuracy over all the ground-truth texts
    """
    # '<UNK>' is part of the vocabulary so that fallback labels can be encoded
    encoder = LabelEncoder().fit(['<UNK>'] + list(gt.values()))

    # a fixed (sorted) order of the texts keeps both label lists aligned
    ordered_ids = sorted(gt)
    gold_authors = [gt[text_id] for text_id in ordered_ids]

    silver_authors = []
    for text_id in ordered_ids:
        guess = pred.get(text_id, '<UNK>')      # missing attribution
        if guess not in encoder.classes_:       # label absent from the truth
            guess = '<UNK>'
        silver_authors.append(guess)

    gold_author_ints = encoder.transform(gold_authors)
    silver_author_ints = encoder.transform(silver_authors)

    # shared keyword arguments of the three macro-averaged scores
    macro_options = dict(labels=list(set(gold_author_ints)), average='macro')

    # sklearn warnings are silenced while scoring
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        f1 = f1_score(gold_author_ints, silver_author_ints, **macro_options)
        precision = precision_score(gold_author_ints, silver_author_ints,
                                    **macro_options)
        recall = recall_score(gold_author_ints, silver_author_ints,
                              **macro_options)
        accuracy = accuracy_score(gold_author_ints, silver_author_ints)

    return f1, precision, recall, accuracy


In [13]:
def evaluate(ground_truth_file, predictions_file):
    """Compute the evaluation measures of a single attribution problem.

    Parameters
    ----------
    ground_truth_file : str
        JSON file whose ``'ground_truth'`` list holds entries with the keys
        ``'unknown-text'`` and ``'true-author'``.
    predictions_file : str
        JSON list of entries with the keys ``'unknown-text'`` and
        ``'predicted-author'``.

    Returns
    -------
    tuple
        ``(f1, precision, recall, accuracy)`` as returned by `eval_measures`.
    """
    with open(ground_truth_file, 'r') as f:
        gt = {
            attrib['unknown-text']: attrib['true-author']
            for attrib in json.load(f)['ground_truth']
        }

    pred = {}
    with open(predictions_file, 'r') as f:
        for attrib in json.load(f):
            # only the first prediction of a text is kept
            pred.setdefault(attrib['unknown-text'], attrib['predicted-author'])

    return eval_measures(gt, pred)

In [14]:
from sklearn.base import BaseEstimator
from scipy.sparse import issparse


class DenseTransformer(BaseEstimator):
    """Pipeline step that turns a sparse matrix into a dense array."""

    def __init__(self, return_copy=True):
        # return_copy only matters for inputs that are already dense
        self.return_copy = return_copy
        self.is_fitted = False

    def fit(self, X, y=None):
        """Nothing is learned; only flags the transformer as fitted.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Ignored.
        y : array-like, shape = [n_samples] (default: None)
            Ignored.

        Returns
        ---------
        self
        """
        self.is_fitted = True
        return self

    def transform(self, X, y=None):
        """Densify the input.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Data to convert.
        y : array-like, shape = [n_samples] (default: None)
            Ignored.

        Returns
        ---------
        X_dense : ``X.toarray()`` for a sparse input; otherwise a copy of X
            when ``return_copy`` is true, or X itself when it is false.
        """
        if issparse(X):
            return X.toarray()
        return X.copy() if self.return_copy else X

    def fit_transform(self, X, y=None):
        """Same result as `transform`; `fit` is not called.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Data to convert.
        y : array-like, shape = [n_samples] (default: None)
            Passed on to `transform`.

        Returns
        ---------
        X_dense : dense version of the input X array.
        """
        return self.transform(X, y)

### examinando o parâmetro min_df isoladamente

In [15]:
def runML(problem):
    """Grid-search the vectorizer's ``min_df`` for one problem and score it.

    A char n-gram TF-IDF -> dense -> MaxAbsScaler -> PCA -> multinomial
    logistic-regression pipeline is grid-searched (5-fold CV, macro F1) over
    ``vect__min_df``. The fitted search then labels the unknown texts, the
    answers are written to ``answers-<problem>.json`` inside ``outputDir`` and
    compared with the problem's ``ground-truth.json`` found under ``inputDir``.

    Parameters
    ----------
    problem : dict
        Needs the keys ``'problem'``, ``'language'``, ``'candidates'`` and
        ``'unknown'``; the last two are sequences of
        ``(text, label, filename)`` tuples. The key ``'training_docs_size'``
        is added to it as a side effect.

    Returns
    -------
    tuple
        ``(summary, cv_results, best_parameters)``: a dict with the corpus
        statistics and the rounded test scores, ``grid_search.cv_results_``,
        and the ``get_params()`` dict of the best estimator.
    """
    print ("\nProblem: %s,  language: %s, " %(problem['problem'],problem['language']))

    train_docs, train_labels, _   = zip(*problem['candidates'])
    problem['training_docs_size'] = len(train_docs)
    test_docs, _, test_filename   = zip(*problem['unknown'])

    pipeline = Pipeline([
        ('vect',   TfidfVectorizer(analyzer='char',
                                   min_df=0.05,
                                   max_df=1.0,
                                   norm='l1',
                                   ngram_range=(3,5),
                                   sublinear_tf=True,
                                   smooth_idf=True,
                                   lowercase =False)),
        ('dense',  DenseTransformer()),
        ('scaler', MaxAbsScaler()),
        ('transf', PCA(0.999)),
        ('clf', LogisticRegression(random_state=0,multi_class='multinomial', solver='newton-cg')),
    ])

    # only min_df is explored here; every extra entry multiplies the number of
    # fits performed by the search
    parameters = {
        'vect__min_df':(2,0.01,0.05,0.1)
    }

    grid_search = GridSearchCV(pipeline,
                               parameters,
                               cv=5,
                               n_jobs=-1,
                               verbose=False,
                               scoring='f1_macro')

    print("Performing grid search...")
    t0 = time()
    grid_search.fit(train_docs, train_labels)
    print("done in %0.3fs" % (time() - t0))

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    # only the unknown texts are predicted: the training-set predictions that
    # used to be computed here were never used
    test_pred = grid_search.predict(test_docs)

    # Writing output file
    out_data = [
        {'unknown-text': filename, 'predicted-author': author}
        for filename, author in zip(test_filename, test_pred)
    ]
    answerFile = pathjoin(outputDir,'answers-'+problem['problem']+'.json')
    with open(answerFile, 'w') as f:
        json.dump(out_data, f, indent=4)

    # evaluation of the answers written above against the ground truth
    f1,precision,recall,accuracy=evaluate(
                pathjoin(inputDir, problem['problem'], 'ground-truth.json'),
                answerFile)

    return {
                'problem-name'  :       problem['problem'],
                "language"      :       problem['language'],
                'AuthorCount'   :       len(set(train_labels)),
                "train_doc_size":       len(train_docs),
                "train_caract_per_doc": sum([len(l) for l in train_docs])/len(train_docs),
                "test_doc_size" :       len(test_docs),
                "test_caract_per_doc":  sum([len(l) for l in test_docs])/len(test_docs),

                'macro-f1'       : round(f1,3),
                'macro-precision': round(precision,3),
                'macro-recall'   : round(recall,3),
                'micro-accuracy' : round(accuracy,3),

        }, grid_search.cv_results_, best_parameters

In [189]:
# Run the search on every problem, keeping its three outputs in parallel lists.
result, cv_result, best_parameters = [], [], []
for problem in problems:
    summary, cv_table, params = runML(problem)
    params['problem'] = problem['problem']  # tag the parameters with their problem
    result.append(summary)
    cv_result.append(cv_table)
    best_parameters.append(params)


Problem: problem00001,  language: en, 
Performing grid search...
done in 38.968s
Best score: 0.769
Best parameters set:
	vect__min_df: 0.01

Problem: problem00002,  language: en, 
Performing grid search...
done in 29.814s
Best score: 0.874
Best parameters set:
	vect__min_df: 0.1

Problem: problem00003,  language: fr, 
Performing grid search...
done in 89.584s
Best score: 0.775
Best parameters set:
	vect__min_df: 0.01

Problem: problem00004,  language: fr, 
Performing grid search...
done in 31.481s
Best score: 0.903
Best parameters set:
	vect__min_df: 0.01

Problem: problem00005,  language: it, 
Performing grid search...
done in 91.047s
Best score: 0.743
Best parameters set:
	vect__min_df: 2

Problem: problem00006,  language: it, 
Performing grid search...
done in 33.172s
Best score: 0.970
Best parameters set:
	vect__min_df: 2

Problem: problem00007,  language: pl, 
Performing grid search...
done in 135.618s
Best score: 0.811
Best parameters set:
	vect__min_df: 0.01

Problem: problem00

In [190]:
# min_df selected by the search for each problem
min_df_columns = ['problem', 'vect__min_df']
pd.DataFrame(best_parameters)[min_df_columns]

Unnamed: 0,problem,vect__min_df
0,problem00001,0.01
1,problem00002,0.1
2,problem00003,0.01
3,problem00004,0.01
4,problem00005,2.0
5,problem00006,2.0
6,problem00007,0.01
7,problem00008,0.1
8,problem00009,0.01
9,problem00010,2.0


### analisando os demais parâmetros

In [16]:
def runML(problem):
    """Grid-search n-gram range, TF scaling, norm and PCA size for one problem.

    A char n-gram TF-IDF -> dense -> MaxAbsScaler -> PCA -> multinomial
    logistic-regression pipeline is grid-searched (3-fold CV, macro F1) over
    the vectorizer's ``ngram_range``, ``sublinear_tf`` and ``norm`` and the
    PCA's ``n_components``. The fitted search then labels the unknown texts,
    the answers are written to ``answers-<problem>.json`` inside ``outputDir``
    and compared with the problem's ``ground-truth.json`` found under
    ``inputDir``.

    NOTE: this function shares its name with the ``min_df`` search defined
    earlier in the notebook; whichever definition ran last is the one in use.

    Parameters
    ----------
    problem : dict
        Needs the keys ``'problem'``, ``'language'``, ``'candidates'`` and
        ``'unknown'``; the last two are sequences of
        ``(text, label, filename)`` tuples. The key ``'training_docs_size'``
        is added to it as a side effect.

    Returns
    -------
    tuple
        ``(summary, cv_results, best_parameters)``: a dict with the corpus
        statistics and the rounded test scores, ``grid_search.cv_results_``,
        and the ``get_params()`` dict of the best estimator.
    """
    print ("\nProblem: %s,  language: %s, " %(problem['problem'],problem['language']))

    train_docs, train_labels, _   = zip(*problem['candidates'])
    problem['training_docs_size'] = len(train_docs)
    test_docs, _, test_filename   = zip(*problem['unknown'])

    pipeline = Pipeline([
        ('vect',   TfidfVectorizer(analyzer='char',
                                   min_df=0.01,
                                   max_df=1.0,
                                   norm='l1',
                                   lowercase =False,
                                   sublinear_tf=True)),
        ('dense',  DenseTransformer()),
        ('scaler', MaxAbsScaler()),
        ('transf', PCA()),
        ('clf', LogisticRegression(random_state=0,multi_class='multinomial', solver='newton-cg')),
    ])

    # the search is exhaustive: every extra value multiplies the number of
    # fits performed
    parameters = {
        'vect__ngram_range':((2,3),(2,4),(2,5),(3,5)),
        'vect__sublinear_tf':(True, False),
        'vect__norm':('l1','l2'),
        'transf__n_components': (0.1,0.25,0.5,0.75,0.9,0.99),
    }

    grid_search = GridSearchCV(pipeline,
                               parameters,
                               cv=3,
                               n_jobs=-1,
                               verbose=False,
                               scoring='f1_macro')

    print("Performing grid search...")
    t0 = time()
    grid_search.fit(train_docs, train_labels)
    print("done in %0.3fs" % (time() - t0))

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    # only the unknown texts are predicted: the training-set predictions that
    # used to be computed here were never used
    test_pred = grid_search.predict(test_docs)

    # Writing output file
    out_data = [
        {'unknown-text': filename, 'predicted-author': author}
        for filename, author in zip(test_filename, test_pred)
    ]
    answerFile = pathjoin(outputDir,'answers-'+problem['problem']+'.json')
    with open(answerFile, 'w') as f:
        json.dump(out_data, f, indent=4)

    # evaluation of the answers written above against the ground truth
    f1,precision,recall,accuracy=evaluate(
                pathjoin(inputDir, problem['problem'], 'ground-truth.json'),
                answerFile)

    return {
                'problem-name'  :       problem['problem'],
                "language"      :       problem['language'],
                'AuthorCount'   :       len(set(train_labels)),
                "train_doc_size":       len(train_docs),
                "train_caract_per_doc": sum([len(l) for l in train_docs])/len(train_docs),
                "test_doc_size" :       len(test_docs),
                "test_caract_per_doc":  sum([len(l) for l in test_docs])/len(test_docs),

                'macro-f1'       : round(f1,3),
                'macro-precision': round(precision,3),
                'macro-recall'   : round(recall,3),
                'micro-accuracy' : round(accuracy,3),

        }, grid_search.cv_results_, best_parameters

In [485]:
# Repeat the search for each problem; the i-th entry of every list belongs to
# the i-th problem.
result, cv_result, best_parameters = [], [], []
for problem in problems:
    summary, cv_table, params = runML(problem)
    result.append(summary)
    cv_result.append(cv_table)
    # remember which problem these parameters were selected for
    params['problem'] = problem['problem']
    best_parameters.append(params)


Problem: problem00001,  language: en, 
Performing grid search...
done in 658.937s
Best score: 0.833
Best parameters set:
	transf__n_components: 0.99
	vect__ngram_range: (2, 5)
	vect__norm: 'l1'
	vect__sublinear_tf: True

Problem: problem00002,  language: en, 
Performing grid search...
done in 123.030s
Best score: 0.971
Best parameters set:
	transf__n_components: 0.75
	vect__ngram_range: (2, 4)
	vect__norm: 'l1'
	vect__sublinear_tf: True

Problem: problem00003,  language: fr, 
Performing grid search...
done in 675.776s
Best score: 0.800
Best parameters set:
	transf__n_components: 0.99
	vect__ngram_range: (2, 3)
	vect__norm: 'l1'
	vect__sublinear_tf: False

Problem: problem00004,  language: fr, 
Performing grid search...
done in 143.218s
Best score: 0.854
Best parameters set:
	transf__n_components: 0.75
	vect__ngram_range: (2, 4)
	vect__norm: 'l2'
	vect__sublinear_tf: True

Problem: problem00005,  language: it, 
Performing grid search...
done in 837.817s
Best score: 0.701
Best parameter

In [486]:
# Per-problem summary table, with its columns in reporting order.
summary_columns = [
    'problem-name',
    "language",
    'AuthorCount',
    "train_doc_size", "train_caract_per_doc",
    "test_doc_size", "test_caract_per_doc",
    'macro-f1', 'macro-precision', 'macro-recall', 'micro-accuracy',
]
df = pd.DataFrame(result)[summary_columns]

In [487]:
df

Unnamed: 0,problem-name,language,AuthorCount,train_doc_size,train_caract_per_doc,test_doc_size,test_caract_per_doc,macro-f1,macro-precision,macro-recall,micro-accuracy
0,problem00001,en,20,140,4327,105,4370,0.643,0.649,0.759,0.676
1,problem00002,en,5,35,4342,21,4296,0.477,0.517,0.517,0.381
2,problem00003,fr,20,140,4492,49,4508,0.641,0.667,0.709,0.653
3,problem00004,fr,5,35,4522,21,4532,0.747,0.767,0.8,0.667
4,problem00005,it,20,140,4720,80,4787,0.481,0.483,0.601,0.6
5,problem00006,it,5,35,4847,46,4765,0.596,0.596,0.697,0.826
6,problem00007,pl,20,140,5145,103,5200,0.465,0.49,0.529,0.534
7,problem00008,pl,5,35,5049,15,5214,0.822,0.8,0.878,0.867
8,problem00009,sp,20,140,4794,117,4788,0.787,0.788,0.858,0.803
9,problem00010,sp,5,35,4955,64,4827,0.832,0.857,0.823,0.875


In [488]:
# LaTeX table of the macro-F1 per problem; runs of five spaces become one.
macro_f1_latex = df[["macro-f1"]].reset_index().to_latex(index=False)
print(macro_f1_latex.replace("     ", " "))

\begin{tabular}{rr}
\toprule
 index &  macro-f1 \\
\midrule
 0 & 0.643 \\
 1 & 0.477 \\
 2 & 0.641 \\
 3 & 0.747 \\
 4 & 0.481 \\
 5 & 0.596 \\
 6 & 0.465 \\
 7 & 0.822 \\
 8 & 0.787 \\
 9 & 0.832 \\
\bottomrule
\end{tabular}



In [489]:
# Corpus language code -> Portuguese adjective used in the result tables.
languages = dict(
    en='inglesa',
    sp='espanhola',
    it='italiana',
    pl='polonesa',
    fr='francesa',
)

In [492]:
# Stack the cv_results_ of every problem into a single frame, one row per
# tested parameter combination, tagged with the problem number (1-based) and
# its language.
cv_result2 = []  # not used in the cells shown here; kept in case later cells read it

# The per-problem frames are collected first and concatenated once:
# DataFrame.append copied the accumulated data on every iteration and no
# longer exists in pandas >= 2.0.
cv_frames = []
for i, c in enumerate(cv_result):
    temp = pd.DataFrame(c)
    temp['problem'] = i + 1
    temp['language'] = languages[problems[i]['language']]
    cv_frames.append(temp)
dfCV = pd.concat(cv_frames)

# cast the numeric score/parameter columns to float32, one column at a time
for p in ['param_transf__n_components',
    'mean_test_score','std_test_score','mean_train_score',
    'split0_test_score','split0_train_score',
    'split1_test_score','split1_train_score',
    'split2_test_score','split2_train_score']:
    dfCV[p] = dfCV[p].astype(np.float32)

# keep only the reported columns, in this order
dfCV = dfCV[[
    'problem',
    'language',
    'rank_test_score',
    'param_transf__n_components',
    'param_vect__ngram_range',
    'param_vect__sublinear_tf',
    'param_vect__norm',
    'mean_test_score',
    'std_test_score',
    'mean_train_score',

    'split0_test_score','split0_train_score',
    'split1_test_score','split1_train_score',
    'split2_test_score','split2_train_score',

    'mean_score_time',
    'mean_fit_time',
    'std_fit_time',
    'std_score_time',
    'std_train_score',
]]

# shorter names for the searched parameters (a new frame is produced rather
# than renaming in place)
dfCV = dfCV.rename(columns={
    'param_transf__n_components':'PCA_componentes',
    'param_vect__ngram_range':'ngram_range',
    'param_vect__sublinear_tf':'sublinear_tf',
    'param_vect__norm':'norm'
})

#print('\',\n\''.join(dfCV.columns))

In [493]:
dfCV.to_csv('PANAA2018_CHAR.csv', index=False)

In [17]:
dfCV = pd.read_csv('PANAA2018_CHAR.csv', na_values='')

In [18]:
# Rows ranked first by the grid search (ties included), per problem.
best_rank_columns = [
    'problem',
    'language',
    'rank_test_score',
    'mean_test_score',
    'std_test_score',
    'ngram_range',
    'sublinear_tf',
    'norm',
    'PCA_componentes',
]
sort_keys = [
    'problem',
    'mean_test_score',
    'ngram_range',
    'sublinear_tf',
    'PCA_componentes',
]
is_top_ranked = dfCV.rank_test_score == 1
# problems ascending; all the other keys descending
dfCV.loc[is_top_ranked, best_rank_columns].sort_values(
    by=sort_keys,
    ascending=[True, False, False, False, False])

Unnamed: 0,problem,language,rank_test_score,mean_test_score,std_test_score,ngram_range,sublinear_tf,norm,PCA_componentes
88,1,inglesa,1,0.833299,0.020391,"(2, 5)",True,l1,0.99
188,2,inglesa,1,0.970612,0.033934,"(3, 5)",True,l1,0.99
190,2,inglesa,1,0.970612,0.033934,"(3, 5)",True,l2,0.99
189,2,inglesa,1,0.970612,0.033934,"(3, 5)",False,l1,0.99
184,2,inglesa,1,0.970612,0.033934,"(2, 5)",True,l1,0.99
186,2,inglesa,1,0.970612,0.033934,"(2, 5)",True,l2,0.99
180,2,inglesa,1,0.970612,0.033934,"(2, 4)",True,l1,0.99
182,2,inglesa,1,0.970612,0.033934,"(2, 4)",True,l2,0.99
164,2,inglesa,1,0.970612,0.033934,"(2, 4)",True,l1,0.9
148,2,inglesa,1,0.970612,0.033934,"(2, 4)",True,l1,0.75


In [19]:
# Mean CV score: one row per (problem, language, PCA size), one column per
# (norm, sublinear_tf, ngram_range) combination.
pd.pivot_table(
    dfCV,
    values='mean_test_score',
    index=['problem', 'language', 'PCA_componentes'],
    columns=['norm', 'sublinear_tf', 'ngram_range'],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,norm,l1,l1,l1,l1,l1,l1,l1,l1,l2,l2,l2,l2,l2,l2,l2,l2
Unnamed: 0_level_1,Unnamed: 1_level_1,sublinear_tf,False,False,False,False,True,True,True,True,False,False,False,False,True,True,True,True
Unnamed: 0_level_2,Unnamed: 1_level_2,ngram_range,"(2, 3)","(2, 4)","(2, 5)","(3, 5)","(2, 3)","(2, 4)","(2, 5)","(3, 5)","(2, 3)","(2, 4)","(2, 5)","(3, 5)","(2, 3)","(2, 4)","(2, 5)","(3, 5)"
problem,language,PCA_componentes,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3
1,inglesa,0.1,0.28905,0.213501,0.135221,0.144955,0.247157,0.202389,0.111574,0.105737,0.261712,0.147346,0.157838,0.142236,0.225762,0.201633,0.17509,0.155582
1,inglesa,0.25,0.404768,0.475246,0.473339,0.432229,0.48621,0.566849,0.525181,0.573929,0.35645,0.344444,0.244823,0.245465,0.469694,0.452204,0.433352,0.397762
1,inglesa,0.5,0.587794,0.633503,0.690323,0.670391,0.709813,0.784048,0.788146,0.789694,0.530153,0.415823,0.34653,0.319832,0.697823,0.729201,0.706701,0.723929
1,inglesa,0.75,0.640255,0.734388,0.767228,0.75966,0.742721,0.792092,0.790527,0.792568,0.591655,0.600161,0.603951,0.493628,0.735527,0.77352,0.781173,0.74784
1,inglesa,0.9,0.668231,0.783061,0.811412,0.795527,0.772653,0.799745,0.813895,0.806548,0.650258,0.718389,0.614846,0.588564,0.794252,0.817925,0.792619,0.794422
1,inglesa,0.99,0.680153,0.78665,0.803554,0.80335,0.791224,0.826905,0.833299,0.826718,0.669558,0.759524,0.714898,0.734116,0.816037,0.817925,0.818844,0.826463
2,inglesa,0.1,0.484966,0.406753,0.35932,0.316871,0.380245,0.357388,0.178926,0.177547,0.357619,0.321465,0.17119,0.207446,0.380245,0.321991,0.275873,0.253968
2,inglesa,0.25,0.760476,0.564762,0.518095,0.487619,0.707143,0.648095,0.681905,0.611787,0.635374,0.43873,0.42854,0.339048,0.722313,0.649932,0.491198,0.445121
2,inglesa,0.5,0.885238,0.824286,0.802993,0.802993,0.885238,0.831837,0.852517,0.852517,0.854762,0.738231,0.643537,0.553878,0.824286,0.879184,0.810884,0.839456
2,inglesa,0.75,0.883333,0.852857,0.882993,0.882993,0.90966,0.970612,0.940136,0.940136,0.795714,0.784422,0.747279,0.492064,0.90966,0.90966,0.845034,0.882993


In [20]:
# Print a complete LaTeX table environment with the mean CV score of every
# configuration that keeps PCA_componentes >= 0.99.
# NOTE(review): the display precision is presumably what yields the 3-decimal
# figures in the generated table - confirm before removing it.
pd.options.display.precision = 3
print(u"\\begin{table}[h]\n\\centering\n\\caption{Medida F1 para os parâmetros }")

f1_by_ngram = dfCV[dfCV.PCA_componentes >= 0.99].pivot_table(
        index=['problem','language','sublinear_tf','norm'],
        columns=['ngram_range'],
        values='mean_test_score'
    )
# collapse the runs of padding spaces emitted by to_latex
print(re.sub(r'[ ]{2,}',' ', f1_by_ngram.to_latex()))
# raw string: "\l" is not a valid escape sequence, so the non-raw literal only
# worked by accident and triggers a warning on recent Python versions
print(r"\label{tab:modelocaracter}")
print(r"\end{table}")

\begin{table}[h]
\centering
\caption{Medida F1 para os parâmetros }
\begin{tabular}{llllrrrr}
\toprule
 & & & ngram\_range & (2, 3) & (2, 4) & (2, 5) & (3, 5) \\
problem & language & sublinear\_tf & norm & & & & \\
\midrule
1 & inglesa & False & l1 & 0.680 & 0.787 & 0.804 & 0.803 \\
 & & & l2 & 0.670 & 0.760 & 0.715 & 0.734 \\
 & & True & l1 & 0.791 & 0.827 & 0.833 & 0.827 \\
 & & & l2 & 0.816 & 0.818 & 0.819 & 0.826 \\
2 & inglesa & False & l1 & 0.883 & 0.940 & 0.940 & 0.971 \\
 & & & l2 & 0.883 & 0.879 & 0.940 & 0.940 \\
 & & True & l1 & 0.883 & 0.971 & 0.971 & 0.971 \\
 & & & l2 & 0.910 & 0.971 & 0.971 & 0.971 \\
3 & francesa & False & l1 & 0.800 & 0.782 & 0.772 & 0.772 \\
 & & & l2 & 0.794 & 0.761 & 0.732 & 0.724 \\
 & & True & l1 & 0.778 & 0.788 & 0.762 & 0.775 \\
 & & & l2 & 0.786 & 0.769 & 0.763 & 0.776 \\
4 & francesa & False & l1 & 0.775 & 0.800 & 0.825 & 0.825 \\
 & & & l2 & 0.744 & 0.800 & 0.854 & 0.854 \\
 & & True & l1 & 0.744 & 0.800 & 0.854 & 0.854 \\
 & & & l2 & 0.799 &