# Notebook para o PAN - Atribuição Autoral - 2018

In [2]:
%matplotlib inline
#python basic libs
from __future__ import print_function

from tempfile import mkdtemp
from shutil import rmtree
import os;
from os.path import join as pathjoin;

import re;
import glob;
import json;
import codecs;
from collections import defaultdict;
import pprint;


from pprint import pprint
from time import time
import logging


#data analysis libs
import numpy as np;
import pandas as pd;
import matplotlib.pyplot as plt;
import random;

#machine learning libs
#feature extraction
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#preprocessing and transformation
from sklearn.preprocessing import normalize, MaxAbsScaler, MinMaxScaler;
from sklearn.preprocessing import LabelBinarizer;
from sklearn.decomposition import PCA;
from sklearn.metrics.pairwise import cosine_similarity;


from sklearn.base import BaseEstimator, ClassifierMixin

#classifiers
from sklearn.svm import LinearSVC, SVC
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

from sklearn.feature_selection import RFE,SelectFpr,SelectPercentile, chi2;

#
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

#model valuation
from sklearn.model_selection import train_test_split;
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score;



In [3]:
import seaborn as sns;
from pandas.plotting import scatter_matrix

In [4]:
import platform; print(platform.platform())
print("NumPy", np.__version__)
import scipy; print("SciPy", scipy.__version__)
import sklearn; print("Scikit-Learn", sklearn.__version__)

Darwin-17.5.0-x86_64-i386-64bit
NumPy 1.14.2
SciPy 1.0.1
Scikit-Learn 0.19.1


### paths configuration

In [5]:
baseDir = '/Users/joseeleandrocustodio/Dropbox/mestrado/02 - Pesquisa/code';

inputDir= pathjoin(baseDir,'pan18aa');
outputDir= pathjoin(baseDir,'out',"oficial");
if not os.path.exists(outputDir):
    os.mkdir(outputDir);

## loading the dataset

In [6]:
def readCollectionsOfProblems(path):
    # Reading information about the collection
    infocollection = path+os.sep+'collection-info.json'
    with open(infocollection, 'r') as f:
        problems  = [
            {
                'problem': attrib['problem-name'],
                'language': attrib['language'],
                'encoding': attrib['encoding'],
            }
            for attrib in json.load(f)
            
        ]
    return problems;

In [7]:
problems = readCollectionsOfProblems(inputDir);

In [8]:
problems[0]

{'encoding': u'UTF-8', 'language': u'en', 'problem': u'problem00001'}

In [9]:
def readProblem(path, problem):
    # Reading information about the problem
    infoproblem = path+os.sep+problem+os.sep+'problem-info.json'
    candidates = []
    with open(infoproblem, 'r') as f:
        fj = json.load(f)
        unk_folder = fj['unknown-folder']
        for attrib in fj['candidate-authors']:
            candidates.append(attrib['author-name'])
    return unk_folder, candidates;

In [10]:
def read_files(path,label):
    # Reads all text files located in the 'path' and assigns them to 'label' class
    files = glob.glob(pathjoin(path,label,'*.txt'))
    texts=[]
    for i,v in enumerate(files):
        f=codecs.open(v,'r',encoding='utf-8')
        texts.append((f.read(),label, os.path.basename(v)))
        f.close()
    return texts

In [11]:
for index,problem in enumerate(problems):
    unk_folder, candidates_folder = readProblem(inputDir, problem['problem']); 
    problem['candidates_folder_count'] = len(candidates_folder);
    problem['candidates'] = [];
    for candidate in candidates_folder:
        problem['candidates'].extend(read_files(pathjoin(inputDir, problem['problem']),candidate));
    
    problem['unknown'] = read_files(pathjoin(inputDir, problem['problem']),unk_folder);    

In [12]:
pd.DataFrame(problems)

Unnamed: 0,candidates,candidates_folder_count,encoding,language,problem,unknown
0,"[(graceful ones.\n\n""One more,"" Marvelous said...",20,UTF-8,en,problem00001,"[(after all, his best friends. And what in the..."
1,"[(a mission.""\n\nJensen just raises an eyebrow...",5,UTF-8,en,problem00002,"[(“Potter was attractive,” Draco thought, sigh..."
2,[(qui l'avait tué mais tout était de la faute ...,20,UTF-8,fr,problem00003,[(son réveil. Sa main pulse et Draco frotte l'...
3,[(. Le canapé est vide et lorsqu'il passe deva...,5,UTF-8,fr,problem00004,"[(abasourdie.\n\nTout d'abord, elle crut que s..."
4,"[(Eppure lui la mappa l’aveva stampata, dannaz...",20,UTF-8,it,problem00005,[(– Oh. Cazzo.\nSirius era così sconvolto che ...
5,[(Yato ha trovato una lettera sul suo comodino...,5,UTF-8,it,problem00006,"[(così la tua vista, Moony?\n– Cercavo di esse..."
6,[(zmienił zdanie. Niech się stworzonko pobawi....,20,UTF-8,pl,problem00007,"[(dawniej pełna radości i ciepła, a teraz wiec..."
7,"[(Słowem, które Sherlock najczęściej słyszał w...",5,UTF-8,pl,problem00008,"[(, uderzającego o żebra niczym dzwon- niemal ..."
8,[(pero no lo ama como ama a Guignol –explicó e...,20,UTF-8,sp,problem00009,[(–La nariz puntiaguda del elfo casi rozaba el...
9,"[(incapaz de señalar un momento exacto, un pun...",5,UTF-8,sp,problem00010,[(tan parecidas hizo que su trasero latiese de...


In [13]:
#*******************************************************************************************************
import warnings
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.preprocessing import LabelEncoder


def eval_measures(gt, pred):
    """Compute macro-averaged F1-scores, macro-averaged precision, 
    macro-averaged recall, and micro-averaged accuracy according the ad hoc
    rules discussed at the top of this file.
    Parameters
    ----------
    gt : dict
        Ground truth, where keys indicate text file names
        (e.g. `unknown00002.txt`), and values represent
        author labels (e.g. `candidate00003`)
    pred : dict
        Predicted attribution, where keys indicate text file names
        (e.g. `unknown00002.txt`), and values represent
        author labels (e.g. `candidate00003`)
    Returns
    -------
    f1 : float
        Macro-averaged F1-score
    precision : float
        Macro-averaged precision
    recall : float
        Macro-averaged recall
    accuracy : float
        Micro-averaged F1-score
    """

    actual_authors = list(gt.values())
    encoder = LabelEncoder().fit(['<UNK>'] + actual_authors)

    text_ids, gold_authors, silver_authors = [], [], []
    for text_id in sorted(gt):
        text_ids.append(text_id)
        gold_authors.append(gt[text_id])
        try:
            silver_authors.append(pred[text_id])
        except KeyError:
            # missing attributions get <UNK>:
            silver_authors.append('<UNK>')

    assert len(text_ids) == len(gold_authors)
    assert len(text_ids) == len(silver_authors)

    # replace non-existent silver authors with '<UNK>':
    silver_authors = [a if a in encoder.classes_ else '<UNK>' 
                      for a in silver_authors]

    gold_author_ints   = encoder.transform(gold_authors)
    silver_author_ints = encoder.transform(silver_authors)

    # get F1 for individual classes (and suppress warnings):
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        f1 = f1_score(gold_author_ints,
                  silver_author_ints,
                  labels=list(set(gold_author_ints)),
                  average='macro')
        precision = precision_score(gold_author_ints,
                  silver_author_ints,
                  labels=list(set(gold_author_ints)),
                  average='macro')
        recall = recall_score(gold_author_ints,
                  silver_author_ints,
                  labels=list(set(gold_author_ints)),
                  average='macro')
        accuracy = accuracy_score(gold_author_ints,
                  silver_author_ints)

    return f1,precision,recall,accuracy


In [14]:
def evaluate(ground_truth_file,predictions_file):
    # Calculates evaluation measures for a single attribution problem
    gt = {}
    with open(ground_truth_file, 'r') as f:
        for attrib in json.load(f)['ground_truth']:
            gt[attrib['unknown-text']] = attrib['true-author']

    pred = {}
    with open(predictions_file, 'r') as f:
        for attrib in json.load(f):
            if attrib['unknown-text'] not in pred:
                pred[attrib['unknown-text']] = attrib['predicted-author']
    f1,precision,recall,accuracy =  eval_measures(gt,pred)
    return f1, precision, recall, accuracy

In [15]:
from sklearn.base import BaseEstimator
from scipy.sparse import issparse


class DenseTransformer(BaseEstimator):
    """Convert a sparse array into a dense array."""

    def __init__(self, return_copy=True):
        self.return_copy = return_copy
        self.is_fitted = False

    def transform(self, X, y=None):
        """ Return a dense version of the input array.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like, shape = [n_samples] (default: None)
        Returns
        ---------
        X_dense : dense version of the input X array.
        """
        if issparse(X):
            return X.toarray()
        elif self.return_copy:
            return X.copy()
        else:
            return X

    def fit(self, X, y=None):
        """ Mock method. Does nothing.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like, shape = [n_samples] (default: None)
        Returns
        ---------
        self
        """
        self.is_fitted = True
        return self

    def fit_transform(self, X, y=None):
        """ Return a dense version of the input array.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like, shape = [n_samples] (default: None)
        Returns
        ---------
        X_dense : dense version of the input X array.
        """
        return self.transform(X=X, y=y)

In [16]:
from sklearn.base import BaseEstimator
from scipy.sparse import issparse


class ObfuscationTransformer(BaseEstimator):
    def __init__(self,re_from=r'(\b)(\w{0,2})\w+(\w{1,3})(\b)', re_to=r'\1\2XX\3\4', return_copy=True):
        self.re_from = re_from
        self.re_to = re_to

    def transform(self, X, y=None):
        X = np.array(X).copy();
        for i in range(len(X)):
            X[i] = re.sub(self.re_from,self.re_to, X[i])
        
        return X;

    def fit(self, X, y=None):
        return self

    def fit_transform(self, X, y=None):
        return self.transform(X=X, y=y)

### examinando o parametro min_df isoladamente

In [19]:
def runML(problem):
    print ("\nProblem: %s,  language: %s, " %(problem['problem'],problem['language']))
    
    train_docs, train_labels, _   = zip(*problem['candidates'])
    problem['training_docs_size'] = len(train_docs);
    test_docs, _, test_filename   = zip(*problem['unknown'])
    
    pipeline = Pipeline([
        ('obs',ObfuscationTransformer(re_from=r'\w',re_to='x')),
        ('vect',   TfidfVectorizer(analyzer='char',
                                   min_df=0.01,
                                   max_df=1.0,
                                   norm='l1',
                                   ngram_range=(3,5),
                                   sublinear_tf=True,
                                   smooth_idf=True,
                                   lowercase =False)),
        ('dense',  DenseTransformer()),
        ('scaler', MaxAbsScaler()),
        ('transf', PCA(0.999)),
        ('clf', LogisticRegression(random_state=0,multi_class='multinomial', solver='newton-cg')),
    ])
    
    
    # uncommenting more parameters will give better exploring power but will
    # increase processing time in a combinatorial way
    parameters = {
        'vect__min_df':(2,0.01,0.05,0.1)
    }
    
    grid_search = GridSearchCV(pipeline,
                               parameters,
                               cv=5,
                               n_jobs=-1,
                               verbose=False,
                               scoring='f1_macro')
    
    print("Performing grid search...")
    t0 = time()
    grid_search.fit(train_docs, train_labels)
    print("done in %0.3fs" % (time() - t0))

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
        
    train_pred=grid_search.predict(train_docs);
    test_pred=grid_search.predict(test_docs);
    
    
    # Writing output file
    out_data=[]
    for i,v in enumerate(test_pred):
        out_data.append({'unknown-text': test_filename[i],'predicted-author': v})
    answerFile = pathjoin(outputDir,'answers-'+problem['problem']+'.json');
    with open(answerFile, 'w') as f:
        json.dump(out_data, f, indent=4)
    
    
    #evaluation train
    f1,precision,recall,accuracy=evaluate(
                pathjoin(inputDir, problem['problem'], 'ground-truth.json'),
                answerFile)
    
    return {
                'problem-name'  :       problem['problem'],
                "language"      :       problem['language'],
                'AuthorCount'   :       len(set(train_labels)),
                "train_doc_size":       len(train_docs),
                "train_caract_per_doc": sum([len(l) for l in train_docs])/len(train_docs),
                "test_doc_size" :       len(test_docs),
                "test_caract_per_doc":  sum([len(l) for l in test_docs])/len(test_docs),
                
                'macro-f1'       : round(f1,3),
                'macro-precision': round(precision,3),
                'macro-recall'   : round(recall,3),
                'micro-accuracy' : round(accuracy,3),
                
        }, grid_search.cv_results_, best_parameters;

In [20]:
result = [];
cv_result = [];
best_parameters = [];
for problem in problems:
    r, c, b = runML(problem);
    result.append(r);
    cv_result.append(c);
    b['problem'] = problem['problem'];
    best_parameters.append(b);


Problem: problem00001,  language: en, 
Performing grid search...


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


done in 26.488s
Best score: 0.612
Best parameters set:
	vect__min_df: 0.01

Problem: problem00002,  language: en, 
Performing grid search...
done in 7.856s
Best score: 0.871
Best parameters set:
	vect__min_df: 2

Problem: problem00003,  language: fr, 
Performing grid search...
done in 31.069s
Best score: 0.651
Best parameters set:
	vect__min_df: 2

Problem: problem00004,  language: fr, 
Performing grid search...
done in 8.112s
Best score: 0.570
Best parameters set:
	vect__min_df: 0.01

Problem: problem00005,  language: it, 
Performing grid search...
done in 35.381s
Best score: 0.651
Best parameters set:
	vect__min_df: 2

Problem: problem00006,  language: it, 
Performing grid search...
done in 8.401s
Best score: 0.796
Best parameters set:
	vect__min_df: 0.1

Problem: problem00007,  language: pl, 
Performing grid search...
done in 40.481s
Best score: 0.756
Best parameters set:
	vect__min_df: 0.05

Problem: problem00008,  language: pl, 
Performing grid search...
done in 8.826s
Best score:

In [21]:
pd.DataFrame(best_parameters)[['problem','vect__min_df']]

Unnamed: 0,problem,vect__min_df
0,problem00001,0.01
1,problem00002,2.0
2,problem00003,2.0
3,problem00004,0.01
4,problem00005,2.0
5,problem00006,0.1
6,problem00007,0.05
7,problem00008,2.0
8,problem00009,0.01
9,problem00010,0.01


### analisando os demais parametros

In [36]:
def runML(problem):
    print ("\nProblem: %s,  language: %s, " %(problem['problem'],problem['language']))
    
    train_docs, train_labels, _   = zip(*problem['candidates'])
    problem['training_docs_size'] = len(train_docs);
    test_docs, _, test_filename   = zip(*problem['unknown'])
    
    pipeline = Pipeline([
        ('obs',ObfuscationTransformer(re_from=r'\w',re_to='x')),
        ('vect',   TfidfVectorizer(analyzer='char',
                                   min_df=0.01,
                                   max_df=1.0,
                                   norm='l1',
                                   lowercase =False,
                                   sublinear_tf=True)),
        ('dense',  DenseTransformer()),
        ('scaler', MaxAbsScaler()),
        ('transf', PCA()),
        ('clf', LogisticRegression(random_state=0,multi_class='multinomial', solver='newton-cg')),
    ])
    
    
    # uncommenting more parameters will give better exploring power but will
    # increase processing time in a combinatorial way
    parameters = {
        'vect__ngram_range':((2,3),(2,4),(2,5),(3,5)),
        'vect__sublinear_tf':(True, False),
        'transf__n_components': (0.1,0.5,0.9,0.999),
    }
    
    grid_search = GridSearchCV(pipeline,
                               parameters,
                               cv=5,
                               n_jobs=-1,
                               verbose=False,
                               scoring='f1_macro')
    
    print("Performing grid search...")
    t0 = time()
    grid_search.fit(train_docs, train_labels)
    print("done in %0.3fs" % (time() - t0))

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
        
    train_pred=grid_search.predict(train_docs);
    test_pred=grid_search.predict(test_docs);
    
    
    # Writing output file
    out_data=[]
    for i,v in enumerate(test_pred):
        out_data.append({'unknown-text': test_filename[i],'predicted-author': v})
    answerFile = pathjoin(outputDir,'answers-'+problem['problem']+'.json');
    with open(answerFile, 'w') as f:
        json.dump(out_data, f, indent=4)
    
    
    #evaluation train
    f1,precision,recall,accuracy=evaluate(
                pathjoin(inputDir, problem['problem'], 'ground-truth.json'),
                answerFile)
    
    return {
                'problem-name'  :       problem['problem'],
                "language"      :       problem['language'],
                'AuthorCount'   :       len(set(train_labels)),
                "train_doc_size":       len(train_docs),
                "train_caract_per_doc": sum([len(l) for l in train_docs])/len(train_docs),
                "test_doc_size" :       len(test_docs),
                "test_caract_per_doc":  sum([len(l) for l in test_docs])/len(test_docs),
                
                'macro-f1'       : round(f1,3),
                'macro-precision': round(precision,3),
                'macro-recall'   : round(recall,3),
                'micro-accuracy' : round(accuracy,3),
                
        }, grid_search.cv_results_,best_parameters;

In [37]:
result = [];
cv_result = [];
best_parameters = [];
for problem in problems:
    r, c, b = runML(problem);
    result.append(r);
    cv_result.append(c);
    b['problem'] = problem['problem'];
    best_parameters.append(b);


Problem: problem00001,  language: en, 
Performing grid search...
done in 209.749s
Best score: 0.637
Best parameters set:
	transf__n_components: 0.9
	vect__ngram_range: (2, 5)
	vect__sublinear_tf: True

Problem: problem00002,  language: en, 
Performing grid search...
done in 53.201s
Best score: 0.943
Best parameters set:
	transf__n_components: 0.999
	vect__ngram_range: (2, 3)
	vect__sublinear_tf: True

Problem: problem00003,  language: fr, 
Performing grid search...
done in 221.275s
Best score: 0.661
Best parameters set:
	transf__n_components: 0.999
	vect__ngram_range: (2, 4)
	vect__sublinear_tf: True

Problem: problem00004,  language: fr, 
Performing grid search...
done in 59.277s
Best score: 0.636
Best parameters set:
	transf__n_components: 0.999
	vect__ngram_range: (3, 5)
	vect__sublinear_tf: False

Problem: problem00005,  language: it, 
Performing grid search...
done in 254.150s
Best score: 0.650
Best parameters set:
	transf__n_components: 0.9
	vect__ngram_range: (2, 4)
	vect__subl

In [38]:
df=pd.DataFrame(result)[['problem-name',
                     "language",
                     'AuthorCount',
                     "train_doc_size","train_caract_per_doc",
                     "test_doc_size", "test_caract_per_doc",
                     'macro-f1','macro-precision','macro-recall' ,'micro-accuracy']]

In [39]:
df

Unnamed: 0,problem-name,language,AuthorCount,train_doc_size,train_caract_per_doc,test_doc_size,test_caract_per_doc,macro-f1,macro-precision,macro-recall,micro-accuracy
0,problem00001,en,20,140,4327,105,4370,0.457,0.437,0.627,0.562
1,problem00002,en,5,35,4342,21,4296,0.337,0.352,0.453,0.333
2,problem00003,fr,20,140,4492,49,4508,0.615,0.629,0.659,0.633
3,problem00004,fr,5,35,4522,21,4532,0.559,0.65,0.707,0.524
4,problem00005,it,20,140,4720,80,4787,0.454,0.413,0.589,0.625
5,problem00006,it,5,35,4847,46,4765,0.575,0.608,0.687,0.804
6,problem00007,pl,20,140,5145,103,5200,0.468,0.485,0.527,0.456
7,problem00008,pl,5,35,5049,15,5214,0.556,0.55,0.711,0.667
8,problem00009,sp,20,140,4794,117,4788,0.564,0.569,0.638,0.598
9,problem00010,sp,5,35,4955,64,4827,0.623,0.651,0.683,0.656


In [40]:
print(df[["macro-f1"]].reset_index().to_latex(index=False).replace("     "," "))

\begin{tabular}{rr}
\toprule
 index &  macro-f1 \\
\midrule
 0 & 0.457 \\
 1 & 0.337 \\
 2 & 0.615 \\
 3 & 0.559 \\
 4 & 0.454 \\
 5 & 0.575 \\
 6 & 0.468 \\
 7 & 0.556 \\
 8 & 0.564 \\
 9 & 0.623 \\
\bottomrule
\end{tabular}



In [41]:
languages={
    'en':'inglesa',
    'sp':'espanhola',
    'it':'italiana',
    'pl':'polonesa',
    'fr':'francesa'
}

In [42]:
cv_result2 = [];
dfCV = pd.DataFrame();
for i, c in enumerate(cv_result):
    temp = pd.DataFrame(c);
    temp['problem'] = i+1;
    temp['language'] = languages[problems[i]['language']]
    dfCV = dfCV.append(temp);

for p in ['param_transf__n_components',
    'mean_test_score','std_test_score','mean_train_score',   
    'split0_test_score','split0_train_score',
    'split1_test_score','split1_train_score',
    'split2_test_score','split2_train_score',
    'split3_test_score','split3_train_score',
    'split4_test_score','split4_train_score']:
    dfCV[p]=dfCV[p].astype(np.float32);

    
dfCV =dfCV[[
    'problem',
    'language',
    'rank_test_score',
    'param_transf__n_components',
    'param_vect__ngram_range',
    'param_vect__sublinear_tf',
    'mean_test_score',   
    'std_test_score',
    'mean_train_score',   

    'split0_test_score','split0_train_score',
    'split1_test_score','split1_train_score',
    'split2_test_score','split2_train_score',
    'split3_test_score','split3_train_score',
    'split4_test_score','split4_train_score',

    'mean_score_time',
    'mean_fit_time',
    'std_fit_time',
    'std_score_time',
    'std_train_score',
]];

dfCV.rename(columns={
    'param_transf__n_components':'PCA_componentes',
    'param_vect__ngram_range':'ngram_range',
    'param_vect__sublinear_tf':'sublinear_tf',
    'param_vect__smooth_idf':'smooth_idf'
},inplace=True);

#print('\',\n\''.join(dfCV.columns))

In [43]:
dfCV.to_csv('PANAA2018_MASK.csv', index=False)

In [44]:
(dfCV[dfCV.rank_test_score == 1])[
    ['problem',
     'language',
    'rank_test_score',
    'mean_test_score',
    'std_test_score',
    'ngram_range',
    'sublinear_tf',
    'PCA_componentes']
].sort_values(by=[
    'problem',
    'mean_test_score',
    'ngram_range',
    'sublinear_tf',
    'PCA_componentes'
], ascending=[True, False,False,False,False])

Unnamed: 0,problem,language,rank_test_score,mean_test_score,std_test_score,ngram_range,sublinear_tf,PCA_componentes
20,1,inglesa,1,0.637,0.094,"(2, 5)",True,0.9
24,2,inglesa,1,0.943,0.09,"(2, 3)",True,0.999
26,3,francesa,1,0.661,0.102,"(2, 4)",True,0.999
31,4,francesa,1,0.636,0.193,"(3, 5)",False,0.999
18,5,italiana,1,0.65,0.054,"(2, 4)",True,0.9
19,6,italiana,1,0.834,0.143,"(2, 4)",False,0.9
22,7,polonesa,1,0.754,0.085,"(3, 5)",True,0.9
30,8,polonesa,1,0.931,0.093,"(3, 5)",True,0.999
22,8,polonesa,1,0.931,0.093,"(3, 5)",True,0.9
31,8,polonesa,1,0.931,0.093,"(3, 5)",False,0.999


In [45]:
dfCV.pivot_table(
            index=['problem','language','PCA_componentes'],
            columns=['sublinear_tf', 'ngram_range'],
            values='mean_test_score'
        )

Unnamed: 0_level_0,Unnamed: 1_level_0,sublinear_tf,False,False,False,False,True,True,True,True
Unnamed: 0_level_1,Unnamed: 1_level_1,ngram_range,"(2, 3)","(2, 4)","(2, 5)","(3, 5)","(2, 3)","(2, 4)","(2, 5)","(3, 5)"
problem,language,PCA_componentes,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
1,inglesa,0.1,0.094,0.093,0.088,0.089,0.077,0.082,0.099,0.103
1,inglesa,0.5,0.391,0.36,0.431,0.412,0.433,0.496,0.484,0.473
1,inglesa,0.9,0.526,0.56,0.566,0.553,0.6,0.619,0.637,0.582
1,inglesa,0.999,0.517,0.548,0.542,0.554,0.634,0.63,0.629,0.612
2,inglesa,0.1,0.371,0.333,0.266,0.295,0.41,0.394,0.363,0.363
2,inglesa,0.5,0.672,0.672,0.666,0.666,0.829,0.901,0.872,0.872
2,inglesa,0.9,0.741,0.789,0.685,0.618,0.901,0.874,0.874,0.874
2,inglesa,0.999,0.8,0.806,0.768,0.817,0.943,0.912,0.874,0.836
3,francesa,0.1,0.142,0.147,0.194,0.205,0.179,0.202,0.198,0.189
3,francesa,0.5,0.435,0.449,0.506,0.517,0.47,0.488,0.509,0.552


In [46]:
pd.options.display.precision = 3  
print(u"\\begin{table}[h]\n\\centering\n\\caption{Medida F1 para os parâmetros }")

print(dfCV.pivot_table(
        index=['problem','language','PCA_componentes'],
        columns=['sublinear_tf', 'ngram_range'],
        values='mean_test_score'
    ).to_latex().replace("     "," "))
print ("\label{tab:modeloPalavra}")
print(r"\end{table}")

\begin{table}[h]
\centering
\caption{Medida F1 para os parâmetros }
\begin{tabular}{lllrrrrrrrr}
\toprule
   &   & sublinear\_tf & \multicolumn{4}{l}{False} & \multicolumn{4}{l}{True} \\
   &   & ngram\_range & (2, 3) & (2, 4) & (2, 5) & (3, 5) & (2, 3) & (2, 4) & (2, 5) & (3, 5) \\
problem & language & PCA\_componentes &    &    &    &    &    &    &    &    \\
\midrule
1  & inglesa & 0.100 &  0.094 &  0.093 &  0.088 &  0.089 &  0.077 &  0.082 &  0.099 &  0.103 \\
   &   & 0.500 &  0.391 &  0.360 &  0.431 &  0.412 &  0.433 &  0.496 &  0.484 &  0.473 \\
   &   & 0.900 &  0.526 &  0.560 &  0.566 &  0.553 &  0.600 &  0.619 &  0.637 &  0.582 \\
   &   & 0.999 &  0.517 &  0.548 &  0.542 &  0.554 &  0.634 &  0.630 &  0.629 &  0.612 \\
2  & inglesa & 0.100 &  0.371 &  0.333 &  0.266 &  0.295 &  0.410 &  0.394 &  0.363 &  0.363 \\
   &   & 0.500 &  0.672 &  0.672 &  0.666 &  0.666 &  0.829 &  0.901 &  0.872 &  0.872 \\
   &   & 0.900 &  0.741 &  0.789 &  0.685 &  0.618 &  0.901 &  0.874 &  0.

In [47]:


dfCV[(dfCV.language=='it') & (dfCV.sublinear_tf==True)].plot(x='PCA_componentes',y='mean_test_score', kind='scatter')
dfCV[(dfCV.language=='it')& (dfCV.sublinear_tf==False)].plot(x='PCA_componentes',y='mean_test_score', kind='scatter')


ValueError: scatter requires x column to be numeric