# Notebook para o PAN - Atribuição Autoral - 2018

In [1]:
%matplotlib inline
#python basic libs
from __future__ import print_function
import os;
from os.path import join as pathjoin;
import glob;
import json;
import codecs;
from collections import defaultdict;
import pprint;


#data analysis libs
import numpy as np;
import pandas as pd;
import matplotlib.pyplot as plt;
import random;

#machine learning libs
#feature extraction
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#preprocessing and transformation
from sklearn.preprocessing import normalize, MaxAbsScaler, MinMaxScaler;
from sklearn.preprocessing import LabelBinarizer;
from sklearn.decomposition import PCA;
from sklearn.metrics.pairwise import cosine_similarity;

#classifiers
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE,SelectFpr,SelectPercentile, chi2;

#
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier;
from sklearn.ensemble import VotingClassifier

#model evaluation
from sklearn.model_selection import train_test_split;
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score;

In [2]:
import platform; print(platform.platform())
print("NumPy", np.__version__)
import scipy; print("SciPy", scipy.__version__)
import sklearn; print("Scikit-Learn", sklearn.__version__)

Darwin-17.4.0-x86_64-i386-64bit
NumPy 1.14.0
SciPy 1.0.0
Scikit-Learn 0.19.1


### paths configuration

In [3]:
# Root folder of the project. Set the PAN18_BASE_DIR environment variable to
# run the notebook on another machine without editing this cell; the original
# location is kept as the default.
baseDir = os.environ.get(
    'PAN18_BASE_DIR',
    '/Users/joseeleandrocustodio/Dropbox/mestrado/02 - Pesquisa/code')

# Folder holding collection-info.json and one sub-folder per problem.
inputDir = pathjoin(baseDir, 'pan18aa')
# Folder where the answers-<problem>.json files are written.
outputDir = pathjoin(baseDir, 'out', "oficial")
# makedirs also creates the intermediate 'out' folder; os.mkdir would raise
# if it did not exist yet.
if not os.path.exists(outputDir):
    os.makedirs(outputDir)

In [4]:
pprinter = pprint.PrettyPrinter(indent=4)

## loading the dataset

In [5]:
def readCollectionsOfProblems(path):
    """Load the list of attribution problems of a collection.

    Reads ``collection-info.json`` located directly under ``path`` and keeps
    the name, language and encoding of every listed problem.

    Parameters
    ----------
    path : str
        Directory that contains ``collection-info.json``.

    Returns
    -------
    list of dict
        One dict per problem, in file order, with the keys ``'problem'``,
        ``'language'`` and ``'encoding'``.
    """
    collection_file = path + os.sep + 'collection-info.json'
    problems = []
    with open(collection_file, 'r') as handle:
        for entry in json.load(handle):
            problems.append({
                'problem': entry['problem-name'],
                'language': entry['language'],
                'encoding': entry['encoding'],
            })
    return problems

In [6]:
problems = readCollectionsOfProblems(inputDir);

In [7]:
problems[0]

{'encoding': u'UTF-8', 'language': u'en', 'problem': u'problem00001'}

In [8]:
pd.DataFrame(problems).groupby(by='language').count()

Unnamed: 0_level_0,encoding,problem
language,Unnamed: 1_level_1,Unnamed: 2_level_1
en,2,2
fr,2,2
it,2,2
pl,2,2
sp,2,2


In [9]:
def readProblem(path, problem):
    """Read the description of a single attribution problem.

    Parameters
    ----------
    path : str
        Directory of the collection.
    problem : str
        Name of the problem sub-folder that holds ``problem-info.json``.

    Returns
    -------
    tuple
        ``(unknown_folder, candidates)``: the value stored under
        ``'unknown-folder'`` and the list of ``'author-name'`` values of the
        ``'candidate-authors'`` entries, in file order.
    """
    info_file = path + os.sep + problem + os.sep + 'problem-info.json'
    with open(info_file, 'r') as handle:
        info = json.load(handle)
    unknown_folder = info['unknown-folder']
    candidate_names = [entry['author-name'] for entry in info['candidate-authors']]
    return unknown_folder, candidate_names

In [10]:
def read_files(path, label, encoding='utf-8'):
    """Read every ``*.txt`` file of one class folder.

    Parameters
    ----------
    path : str
        Directory that contains the class folders.
    label : str
        Name of the sub-folder to read; it is also used as the class label
        of every returned text.
    encoding : str, optional
        Text encoding of the files (default ``'utf-8'``).

    Returns
    -------
    list of tuple
        ``(text, label, file_name)`` for every file, ordered by file path.
        An empty list is returned when no file matches.
    """
    texts = []
    # glob returns files in an arbitrary, OS-dependent order; sorting keeps
    # positional indices into the result reproducible between runs.
    for filename in sorted(glob.glob(pathjoin(path, label, '*.txt'))):
        # the context manager closes the handle even if read() raises
        with codecs.open(filename, 'r', encoding=encoding) as handle:
            texts.append((handle.read(), label, os.path.basename(filename)))
    return texts

In [11]:
# Attach to every problem its known (candidate) texts and its unknown texts.
for problem in problems:
    # folder of this problem, shared by all candidate and unknown sub-folders
    problem_dir = pathjoin(inputDir, problem['problem'])
    unk_folder, candidates_folder = readProblem(inputDir, problem['problem'])
    problem['candidates_folder_count'] = len(candidates_folder)
    # flat list of (text, author, file_name) over all candidate authors
    problem['candidates'] = []
    for candidate in candidates_folder:
        problem['candidates'].extend(read_files(problem_dir, candidate))

    problem['unknown'] = read_files(problem_dir, unk_folder)

In [12]:
pd.DataFrame(problems)

Unnamed: 0,candidates,candidates_folder_count,encoding,language,problem,unknown
0,"[(graceful ones.\n\n""One more,"" Marvelous said...",20,UTF-8,en,problem00001,"[(after all, his best friends. And what in the..."
1,"[(a mission.""\n\nJensen just raises an eyebrow...",5,UTF-8,en,problem00002,"[(“Potter was attractive,” Draco thought, sigh..."
2,[(qui l'avait tué mais tout était de la faute ...,20,UTF-8,fr,problem00003,[(son réveil. Sa main pulse et Draco frotte l'...
3,[(. Le canapé est vide et lorsqu'il passe deva...,5,UTF-8,fr,problem00004,"[(abasourdie.\n\nTout d'abord, elle crut que s..."
4,"[(Eppure lui la mappa l’aveva stampata, dannaz...",20,UTF-8,it,problem00005,[(– Oh. Cazzo.\nSirius era così sconvolto che ...
5,[(Yato ha trovato una lettera sul suo comodino...,5,UTF-8,it,problem00006,"[(così la tua vista, Moony?\n– Cercavo di esse..."
6,[(zmienił zdanie. Niech się stworzonko pobawi....,20,UTF-8,pl,problem00007,"[(dawniej pełna radości i ciepła, a teraz wiec..."
7,"[(Słowem, które Sherlock najczęściej słyszał w...",5,UTF-8,pl,problem00008,"[(, uderzającego o żebra niczym dzwon- niemal ..."
8,[(pero no lo ama como ama a Guignol –explicó e...,20,UTF-8,sp,problem00009,[(–La nariz puntiaguda del elfo casi rozaba el...
9,"[(incapaz de señalar un momento exacto, un pun...",5,UTF-8,sp,problem00010,[(tan parecidas hizo que su trasero latiese de...


In [13]:
#*******************************************************************************************************
import warnings
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.preprocessing import LabelEncoder


def eval_measures(gt, pred):
    """Score predicted attributions against the ground truth.

    Texts missing from ``pred`` and predicted authors that do not occur in
    the ground truth are both scored as the extra label ``'<UNK>'``. The
    macro-averaged scores only average over the authors that occur in ``gt``.

    Parameters
    ----------
    gt : dict
        Ground truth, where keys indicate text file names
        (e.g. `unknown00002.txt`), and values represent
        author labels (e.g. `candidate00003`)
    pred : dict
        Predicted attribution, with the same key/value layout as ``gt``

    Returns
    -------
    f1 : float
        Macro-averaged F1-score
    precision : float
        Macro-averaged precision
    recall : float
        Macro-averaged recall
    accuracy : float
        Accuracy over all texts of ``gt``
    """
    encoder = LabelEncoder().fit(['<UNK>'] + list(gt.values()))

    # one gold and one silver label per ground-truth text, in sorted key order
    text_ids = sorted(gt)
    gold_authors = [gt[text_id] for text_id in text_ids]
    # missing attributions get <UNK>
    silver_authors = [pred[text_id] if text_id in pred else '<UNK>'
                      for text_id in text_ids]

    # replace non-existent silver authors with '<UNK>'
    known_authors = encoder.classes_
    silver_authors = [author if author in known_authors else '<UNK>'
                      for author in silver_authors]

    gold_author_ints = encoder.transform(gold_authors)
    silver_author_ints = encoder.transform(silver_authors)

    # only the labels present in the ground truth take part in the macro average
    scored_labels = list(set(gold_author_ints))

    # warnings raised by the scorers are suppressed
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        f1, precision, recall = [
            scorer(gold_author_ints,
                   silver_author_ints,
                   labels=scored_labels,
                   average='macro')
            for scorer in (f1_score, precision_score, recall_score)
        ]
        accuracy = accuracy_score(gold_author_ints, silver_author_ints)

    return f1, precision, recall, accuracy


In [14]:
def evaluate(ground_truth_file,predictions_file):
    """Compute the evaluation measures of a single attribution problem.

    Parameters
    ----------
    ground_truth_file : str
        JSON file whose ``'ground_truth'`` list holds entries with the keys
        ``'unknown-text'`` and ``'true-author'``.
    predictions_file : str
        JSON file holding a list of entries with the keys ``'unknown-text'``
        and ``'predicted-author'``. When a text appears more than once only
        its first prediction is kept.

    Returns
    -------
    tuple
        ``(f1, precision, recall, accuracy)`` as returned by ``eval_measures``.
    """
    with open(ground_truth_file, 'r') as handle:
        gt = dict(
            (entry['unknown-text'], entry['true-author'])
            for entry in json.load(handle)['ground_truth']
        )

    pred = {}
    with open(predictions_file, 'r') as handle:
        for entry in json.load(handle):
            text_id = entry['unknown-text']
            if text_id in pred:
                # keep the first prediction of a repeated text
                continue
            pred[text_id] = entry['predicted-author']

    f1, precision, recall, accuracy = eval_measures(gt, pred)
    return f1, precision, recall, accuracy

In [15]:
from sklearn.tree     import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC;
def runML(problem):
    """Train and evaluate the character n-gram baseline on one problem.

    Pipeline: TF-IDF over character 3- to 5-grams, max-abs scaling, recursive
    feature elimination driven by a linear SVC, and a multinomial logistic
    regression. The predictions for the unknown texts are written to
    ``answers-<problem>.json`` inside ``outputDir`` and scored against the
    problem's ``ground-truth.json`` found under ``inputDir``.

    Parameters
    ----------
    problem : dict
        Problem entry with the keys ``'problem'``, ``'language'``,
        ``'candidates'`` and ``'unknown'``; the last two are lists of
        ``(text, label, file_name)`` tuples. The key ``'training_docs_size'``
        is added to it as a side effect.

    Returns
    -------
    dict
        Problem name, language, number of training documents, number of
        distinct authors and the four scores rounded to three decimals.
    """
    print ("Problem: %s,  language: %s, " %(problem['problem'],problem['language']))

    train_docs, train_labels, _ = zip(*problem['candidates'])
    problem['training_docs_size'] = len(train_docs)
    test_docs, _, test_filename = zip(*problem['unknown'])

    # feature extraction: case-sensitive character n-grams that occur in at
    # least 30% of the training documents
    vectorizer = TfidfVectorizer(analyzer="char", min_df=0.3, max_df=1.0,
                                 lowercase=False, ngram_range=(3, 5))
    train_mx = vectorizer.fit_transform(train_docs)
    test_mx = vectorizer.transform(test_docs)

    # scale every feature by its maximum absolute training value
    scaler = MaxAbsScaler()
    train_mx = scaler.fit_transform(train_mx)
    test_mx = scaler.transform(test_mx)

    # feature selection: drop 5% of the features per elimination step; the
    # number of features kept is left at the RFE default
    selector = RFE(estimator=SVC(kernel="linear", C=1), step=0.05, verbose=False)
    train_mx = selector.fit_transform(train_mx, train_labels)
    test_mx = selector.transform(test_mx)

    # machine learning
    clf = LogisticRegression(random_state=0, multi_class='multinomial',
                             solver='newton-cg', C=0.5)
    clf.fit(train_mx, train_labels)
    test_pred = clf.predict(test_mx)

    # Writing output file
    out_data = [
        {'unknown-text': filename, 'predicted-author': author}
        for filename, author in zip(test_filename, test_pred)
    ]
    answerFile = pathjoin(outputDir, 'answers-' + problem['problem'] + '.json')
    with open(answerFile, 'w') as f:
        json.dump(out_data, f, indent=4)

    # evaluation of the predictions for the unknown (test) texts
    f1, precision, recall, accuracy = evaluate(
        pathjoin(inputDir, problem['problem'], 'ground-truth.json'),
        answerFile)

    return {
        'problem-name'   : problem['problem'],
        "train_doc_size" : len(train_docs),
        "language"       : problem['language'],
        'macro-f1'       : round(f1, 3),
        'macro-precision': round(precision, 3),
        'macro-recall'   : round(recall, 3),
        'micro-accuracy' : round(accuracy, 3),
        'AuthorCount'    : len(set(train_labels))
    }
    
    #evaluate_all(inputDir,outputDir, outputDir, instanceName)
    
    

In [None]:
result = [];
for problem in problems:
    result.append(runML(problem));
pd.DataFrame(result)

Problem: problem00001,  language: en, 
Problem: problem00002,  language: en, 
Problem: problem00003,  language: fr, 
Problem: problem00004,  language: fr, 
Problem: problem00005,  language: it, 
Problem: problem00006,  language: it, 
Problem: problem00007,  language: pl, 


In [None]:
pd.DataFrame(result)[['macro-f1']].describe()

In [None]:
pd.DataFrame(result)\
    .sort_values(by=['language','problem-name'])[['language','problem-name','macro-f1']]\
    .plot(kind='bar', x=['language','problem-name'], legend=True, figsize=(20,5))

<br/><br/><br/><br/><br/>

#  Abordagem desafiante 1

In [None]:
from gensim.models import Word2Vec;

In [None]:
class NgramSplitter(object):
    """Re-iterable corpus that turns text into lists of character n-grams.

    Iterating yields one list of n-grams ("sentence") per document, so an
    instance can be passed wherever an iterable of token lists is expected.

    Parameters
    ----------
    text : str, unicode, list or tuple
        A single document, or a list/tuple of documents. Any other type
        yields nothing.
    ngram : tuple of int
        ``(min_size, max_size)`` of the extracted character n-grams.
    vocabulary : container, optional
        When given, only n-grams contained in it are kept.
    """

    # Types treated as one single document; `unicode` only exists on Python 2.
    try:
        _STRING_TYPES = (str, unicode)
    except NameError:
        _STRING_TYPES = (str,)

    def __init__(self, text, ngram=(3,3), vocabulary=None):
        self.text = text
        self.ngram_min = ngram[0]
        self.ngram_max = ngram[1]
        self.vocabulary = vocabulary

    def text2ngrams(self,text):
        """Return the character n-grams of ``text``.

        N-grams are ordered by start position and, for equal positions, by
        increasing size. Every start position that still fits an n-gram of
        the minimum size is visited, so the shorter n-grams at the end of
        the text (and texts shorter than the maximum size) are included.
        """
        length = len(text)
        vect = [
            text[start:start + size]
            for start in range(length - self.ngram_min + 1)
            for size in range(self.ngram_min, self.ngram_max + 1)
            if start + size <= length
        ]

        if self.vocabulary is not None:
            return [word for word in vect if word in self.vocabulary]
        # drop empty strings
        return [word for word in vect if word]

    def __iter__(self):
        if isinstance(self.text, (list, tuple)):
            for document in self.text:
                yield self.text2ngrams(document)
        elif isinstance(self.text, self._STRING_TYPES):
            yield self.text2ngrams(self.text)

In [None]:
def simpleCosine(a, b):
    '''
    Cosine similarity between the two vectors a and b.

    sklearn's cosine_similarity compares every row of one matrix with every
    row of another, which is not needed when only one pair of vectors is
    compared, so this small helper is used instead.

    Parameters
    ----------
    a, b : array-like
        One-dimensional vectors of the same length.

    Returns
    -------
    float
        The cosine of the angle between a and b, or 0.0 when either vector
        has zero length (the cosine is undefined there and would be NaN).
    '''
    a = np.asarray(a)
    b = np.asarray(b)
    norm_a = np.sqrt(np.sum(a ** 2))
    norm_b = np.sqrt(np.sum(b ** 2))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return np.sum((a / norm_a) * (b / norm_b))

###  parameters

In [None]:
ngram = (3,5);
embeddingSize  = 50
problem = problems[8];
print ("Problem: %s,  language: %s, " %(problem['problem'],problem['language']))

The author profiles are used to generate the author word vectors.

In [None]:
#creating author profile: all known texts of an author concatenated into one string
# type(u'') is `unicode` on Python 2 and `str` on Python 3, so the cell runs on both
profile = defaultdict(type(u''))
for text, author, _ in problem['candidates']:
    profile[author] += text

Data sets

In [None]:
profile_docs   = list(profile.values())
profile_labels = list(profile.keys())

In [None]:
train_docs, train_labels,_ = zip(*problem['candidates']);

In [None]:
#code from baseline
gt = {}
with open(pathjoin(inputDir, problem['problem'], 'ground-truth.json'), 'r') as f:
    for attrib in json.load(f)['ground_truth']:
        gt[attrib['unknown-text']] = attrib['true-author']

In [None]:
test_docs, _, test_filename = zip(*problem['unknown'])
test_labels = [gt[v] for v in test_filename]

Using count vectorizer to create a fixed vocabulary

In [None]:
vectorizer = CountVectorizer(analyzer="char", ngram_range=ngram, min_df=0.4, max_df=1.0, lowercase=False)
counts = vectorizer.fit_transform(profile_docs);

In [None]:
# Total frequency of every n-gram of the fixed vocabulary over all profiles.
# vectorizer.vocabulary_ maps each term to its column index in `counts`, so the
# total is looked up by column; pairing the dict's iteration order with the
# column-ordered sums would attach counts to the wrong terms.
ngram_totals = np.asarray(counts.sum(axis=0)).ravel()
vocabulary = dict(
    (term, int(ngram_totals[column]))
    for term, column in vectorizer.vocabulary_.items()
)
len(vectorizer.vocabulary_)

In [None]:
#transform a document into a word vector using CBOW
def doc2vectors(doc):
    """Train a CBOW Word2Vec model on the character n-grams of ``doc``.

    The n-gram range (``ngram``), the allowed n-grams
    (``vectorizer.vocabulary_``) and the vector size (``embeddingSize``) are
    read from the notebook's global variables at call time.

    Parameters
    ----------
    doc : str, unicode or list
        Document(s) handed to ``NgramSplitter``.

    Returns
    -------
    The ``wv`` attribute (the trained vectors) of the fitted model.
    """
    sentences = NgramSplitter(doc, ngram=ngram, vocabulary=vectorizer.vocabulary_)
    cbow_model = Word2Vec(
        sentences=sentences,
        sg=0,            # CBOW
        min_count=2,
        size=embeddingSize,
        seed=0
    )
    return cbow_model.wv

### Profile vectors represent each author in the embedding space

In [None]:
profileVectors = {author: doc2vectors(profile[author]) for author in profile};

#### testing author vector against internal documents

In [None]:
documentVectors =[doc2vectors(doc) for doc in train_docs];

In [None]:
def compare(profileVectors, doc):
    """Compare the word vectors of one document with every author profile.

    Parameters
    ----------
    profileVectors : dict
        Maps each author to its word vectors (object with a ``vocab`` mapping
        that supports ``vectors[word]``).
    doc : same kind of object
        Word vectors of the document.

    Returns
    -------
    list of dict
        One entry per author with the keys ``'candidate'``, ``'jaccard'``
        (vocabulary overlap), ``'lenIntersect'``, ``'lenUnion'``, ``'lenMax'``,
        ``'distanceVector'`` (sum, over the shared words, of the cosine
        between the document vector and the author vector; despite the name,
        larger means more similar) and ``'distanceVectorNorm'``
        (``'distanceVector'`` min-max scaled over the authors and then passed
        through a softmax). An empty ``profileVectors`` gives an empty list.
    """
    vocabDoc = set(doc.vocab.keys())

    metrics = []

    for author in profileVectors:
        authorWordVectors = profileVectors[author]
        authorVocab = set(authorWordVectors.vocab.keys())
        intersect = vocabDoc & authorVocab
        union = len(vocabDoc | authorVocab)
        # two empty vocabularies share nothing: report 0 instead of dividing by zero
        jaccard = 1.0 * len(intersect) / union if union else 0.0

        cosine = [
            simpleCosine(doc[word], authorWordVectors[word])
            for word in intersect
        ]

        metrics.append({
            'candidate':author,
            'jaccard'  :jaccard,
            'lenIntersect':len(intersect),
            'lenUnion'    :union,
            'lenMax': max(len(authorVocab), len(vocabDoc)),
            'distanceVector'    :np.sum(cosine)
        })

    if not metrics:
        return metrics

    scores = np.array([c['distanceVector'] for c in metrics], dtype=float)
    # min-max scaling; when every author has the same score the range is zero,
    # so all scaled scores are set to 0 (uniform softmax) instead of NaN
    spread = np.max(scores) - np.min(scores)
    if spread > 0:
        scores = (scores - np.min(scores)) / spread
    else:
        scores = np.zeros(len(scores))
    # softmax over the scaled scores
    scores = np.exp(scores) / np.sum(np.exp(scores))

    #appending normalized sum of distance
    for i, c in enumerate(metrics):
        c.update({'distanceVectorNorm': scores[i]})

    return metrics

##### testing one document

In [None]:
train_doc_index = 4;
print ("Expected answer %s" % train_labels[train_doc_index])
metrics = compare(profileVectors, documentVectors[train_doc_index]);

In [None]:
pd.DataFrame(metrics).sort_values(by='distanceVectorNorm', ascending=False).head()

In [None]:
def predict(profileVectors, documentVectors):
    """Attribute every document to its best-scoring author profile.

    Parameters
    ----------
    profileVectors : dict
        Author profiles, passed unchanged to ``compare``.
    documentVectors : iterable
        Word vectors of the documents to attribute.

    Returns
    -------
    list
        For each document, the ``'candidate'`` of the ``compare`` entry with
        the highest ``'distanceVectorNorm'``.
    """
    def best_candidate(document):
        # stable descending sort: among equal scores the first entry wins
        ranked = sorted(
            compare(profileVectors, document),
            key=lambda entry: entry['distanceVectorNorm'],
            reverse=True
        )
        return ranked[0]['candidate']

    return [best_candidate(document) for document in documentVectors]

In [None]:
pred =predict(profileVectors, documentVectors)
zip(train_labels, pred)

### Teste

In [None]:
documentVectorsTest =[doc2vectors(doc) for doc in test_docs];

analyzing one incorrect answer

In [None]:
test_index = 74
print ("Expected answer (candidate with highest score) %s"% test_labels[test_index])
df= pd.DataFrame(compare(profileVectors, documentVectorsTest[test_index]))\
    .sort_values(by='distanceVectorNorm',ascending=False)\
    .head(35) \
    .reset_index();
df

In [None]:
# Rank the author profiles for every test document and record where the true
# author ended up.
#   prob                       -> one entry per test document: the metrics of
#                                 the true author only
#   documentVectorsTestMetrics -> a copy of every (document, candidate) entry,
#                                 with an extra 'correct' flag
prob = []
documentVectorsTestMetrics = [];
for t,truth,instance in zip(documentVectorsTest,test_labels, test_filename):
    # i is the 0-based position of candidate j after sorting by descending 'distanceVector'
    for i,j in enumerate(sorted(compare(profileVectors, t), key=lambda x: x['distanceVector'], reverse=True)):
        # 'distanceToTruth' stores that position for every candidate; for the
        # true author's entry it is therefore the rank of the truth (0 = first)
        j.update({"truth":truth, 'instance':instance, 'distanceToTruth':i});
        
        jcopy = j.copy();
        jcopy.update({'correct':truth == j['candidate']})
        documentVectorsTestMetrics.append(jcopy)
        
        # only the entry of the true author is kept in prob
        if truth != j['candidate']:
            continue;
        
        prob.append(j);

In [None]:
df = pd.DataFrame(prob)[
    ['instance','distanceToTruth','truth','distanceVector','distanceVectorNorm','jaccard','lenIntersect','lenMax','lenUnion']
].sort_values(by=['instance','truth','distanceVector'], ascending=False)
df

In [None]:
df = pd.DataFrame(prob).sort_values(by=['distanceVector','jaccard'], ascending=False)
df.plot.scatter(x='distanceToTruth', y='distanceVector')

In [None]:
df = pd.DataFrame(prob)
#df.distanceToTruth = np.log(df.distanceToTruth+1)
#df.distanceToTruth = df.distanceToTruth/df.distanceToTruth.max()
df.plot.scatter(
    x='distanceVectorNorm',
    y='jaccard',c='distanceToTruth',
    #figsize=(20,5),
    #xlim=(0,0.25), ylim=(0,0.25),
    cmap='viridis_r')

In [None]:
df = pd.DataFrame(prob).sort_values(by=['instance','truth','distanceVector'], ascending=False)
df = df.groupby(by='distanceToTruth').size().reset_index();
df.columns=['distanceToTruth', 'counter']
df['cumm'] = df['counter'].cumsum();
df

In [None]:
df.plot.line(x='distanceToTruth',marker='.');

In [None]:
predTest =predict(profileVectors, documentVectorsTest)

In [None]:
# Count correct vs. incorrect test predictions.
# list(...) is needed on Python 3, where zip() returns a one-shot iterator
df = pd.DataFrame(list(zip(test_labels, predTest)), columns=['Truth', 'Pred'])
df['comp'] = df.Truth == df.Pred
df.groupby(by='comp').count()

In [None]:
f1,precision,recall,accuracy =  eval_measures(gt,{k: v for k,v in zip(test_filename, predTest)  })

In [None]:
pd.DataFrame([{
                'macro-f1'       : round(f1,3),
                'macro-precision': round(precision,3),
                'macro-recall'   : round(recall,3),
                'micro-accuracy' : round(accuracy,3)
             }])

In [None]:
df = pd.DataFrame(documentVectorsTestMetrics)
len(df[df.correct & (df.distanceToTruth==0)].truth.unique())

# Abordagem 2

In [None]:
embeddingSize  = 100

In [None]:
class AuthorNgram(object):
    """Re-iterable corpus of n-gram windows with a label token in the middle.

    Every document is split into character n-grams; each yielded example is
    ``window`` consecutive n-grams, the label, and the next ``window``
    n-grams, so the label shares a context with the n-grams of the text.

    Parameters
    ----------
    text : str, unicode, list or tuple
        A single document, or a list/tuple of documents. Any other type
        yields nothing.
    label : str
        Token inserted in the middle of every window.
    vocabulary : iterable, optional
        When given, only n-grams contained in it are kept.
    ngram : tuple of int, optional
        ``(min_size, max_size)`` of the character n-grams (default ``(3, 5)``).
    window : int, optional
        Number of n-grams on each side of the label (default 3).
    """

    # Types treated as one single document; `unicode` only exists on Python 2.
    try:
        _STRING_TYPES = (str, unicode)
    except NameError:
        _STRING_TYPES = (str,)

    def __init__(self, text, label, vocabulary=None, ngram=(3, 5), window=3):
        self.text = text
        self.label = label
        self.ngram_min = ngram[0]
        self.ngram_max = ngram[1]
        self.vocabulary = vocabulary
        self.window = window

    def text2ngrams(self,text):
        """Return the character n-grams of ``text``.

        N-grams are ordered by start position and, for equal positions, by
        increasing size. Every start position that still fits an n-gram of
        the minimum size is visited, so the shorter n-grams at the end of
        the text are included.
        """
        length = len(text)
        vect = [
            text[start:start + size]
            for start in range(length - self.ngram_min + 1)
            for size in range(self.ngram_min, self.ngram_max + 1)
            if start + size <= length
        ]

        allowed = self.vocabulary
        if allowed is None:
            # drop empty strings
            return [word for word in vect if word]
        # a list (e.g. dict.keys() on Python 2) would make every membership
        # test a linear scan, so it is turned into a set once per call
        if not isinstance(allowed, (dict, set, frozenset)):
            allowed = frozenset(allowed)
        return [word for word in vect if word in allowed]

    def textPlusAuthor(self,text,label):
        """Return the list of windows of ``text`` with ``label`` in the middle."""
        vect = self.text2ngrams(text)
        width = self.window
        return [
            vect[t:t + width] + [label] + vect[t + width:t + 2 * width]
            for t in range(len(vect) - 2 * width + 1)
        ]

    def _documents(self):
        """Return the documents held by this corpus as a list."""
        if isinstance(self.text, (list, tuple)):
            return list(self.text)
        if isinstance(self.text, self._STRING_TYPES):
            return [self.text]
        return []

    def __len__(self):
        """Number of windows yielded by one full iteration."""
        total = 0
        for document in self._documents():
            ngram_count = len(self.text2ngrams(document))
            total += max(0, ngram_count - 2 * self.window + 1)
        return total

    def __iter__(self):
        for document in self._documents():
            for vect in self.textPlusAuthor(document, self.label):
                yield vect

In [None]:
for s in AuthorNgram(['a casa caiu'],"_text_"):
    print (s);

In [None]:

for i,s in enumerate(AuthorNgram(train_docs[0],"_text_")):
    print (s);
    if i == 20:
        break;

In [None]:
def getEmbedding(doc, label):
    """Learn an embedding for ``label`` from the n-gram windows of ``doc``.

    A CBOW Word2Vec model is trained on ``AuthorNgram(doc, label)``; the
    vector learned for the label token is returned. The n-gram frequencies
    (``vocabulary``) and the vector size (``embeddingSize``) are read from
    the notebook's global variables at call time.

    Parameters
    ----------
    doc : str, unicode or list
        Document(s) that represent the label.
    label : str
        Token whose embedding is learned.

    Returns
    -------
    The vector stored for ``label`` in the trained model.
    """
    min_count = 2
    corpus = AuthorNgram(doc, label, vocabulary=vocabulary.keys())
    corpus_count = len(corpus)

    # The model vocabulary is built from explicit frequencies, so the label
    # token has to be registered too; otherwise it is ignored during training
    # and looking it up afterwards raises KeyError. It occurs once per window;
    # the floor keeps it above the min_count cut-off.
    frequencies = dict(vocabulary)
    frequencies[label] = max(corpus_count, min_count)

    model = Word2Vec(
        sg=0,
        min_count=min_count,
        size=embeddingSize,
        seed=0
    )
    model.build_vocab_from_freq(frequencies)
    model.train(
        corpus,
        total_examples=corpus_count,
        epochs=10
    )

    return model.wv[label]

In [None]:
authorEmbedding = {
    author:getEmbedding(profile[author], author)
    for author in profile
    
};

In [None]:
# One row per author, in the iteration order of authorEmbedding.
# list(...) is needed on Python 3, where dict.values() is a view and not a sequence
authorVectors = np.vstack(list(authorEmbedding.values()))

In [None]:
trainVectors = [getEmbedding(doc, "_train"+str(i)+"_") for i,doc in enumerate(train_docs)]
testVectors  = [getEmbedding(doc, "_test"+str(i)+"_") for i,doc in enumerate(test_docs)]

In [None]:
trainVectors =np.vstack(trainVectors)
testVectors  =np.vstack(testVectors)

In [None]:
trainPred = cosine_similarity(normalize(trainVectors,'l2'), normalize(authorVectors,'l2'))
testPred  = cosine_similarity(normalize(testVectors,'l2'), normalize(authorVectors,'l2'))

In [None]:
def cosineToLabels(matrix, labels):
    """Return, for every row, the label of the column with the highest score.

    Parameters
    ----------
    matrix : array-like, shape (n_rows, n_labels)
        Similarity of every row item to every label.
    labels : iterable
        Label of each column of ``matrix``, in column order (a list or the
        keys view of a dict).

    Returns
    -------
    list
        One label per row; ties go to the first column with the highest score.
    """
    labels = np.array(list(labels))
    # argmax gives the column of the best score; comparing argsort's output
    # against the last column index would give that column's rank position
    best_columns = np.argmax(np.asarray(matrix), axis=1)
    return labels[best_columns].tolist()

In [None]:
# Column labels in the iteration order of authorEmbedding. list(...) turns the
# Python 3 keys view into a real sequence before it is converted to an array.
trainPredLabels = cosineToLabels(trainPred, list(authorEmbedding.keys()))
testPredLabels = cosineToLabels(testPred, list(authorEmbedding.keys()))

In [None]:
np.sum(np.array(train_labels) == np.array(trainPredLabels))

In [None]:
trainPred.min(axis=1, keepdims=True)

In [None]:
zip(train_labels, trainPredLabels)

# Abordagem 2 - por LSA

In [None]:
from sklearn.decomposition import TruncatedSVD;
from scipy.sparse.linalg import svds

In [None]:
class LSAWordVec(object):
    """Placeholder for an LSA-based word-vector model (not implemented yet)."""

    def __init__(self):
        # No state yet; the body was missing, which made the cell a SyntaxError.
        pass

In [None]:
lsaVectorizer = TfidfVectorizer(analyzer="char", ngram_range=ngram, min_df=0.4, max_df=1.0, lowercase=False)
profileLSAVectors = lsaVectorizer.fit_transform([profile[author] for author in profile]);

In [None]:
profileLabels =[author for author in profile]

In [None]:
np.shape(profileLSAVectors)

In [None]:
svd = TruncatedSVD(n_components=20, random_state=42, n_iter=10)

In [None]:
profileLSAVectors.shape

In [None]:
svdProfile = svd.fit_transform(profileLSAVectors)

In [None]:
np.shape(svdProfile)

In [None]:
documentVectorsLSA = svd.transform(lsaVectorizer.transform(train_docs));
documentVectorsTest = svd.transform(lsaVectorizer.transform(test_docs));

In [None]:
def compareLSA(svdProfile, doc):
    """Unfinished LSA counterpart of ``compare``.

    NOTE(review): this function cannot run as written:
      * ``svdProfile`` is never used; the loop reads the global
        ``profileVectors`` instead.
      * ``vocabDoc`` is not defined in this function (it is only a local
        variable of ``compare``), so the ``union`` line raises NameError
        unless a global of that name exists.
      * ``authorVocab`` is the result of ``.sum()`` and ``intersect`` is the
        result of an element-wise comparison, yet both are later passed to
        ``len()`` / iterated as if they were sets of words.
    The intended semantics need to be confirmed with the author before this
    is fixed. The notebook's predictions go through ``predictLSA`` instead.

    Returns the list of per-author metric dicts, each extended with
    ``'distanceVectorNorm'``.
    """
    metrics = [];
    
    # NOTE(review): iterates the global profileVectors, not the svdProfile argument
    for author in profileVectors:
        authorVocab = (profileVectors[author] >0).sum();
        # `&` binds tighter than `>`, so this is ((profile > 0) & (doc > 0)) > 0
        intersect = (profileVectors[author] >0) & (doc > 0)>0;
        # NOTE(review): vocabDoc is undefined here
        union = len(vocabDoc | authorVocab);
        jaccard = 1.0*len(intersect) / union;
        
        cosine = [
            simpleCosine(doc[word],profileVectors[author][word])
            for word in intersect
        ];
            
        metrics.append({
            'candidate':author,
            'jaccard'  :jaccard,
            'lenIntersect':len(intersect),
            'lenUnion'    :union,
            'lenMax': max(len(authorVocab), len(vocabDoc)),
            'distanceVector'    :np.sum(cosine)            
        })
    # collect the per-author scores
    cosine = np.array([c['distanceVector'] for c in metrics ]);
    # min-max scaling over the authors (divides by zero when all scores are equal)
    cosine = (cosine - np.min(cosine))/(np.max(cosine) - np.min(cosine));
    # softmax over the scaled scores
    cosine = np.exp(cosine)/np.sum(np.exp(cosine));
    
    #appending normalized sum of distance
    for i,c in enumerate(metrics):
        c.update({'distanceVectorNorm': cosine[i]})
    
    return metrics; 

In [None]:
def predictLSA(svdProfile, documentVectors):
    """Attribute every document to the most similar author profile in LSA space.

    Parameters
    ----------
    svdProfile : array-like, shape (n_authors, n_components)
        LSA vector of every author profile. Row ``k`` is assumed to belong
        to ``profileLabels[k]`` (global list of author labels); confirm that
        both were built from the same iteration over ``profile``.
    documentVectors : array-like, shape (n_documents, n_components)
        LSA vectors of the documents to attribute.

    Returns
    -------
    list
        For each document, the label of the profile with the highest cosine
        similarity; ties go to the first such profile.
    """
    labels = np.array(profileLabels)
    # similarities of exactly the documents passed in (not of a global matrix)
    cosines = cosine_similarity(documentVectors, svdProfile)
    return labels[np.argmax(cosines, axis=1)].tolist()

In [None]:
svdTrainPred = cosine_similarity(documentVectorsLSA,svdProfile)

In [None]:
predictLSA(svdProfile,documentVectorsLSA )

In [None]:
np.reshape(np.round(np.random.rand(30)*100), (10,3))

In [None]:
# Predict the author of every *test* document: cosine similarity between the
# LSA vectors of the test documents and the author profiles, then the label of
# the best-scoring profile. (svdTrainPred holds the similarities of the
# training documents and must not be paired with the test file names.)
svdTestPred = cosine_similarity(documentVectorsTest, svdProfile)
predTestLSA = [profileLabels[best] for best in np.argmax(svdTestPred, axis=1)]

In [None]:
f1,precision,recall,accuracy =  eval_measures(gt,{k: v for k,v in zip(test_filename, predTestLSA)  })
pd.DataFrame([{
                'macro-f1'       : round(f1,3),
                'macro-precision': round(precision,3),
                'macro-recall'   : round(recall,3),
                'micro-accuracy' : round(accuracy,3)
             }])

In [None]:
svdTrainPred[1]

In [None]:
np.argsort(svdTrainPred, axis=1)[1]

In [None]:
t = [list('casa sapato'),list('terra casa, sapato')]