En este programa vamos a lanzar todos los modelos estadísticos y luego los vamos a evaluar conjuntamente.
Vamos a lanzar varios modelos:
    - Maximum Entropy
    - SVM
    - Logistic Regression
    - MultinomialNB
    - SGDClassifier
    - BernoulliNB
    - Naive Bayes

Y dentro de estos modelos, vamos a lanzarlos con la totalidad de los datos y luego con validación cruzada de varios tamaños para ver cuál es el modelo óptimo.
    
    

In [4]:
#Realizamos las importaciones de los paquetes que vamos a utilizar
import collections
import nltk.classify.util,nltk.metrics
from nltk.classify import NaiveBayesClassifier, MaxentClassifier, SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
import csv
from nltk import word_tokenize
from sklearn.svm import LinearSVC, SVC, NuSVR, OneClassSVM
from sklearn.linear_model import LogisticRegression, SGDClassifier
import random
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures



In [5]:
# Read the positive examples: the first column of every CSV row is one text.
import warnings

posdata = []
try:
    with open('DATA_SALIDA/positivos.txt', 'rb') as myfile:
        reader = csv.reader(myfile, delimiter=',')
        for val in reader:
            # A blank line is returned as an empty row; skip it instead of
            # letting an IndexError abort the rest of the read.
            if val:
                posdata.append(val[0])
except (IOError, csv.Error) as err:
    # Still best effort: keep whatever was read so far, but report the
    # problem instead of hiding it.
    warnings.warn('Could not read DATA_SALIDA/positivos.txt: %s' % err)


In [6]:
# Read the negative examples: the first column of every CSV row is one text.
import warnings

negdata = []
try:
    with open('DATA_SALIDA/negativos.csv', 'rb') as myfile:
        reader = csv.reader(myfile, delimiter=',')
        for val in reader:
            # A blank line is returned as an empty row; skip it instead of
            # letting an IndexError abort the rest of the read.
            if val:
                negdata.append(val[0])
except (IOError, csv.Error) as err:
    # Still best effort: keep whatever was read so far, but report the
    # problem instead of hiding it.
    warnings.warn('Could not read DATA_SALIDA/negativos.csv: %s' % err)

In [None]:
# Helper functions used by the cells below

# Split every text into lower-cased, whitespace-separated tokens
def word_split(data):
    """Return one list of lower-cased tokens per item of *data*.

    Each item is split on whitespace with ``str.split()``; the order of the
    items and of the tokens inside each item is preserved.
    """
    return [[token.lower() for token in text.split()] for text in data]

def word_feats(words):
    """Return a feature dict mapping every item of *words* to ``True``."""
    return dict.fromkeys(words, True)

# Build the set of Spanish stopwords from the NLTK stopword corpus
stopset = set(stopwords.words('spanish'))

# Build a feature dict from the words of the text, leaving out the stopwords
def stopword_filtered_word_feats(words):
    """Return ``{word: True}`` for every item of *words* not in ``stopset``."""
    feats = {}
    for token in words:
        if token in stopset:
            continue
        feats[token] = True
    return feats
    
# Build features from the words of the text plus its best-scoring bigrams
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Return a feature dict with every word and the top-*n* bigrams.

    A ``BigramCollocationFinder`` is built from *words* and its ``nbest``
    method is asked for *n* bigrams ranked by *score_fn*. Every word and
    every selected bigram becomes a key mapped to ``True``.
    """
    finder = BigramCollocationFinder.from_words(words)
    top_bigrams = finder.nbest(score_fn, n)
    return dict.fromkeys(itertools.chain(words, top_bigrams), True)

# Build word + bigram features from the text, leaving out the stopwords
def bigram_word_feats_stopwords(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Return word and top-*n* bigram features, skipping items in ``stopset``.

    The bigram finder is built from the unfiltered *words*; the stopword
    test is applied afterwards to each candidate feature.
    """
    finder = BigramCollocationFinder.from_words(words)
    top_bigrams = finder.nbest(score_fn, n)

    feats = {}
    for ngram in itertools.chain(words, top_bigrams):
        # NOTE(review): stopset holds single words, so this test presumably
        # never removes a bigram, even one that contains a stopword.
        # Confirm this is the intended behaviour.
        if ngram in stopset:
            continue
        feats[ngram] = True
    return feats
 

In [32]:
# Calculamos distintos parametros para los modelos

def _score_or_zero(score):
    """Return *score*, or 0.0 when NLTK reports the metric as undefined (None)."""
    return 0.0 if score is None else score


def evaluate_classifier(featx):
    """Train every model on a fixed 75/25 split and print its test metrics.

    The module-level ``negdata`` and ``posdata`` texts are tokenised with
    ``word_split`` and turned into labelled feature dicts with *featx*. The
    first three quarters of each class form the training set and the rest
    the test set. For each of the seven classifiers the function prints the
    accuracy and the pos/neg average of precision, recall and F-measure.

    Args:
        featx: callable that maps a list of tokens to a feature dict
            (for example ``bigram_word_feats``).

    Returns:
        None. The results are printed.
    """
    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]

    # Floor division keeps the cut-offs usable as slice indices even when
    # true division is in effect.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4

    # Train / test split
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    # Seven models are evaluated
    lista_modelos = ['nb', 'bnb', 'mnb', 'maxent', 'svm', 'glm', 'sgd']

    for cl in lista_modelos:
        if cl == 'maxent':
            classifierName = 'Maximum Entropy'
            classifier = MaxentClassifier.train(
                trainfeats, 'GIS', trace=0, encoding=None, labels=None,
                gaussian_prior_sigma=0, max_iter=1)
        elif cl == 'svm':
            classifierName = 'SVM'
            classifier = SklearnClassifier(LinearSVC(), sparse=False)
            classifier.train(trainfeats)
        elif cl == 'glm':
            classifierName = 'Logistic Regresion'
            classifier = SklearnClassifier(LogisticRegression())
            classifier.train(trainfeats)
        elif cl == 'mnb':
            classifierName = 'MultinomialNB'
            classifier = SklearnClassifier(MultinomialNB())
            classifier.train(trainfeats)
        elif cl == 'sgd':
            classifierName = 'SGDClassifier'
            # NOTE(review): `n_iter` and `max_iter=None` appear to be accepted
            # only by older scikit-learn releases -- confirm against the
            # installed version before upgrading.
            classifier = SklearnClassifier(SGDClassifier(
                loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15,
                fit_intercept=True, max_iter=None, tol=None, shuffle=True,
                verbose=0, epsilon=0.1, n_jobs=1, random_state=None,
                learning_rate='optimal', eta0=0.0, power_t=0.5,
                class_weight=None, warm_start=False, average=False,
                n_iter=None))
            classifier.train(trainfeats)
        elif cl == 'bnb':
            classifierName = 'BernoulliNB'
            classifier = SklearnClassifier(BernoulliNB())
            classifier.train(trainfeats)
        else:
            classifierName = 'Naive Bayes'
            classifier = NaiveBayesClassifier.train(trainfeats)

        # Index sets per label: refsets holds the gold labels, testsets the
        # labels predicted by the classifier.
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        for idx, (feats, label) in enumerate(testfeats):
            refsets[label].add(idx)
            observed = classifier.classify(feats)
            testsets[observed].add(idx)

        # Statistics to show: accuracy, precision, recall and f-measure.
        # A metric that NLTK cannot compute (None) counts as 0.0 so that the
        # averages below never fail on a class with no items.
        accuracy = nltk.classify.util.accuracy(classifier, testfeats)
        pos_precision = _score_or_zero(nltk.precision(refsets['pos'], testsets['pos']))
        pos_recall = _score_or_zero(nltk.recall(refsets['pos'], testsets['pos']))
        pos_fmeasure = _score_or_zero(nltk.f_measure(refsets['pos'], testsets['pos']))
        neg_precision = _score_or_zero(nltk.precision(refsets['neg'], testsets['neg']))
        neg_recall = _score_or_zero(nltk.recall(refsets['neg'], testsets['neg']))
        neg_fmeasure = _score_or_zero(nltk.f_measure(refsets['neg'], testsets['neg']))

        # Show the results (single-argument print works as statement and
        # as function).
        print('')
        print('---------------------------------------')
        print('RESULTADO INDIVIDUAL ' + '(' + classifierName + ')')
        print('---------------------------------------')
        print('accuracy: %s' % accuracy)
        print('precision: %s' % ((pos_precision + neg_precision) / 2))
        print('recall: %s' % ((pos_recall + neg_recall) / 2))
        print('f-measure: %s' % ((pos_fmeasure + neg_fmeasure) / 2))


In [7]:
## Los mismos parametros pero con validacion cruzada de N partes
def evaluate_classifier_cross_val(featx, npartes):
    """Print *npartes*-fold cross-validated metrics for every model.

    The module-level ``negdata`` and ``posdata`` texts are tokenised with
    ``word_split``, turned into labelled feature dicts with *featx*, pooled
    and shuffled. The pool is cut into *npartes* equally sized folds (any
    remainder is only ever used for training). Each classifier is trained
    on all folds but one and scored on the held-out fold; the per-fold
    scores are averaged and printed.

    Args:
        featx: callable that maps a list of tokens to a feature dict
            (for example ``bigram_word_feats``).
        npartes: number of folds.

    Returns:
        None. The results are printed.

    Raises:
        ValueError: if *npartes* is smaller than 2 or larger than the
            number of available examples.
    """
    n = npartes
    if n < 2:
        raise ValueError('npartes must be at least 2, got %r' % (npartes,))

    negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
    posfeats = [(featx(f), 'pos') for f in word_split(posdata)]

    # Pool both classes and shuffle so every fold mixes pos and neg examples
    allfeats = negfeats + posfeats
    random.shuffle(allfeats)

    # Floor division: the fold size is used as a slice index
    subset_size = len(allfeats) // n
    if subset_size == 0:
        raise ValueError('npartes (%r) exceeds the number of examples (%d)'
                         % (npartes, len(allfeats)))

    lista_modelos = ['nb', 'bnb', 'mnb', 'maxent', 'svm', 'glm', 'sgd']
    metrics = (('precision', nltk.precision),
               ('recall', nltk.recall),
               ('fmeasure', nltk.f_measure))

    for cl in lista_modelos:
        # Per-fold results, filled in once per round of the cross-validation
        accuracy = []
        scores = collections.defaultdict(list)  # (metric, label) -> per-fold values

        for fold in range(n):
            start = fold * subset_size
            stop = start + subset_size
            testing_this_round = allfeats[start:stop]
            training_this_round = allfeats[:start] + allfeats[stop:]

            # Train on the other folds only: fitting on the whole pool would
            # leak the held-out fold into training and inflate every score.
            if cl == 'maxent':
                classifierName = 'Maximum Entropy'
                classifier = MaxentClassifier.train(
                    training_this_round, 'GIS', trace=0, encoding=None,
                    labels=None, gaussian_prior_sigma=0, max_iter=1)
            elif cl == 'svm':
                classifierName = 'SVM'
                classifier = SklearnClassifier(LinearSVC(), sparse=False)
                classifier.train(training_this_round)
            elif cl == 'glm':
                classifierName = 'Logistic Regresion'
                classifier = SklearnClassifier(LogisticRegression())
                classifier.train(training_this_round)
            elif cl == 'sgd':
                classifierName = 'SGDClassifier'
                # NOTE(review): `n_iter` and `max_iter=None` appear to be
                # accepted only by older scikit-learn releases -- confirm
                # against the installed version before upgrading.
                classifier = SklearnClassifier(SGDClassifier(
                    loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15,
                    fit_intercept=True, max_iter=None, tol=None, shuffle=True,
                    verbose=0, epsilon=0.1, n_jobs=1, random_state=None,
                    learning_rate='optimal', eta0=0.0, power_t=0.5,
                    class_weight=None, warm_start=False, average=False,
                    n_iter=None))
                classifier.train(training_this_round)
            elif cl == 'mnb':
                classifierName = 'MultinomialNB'
                classifier = SklearnClassifier(MultinomialNB())
                classifier.train(training_this_round)
            elif cl == 'bnb':
                classifierName = 'BernoulliNB'
                classifier = SklearnClassifier(BernoulliNB())
                classifier.train(training_this_round)
            else:
                classifierName = 'Naive Bayes'
                classifier = NaiveBayesClassifier.train(training_this_round)

            # Index sets per label: refsets holds the gold labels, testsets
            # the labels predicted by the classifier.
            refsets = collections.defaultdict(set)
            testsets = collections.defaultdict(set)
            for idx, (feats, label) in enumerate(testing_this_round):
                refsets[label].add(idx)
                observed = classifier.classify(feats)
                testsets[observed].add(idx)

            accuracy.append(
                nltk.classify.util.accuracy(classifier, testing_this_round))
            for label in ('pos', 'neg'):
                for metric_name, metric in metrics:
                    value = metric(refsets[label], testsets[label])
                    # NLTK returns None for an undefined metric (e.g. no item
                    # predicted as this label); count it as 0.0 so the sums
                    # below never fail.
                    scores[metric_name, label].append(
                        0.0 if value is None else value)

        # Show the results (single-argument print works as statement and
        # as function).
        print('---------------------------------------')
        print('RESULTADO DE VALIDACIÓN CRUZADA CON ' + str(n) + ' partes: ' + '(' + classifierName + ')')
        print('---------------------------------------')
        print('accuracy: %s' % (sum(accuracy) / n))
        print('precision: %s' % ((sum(scores['precision', 'pos']) / n
                                  + sum(scores['precision', 'neg']) / n) / 2))
        print('recall: %s' % ((sum(scores['recall', 'pos']) / n
                               + sum(scores['recall', 'neg']) / n) / 2))
        print('f-measure: %s' % ((sum(scores['fmeasure', 'pos']) / n
                                  + sum(scores['fmeasure', 'neg']) / n) / 2))
        

In [33]:
# Run and evaluate every model using word + bigram features
evaluate_classifier(bigram_word_feats)    

In [None]:
# Run and evaluate every model with cross-validation (10 parts) using word + bigram features
evaluate_classifier_cross_val(bigram_word_feats, 10)

In [34]:
# Run and evaluate every model using the stopword-filtered features
evaluate_classifier(bigram_word_feats_stopwords)

In [None]:
# Run and evaluate every model with cross-validation (10 parts) using the stopword-filtered features
evaluate_classifier_cross_val(bigram_word_feats_stopwords, 10)