In [2]:
import pandas as pd

general_tweets_corpus_train = pd.read_csv('general-tweets-train-tagged.csv', encoding='utf-8')
social_tweets_corpus_train = pd.read_csv('socialtv-tweets-train-tagged.csv', encoding='utf-8')
stompol_tweets_train_tagged = pd.read_csv('stompol-tweets-train-tagged.csv', encoding='utf-8')
tweets_corpus = pd.concat([
        general_tweets_corpus_train,
        social_tweets_corpus_train,
        stompol_tweets_train_tagged,
        #tweet_testeo,
    ])
tweets_corpus = tweets_corpus.drop('agreement',axis=1) #Eliminar columna agreement
tweets_corpus.sample(5)

Unnamed: 0,content,polarity
5805,"""Yo no soy la opinión pública ni la opinión pu...",P+
2778,Esta tarde solidaridad y apoyo para el futuro ...,P+
6177,Y este domingo abandono la veintena... 30 Taco...,P
1451,Que jefe es Ramos .,P
4168,A las 17.45 viene al estudio de #JELO Alicia ...,P+


In [3]:
tweets_corpus['content'] = tweets_corpus.content.fillna(' ') #Eliminar valores nan de la columna tweet
tweets_corpus = tweets_corpus[-tweets_corpus.content.str.contains('http://')]
tweets_corpus = tweets_corpus[-tweets_corpus.polarity.str.contains('NONE')]

In [4]:
#Funcion Limpieza de mensajes

######Limpieza General############
import re
from unicodedata import normalize

# -> NFD y eliminar diacríticos
#x = tweets_corpus.content.get_values()
#longitud=len(tweets_corpus.content.get_values())


def limpieza(x):
    for i in range(0,len(x)):
        x[i] = str(x[i])
        x[i]= re.sub(
                r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+", r"\1", 
                normalize( "NFD",x[i]), 0, re.I
            )
        x[i]=re.sub('(f|ht)tp(s?)://(.*)[.][a-z]+',' ',x[i])
        x[i]=re.sub('(f|ht)tp(s?)://(.*)[.][a-z]+',' ',x[i])
        x[i]=re.sub('[a-zA-Z0-9.?{}]+@\w+\.\w+.\w*','',x[i])
        x[i]=re.sub('\[[a-zA-Z0-9\,\. ]+\]','',x[i]) 
        x[i]=re.sub('\([a-zA-Z0-9\,\.\- ]+\)',' ',x[i])
        x[i]=re.sub('((et al\.)|(i\.i\.d\.)|(i\.e\.)|\-|\'|\’|\`)','',x[i])
        x[i]=re.sub('[^a-zA-Z_á\éíóúà\èìòùäëïöü\s]','',x[i])    
        x[i]=re.sub(' +',' ',x[i])    
        x[i]=re.sub('(a-z|A-Z){1,1}','',x[i].lower()) #Convertir a minuscula y quitar caracteres
    return x


In [5]:
tweets_corpus.content=limpieza(tweets_corpus.content.get_values())
tweets_corpus.head(5)

Unnamed: 0,content,polarity
1,pauladelasheras no te libraras de ayudar menos...,NEU
2,marodriguezb gracias mar,P
3,off pensando en el regalito sinde la que se va...,N+
4,conozco a alguien q es adicto al drama ja ja j...,P+
6,toca crackoviadetv grabacion dl especial navid...,P+


In [6]:
#Tokenizado y stemmer
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('spanish')    

def tokenizar(x):
    for i in range(0,len(x)): 
        #Tokenizar
        x[i] = x[i].split(' ')    
    return x

def stemming (x):
    for i in range(0,len(x)):   
        #Stemm
        x[i]=[stemmer.stem(elemento) for elemento in x[i]] 
    return x

def reconstruir_texto (x):
    for i in range(0,len(x)):   
        #Recuperacion de texto con ajustes
        x[i] = ' '.join(x[i])
    return x

        

In [7]:
tokenizado = tokenizar(tweets_corpus.content.get_values())
stemmizado = stemming(tokenizado)
tweets_corpus.content = reconstruir_texto(stemmizado)
tweets_corpus.head(5)

Unnamed: 0,content,polarity
1,pauladelasher no te libr de ayud men bes y graci,NEU
2,marodriguezb graci mar,P
3,off pens en el regalit sind la que se va de la...,N+
4,conozc a algui q es adict al dram ja ja ja te ...,P+
6,toc crackoviadetv grabacion dl especial navide...,P+


In [8]:
#Asginando clases binarias
tweets_corpus = tweets_corpus[tweets_corpus.polarity != 'NEU']
tweets_corpus['polarity_bin'] = 0
tweets_corpus.polarity_bin[tweets_corpus.polarity.isin(['P', 'P+'])] = 1
tweets_corpus.polarity_bin.value_counts(normalize=True)
tweets_corpus.head(5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,content,polarity,polarity_bin
2,marodriguezb graci mar,P,1
3,off pens en el regalit sind la que se va de la...,N+,0
4,conozc a algui q es adict al dram ja ja ja te ...,P+,1
6,toc crackoviadetv grabacion dl especial navide...,P+,1
8,buen dia tod lo primer mand un abraz grand a m...,P+,1


In [9]:
#Validacion cruzada
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tweets_corpus, tweets_corpus.polarity_bin, test_size=0.01, random_state=0)

In [10]:
##Máquinas de soporte vectorial
import time
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
spanish_stopwords = stopwords.words('spanish')
from sklearn.externals import joblib


vectorizer = TfidfVectorizer(stop_words=spanish_stopwords)


train_vectors = vectorizer.fit_transform(X_train['content'])
# Guardar vectorizado
joblib.dump(vectorizer, 'vectorizado_binario.pkl')
#train_vectors = vectorizer.fit_transform(tweets_corpus['content']) #Entrenar con todos los datos
test_vectors = vectorizer.transform(X_test['content'])

# Perform classification with SVM, kernel=linear
classifier_SVM = svm.SVC(kernel='rbf', gamma =0.18,decision_function_shape='ovr',C=1.8)
t0 = time.time()
classifier_SVM.fit(train_vectors, y_train)
t1 = time.time()
prediction_linear = classifier_SVM.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1
# results
report = classification_report(y_test, prediction_linear, output_dict=True)
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
print('positive: ', report['1'])
print('negative: ', report['0'])

Training time: 2.475936s; Prediction time: 0.018984s
positive:  {'precision': 0.7241379310344828, 'recall': 0.8076923076923077, 'f1-score': 0.7636363636363636, 'support': 26}
negative:  {'precision': 0.782608695652174, 'recall': 0.6923076923076923, 'f1-score': 0.7346938775510203, 'support': 26}


In [11]:
##Guardar modelo entrenado
from sklearn.externals import joblib
# Output a pickle file for the model
joblib.dump(classifier_SVM, 'SVM_entrenado_sentimientos_binario.pkl') 

['SVM_entrenado_sentimientos_binario.pkl']

In [12]:
## Cargar SVM entrenado
from sklearn.externals import joblib
clf_load = joblib.load('SVM_entrenado_sentimientos_binario.pkl') 

In [13]:
##Probar clasificador cargado
t0 = time.time()
clf_load.fit(train_vectors, y_train)
t1 = time.time()
prediction_linear = clf_load.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1
# results
report = classification_report(y_test, prediction_linear, output_dict=True)
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
print('positive: ', report['1'])
print('negative: ', report['0'])

Training time: 2.619898s; Prediction time: 0.018533s
positive:  {'precision': 0.7241379310344828, 'recall': 0.8076923076923077, 'f1-score': 0.7636363636363636, 'support': 26}
negative:  {'precision': 0.782608695652174, 'recall': 0.6923076923076923, 'f1-score': 0.7346938775510203, 'support': 26}


In [14]:
###############KNN#############
from sklearn.neighbors import KNeighborsClassifier

#vectorizer = TfidfVectorizer(stop_words=spanish_stopwords)
#X = vectorizer.fit_transform(tweets_corpus.content)
X = train_vectors
#y_train=test_vectors
# Clustering the document with KNN classifier
modelknn = KNeighborsClassifier(n_neighbors=8)
modelknn.fit(X,y_train)


#Cargar textos para el analisis vector Test

#tweets_test_kNN['tweet'] = tweets_test_kNN['tweet'].fillna(' ')
Test = test_vectors
predicted_labels_knn = modelknn.predict(Test)

#tweets_test_kNN['polarity'] = predicted_labels_knn

# results
report = classification_report(y_test, predicted_labels_knn, output_dict=True)
print('positive: ', report['1'])
print('negative: ', report['0'])

positive:  {'precision': 0.6923076923076923, 'recall': 0.6923076923076923, 'f1-score': 0.6923076923076923, 'support': 26}
negative:  {'precision': 0.6923076923076923, 'recall': 0.6923076923076923, 'f1-score': 0.6923076923076923, 'support': 26}


In [15]:
##Guardar modelo entrenado
from sklearn.externals import joblib
# Output a pickle file for the model
joblib.dump(modelknn, 'KNN_entrenado_sentimientos_binario.pkl') 


['KNN_entrenado_sentimientos_binario.pkl']