# In [31]:
import numpy as np
import pandas as pd
import time
from nltk import *
from stop_words import get_stop_words

# import string
from sklearn import feature_extraction

# In [32]:
class spanish_tweet_tokenizer_with_date:
    """Tokenize Spanish tweets while keeping every token tied to its tweet date.

    Receives a time series (or a DataFrame) whose index holds the tweet dates,
    either in Twitter format (``'%a %b %d %H:%M:%S +0000 %Y'``) or in anything
    ``pd.to_datetime`` understands, and whose values are the tweet texts.

    The constructor stores its results as plain attributes (not methods):

    * ``date_corrected``       -- original tweet texts indexed by parsed date.
    * ``tokened_tweets_df``    -- DataFrame with one ``'tokens'`` column, one
      row per token, indexed by the date of the tweet the token came from.
    * ``tokened_tweets_ts``    -- ``tokened_tweets_df.stack()``.
    * ``tokened_tweets``       -- ``tokened_tweets_df.values``.
    * ``n_tokens``             -- number of tokens.
    * ``n_tweets``             -- number of tweets.
    * ``tokened_tweets_no_sw`` -- ``tokened_tweets_ts`` without stop words.
    * ``n_tokens_no_sw``       -- number of tokens left without stop words.
    """

    # Date layout tried first on the index, and the layout it is rewritten to
    # before being handed to pandas.
    _TWITTER_DATE_FORMAT = '%a %b %d %H:%M:%S +0000 %Y'
    _PLAIN_DATE_FORMAT = '%Y-%m-%d %H:%M:%S'

    # Punctuation removed from the start / end of each token. Escapes are used
    # for the inverted question (\xbf) and exclamation (\xa1) marks so the
    # values do not depend on the encoding of the source file.
    _LEADING_PUNCT = u'\xbf\xa1,:.-"('
    _TRAILING_PUNCT = u'?!,:.-")'

    # Entries added on top of the 'spanish' stop-word list ('si' and its
    # accented form, plus blank and dash tokens).
    _EXTRA_STOP_WORDS = (u' ', u'', u'-', u'si', u's\xed')

    def __init__(self, tweets):
        """Parse the dates, tokenize every tweet and drop stop words.

        Args:
            tweets: Series of tweet texts indexed by date. A DataFrame is also
                accepted, in which case one column is taken as the text (see
                the note below). The data is assumed to be clean: no NaNs and
                nothing but dates in the index.
        """
        if isinstance(tweets, pd.DataFrame):
            # NOTE(review): the original fell back to ``iloc[:, 1]``; with the
            # date in the index a one-column frame keeps the text in column 0.
            # Assumes the text is column 1 when there are several -- confirm.
            tweets = tweets.iloc[:, min(1, tweets.shape[1] - 1)]

        dates = self._parse_dates(tweets.index)
        self.date_corrected = pd.Series(tweets.values, index=dates)

        # Decode first and lowercase afterwards, so accented capitals are
        # lowercased too (lowercasing raw bytes leaves them untouched).
        texts = [self._to_text(raw).lower() for raw in tweets.values]

        # Splitting on plain spaces avoids trouble with accented characters.
        tokenizer = tokenize.simple.SpaceTokenizer()
        tokens = []
        token_dates = []
        for stamp, text in zip(dates, texts):
            for piece in tokenizer.tokenize(text):
                tokens.append(self._clean_token(piece))
                token_dates.append(stamp)

        frame = pd.DataFrame({'tokens': tokens},
                             index=pd.to_datetime(token_dates))
        self.tokened_tweets_df = frame
        self.tokened_tweets_ts = frame.stack()
        self.tokened_tweets = frame.values
        self.n_tokens = frame.shape[0]
        self.n_tweets = tweets.shape[0]

        # The stop-word-free view is always computed. The library's list is
        # copied so that whatever object it returned is never mutated.
        stop_words_sp = list(get_stop_words('spanish'))
        stop_words_sp.extend(self._EXTRA_STOP_WORDS)
        keep = ~self.tokened_tweets_ts.isin(stop_words_sp)
        frame_no_sw = self.tokened_tweets_ts[keep.values]

        self.tokened_tweets_no_sw = frame_no_sw
        self.n_tokens_no_sw = frame_no_sw.shape[0]

    @classmethod
    def _parse_dates(cls, index):
        """Return ``index`` as datetimes, trying the Twitter layout first."""
        try:
            parsed = [time.strptime(raw, cls._TWITTER_DATE_FORMAT)
                      for raw in index]
        except (TypeError, ValueError):
            # Not Twitter-formatted strings (e.g. already datetimes).
            return pd.to_datetime(index)
        return pd.to_datetime([time.strftime(cls._PLAIN_DATE_FORMAT, stamp)
                               for stamp in parsed])

    @staticmethod
    def _to_text(value):
        """Return ``value`` as unicode text, decoding UTF-8 bytes if needed."""
        if isinstance(value, bytes):
            return value.decode('utf-8')
        text_type = type(u'')
        return value if isinstance(value, text_type) else text_type(value)

    @classmethod
    def _clean_token(cls, token):
        """Strip leading and trailing punctuation from ``token``.

        Tokens of length 0 or 1 are returned untouched, so a lone ``'-'``
        stays a token (it is later treated as a stop word).
        """
        if len(token) <= 1:
            return token
        return token.lstrip(cls._LEADING_PUNCT).rstrip(cls._TRAILING_PUNCT)

    @staticmethod
    def _ratio(numerator, denominator):
        """Return ``numerator / denominator`` as float, 0.0 if denominator is 0."""
        return float(numerator) / denominator if denominator else 0.0

    @staticmethod
    def _any_hit(series, patterns, matcher_name):
        """Boolean mask of the entries of ``series`` hit by any pattern.

        ``matcher_name`` names the ``Series.str`` method to use (``'match'``
        or ``'contains'``); both interpret the patterns as regular expressions.
        """
        hits = np.zeros(len(series), dtype=bool)
        if len(series) == 0:
            return hits
        matcher = getattr(series.str, matcher_name)
        for pattern in patterns:
            hits |= matcher(pattern, na=False).values.astype(bool)
        return hits

    def lexical_diversity(self):
        """Return distinct tokens / total tokens (0.0 when there are none)."""
        distinct_words = len(set(self.tokened_tweets_ts))
        total_words = self.tokened_tweets.shape[0]
        return self._ratio(distinct_words, total_words)

    def avg_words_p_tweet(self):
        """Return the average number of tokens per tweet (0.0 if no tweets)."""
        return self._ratio(self.n_tokens, self.n_tweets)

    def lexical_diversity_no_sw(self):
        """Return ``lexical_diversity`` computed without stop words."""
        distinct_words = len(set(self.tokened_tweets_no_sw))
        return self._ratio(distinct_words, self.n_tokens_no_sw)

    def avg_words_p_tweet_no_sw(self):
        """Return the average number of non-stop-word tokens per tweet."""
        return self._ratio(self.n_tokens_no_sw, self.n_tweets)

    def lookfor(self, words_to_look, include_sw):
        """Return the tokens that match any of ``words_to_look``.

        Args:
            words_to_look: iterable of regular expressions; each is matched
                against the beginning of every token (``Series.str.match``).
            include_sw: when true, search the full token series (stop words
                included); otherwise search the series without stop words.
                The original implementation had these two cases swapped.

        Returns:
            The matching entries of the chosen token series.
        """
        if include_sw:
            frame = self.tokened_tweets_ts
        else:
            frame = self.tokened_tweets_no_sw
        return frame[self._any_hit(frame, words_to_look, 'match')]

    def lookfor_intweets(self, words_to_look):
        """Return the lowercased tweets that contain any of the given words.

        Each word is treated as a regular expression and searched anywhere in
        the lowercased tweet text (``Series.str.contains``).
        """
        frame = self.date_corrected.str.lower()
        return frame[self._any_hit(frame, words_to_look, 'contains')]