<a href="https://colab.research.google.com/github/jmastrianni13/datalab-notebooks/blob/master/data-space/sankofa/dated/twitter_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Basic clean wrapper class

In [0]:
from sklearn.base import BaseEstimator, TransformerMixin

class BasicClean(BaseEstimator, TransformerMixin):
    """Basic text normalisation of a single DataFrame column.

    ``transform`` applies the following steps, in this order, to every
    value of ``X_column``:

    1. fold Unicode to ASCII (NFKD-normalise, drop non-ASCII characters)
    2. lowercase
    3. collapse runs of 3+ identical word characters to a single character
    4. remove hyperlinks
    5. remove ASCII punctuation
    6. remove scikit-learn's English stop words

    Parameters
    ----------
    X_column : label
        Name of the text column to clean. Its values are passed to
        ``unicodedata.normalize`` and ``str.split``, so they must be ``str``;
        missing values are not handled.
    """

    def __init__(self, X_column):
        super().__init__()

        self.X_column = X_column

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``X_column`` cleaned; ``X`` is untouched."""
        import re
        import string
        import unicodedata

        # Public location of the stop-word list. The private module
        # ``sklearn.feature_extraction.stop_words`` was removed in 0.24.
        from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

        X_column = self.X_column

        df = X.copy()

        # Convert unicode to ascii
        df[X_column] = df[X_column].apply(
            lambda x: unicodedata.normalize('NFKD', x).encode('ascii', 'ignore').decode()
        )

        # Convert all text to lowercase
        df[X_column] = df[X_column].str.lower()

        # TODO: correct spelling mistakes

        # Collapse runs of 3+ identical characters to one (ie 'bbbb' -> 'b').
        # regex=True is required: since pandas 2.0 str.replace treats the
        # pattern as a literal string by default, which made this a no-op.
        min_threshold_rep = 3
        df[X_column] = df[X_column].str.replace(
            r'(\w)\1{%d,}' % (min_threshold_rep - 1), r'\1', regex=True
        )

        # Remove noise:
        #    hyperlinks (text is already lowercase, hence only [a-z])
        df[X_column] = df[X_column].str.replace(
            r"http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+",
            "",
            regex=True,
        )

        #    punctuation
        # TODO: update so smilies are not affected.
        # re.escape keeps ']', '\\', '^' and '-' from being read as
        # character-class syntax (unescaped, the backslash was never removed).
        df[X_column] = df[X_column].str.replace(
            '[{}]'.format(re.escape(string.punctuation)), '', regex=True
        )

        # Remove stop words
        # TODO: support a custom stop-word bank and the nltk stop words
        stop = ENGLISH_STOP_WORDS

        df[X_column] = df[X_column].apply(
            lambda x: ' '.join([word for word in x.split() if word not in stop])
        )

        return df

Stemming wrapper class

In [0]:
from sklearn.base import BaseEstimator, TransformerMixin

class LangStemmer(BaseEstimator, TransformerMixin):
    """Reduce each word of a text column to its stem.

    Every value of ``X_column`` is split on whitespace, each token is run
    through nltk's ``PorterStemmer`` and the stems are re-joined with single
    spaces.
    """

    def __init__(self, X_column):
        super().__init__()
        self.X_column = X_column

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose ``X_column`` holds the stemmed text."""
        column = self.X_column
        stemmed = X.copy()

        from nltk.stem import PorterStemmer

        stemmer = PorterStemmer()

        def _stem_text(text):
            # One stem per whitespace-separated token, re-joined by a space.
            return ' '.join(map(stemmer.stem, text.split()))

        stemmed[column] = stemmed[column].apply(_stem_text)
        return stemmed

Lemmatization wrapper class

In [0]:
from sklearn.base import BaseEstimator, TransformerMixin

class LangLemmer(BaseEstimator, TransformerMixin):
    """Reduce each word of a text column to its lemma.

    Every value of ``X_column`` is split on whitespace, each token is passed
    to nltk's ``WordNetLemmatizer.lemmatize`` (called with its default
    arguments) and the results are re-joined with single spaces.
    """

    def __init__(self, X_column):
        super().__init__()
        self.X_column = X_column

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose ``X_column`` holds the lemmatized text."""
        target = self.X_column
        lemmatized = X.copy()

        from nltk.stem import WordNetLemmatizer

        lemmatizer = WordNetLemmatizer()

        def _lemmatize_text(text):
            # Lemmatize token by token, then rebuild the sentence.
            lemmas = []
            for token in text.split():
                lemmas.append(lemmatizer.lemmatize(token))
            return ' '.join(lemmas)

        lemmatized[target] = lemmatized[target].apply(_lemmatize_text)
        return lemmatized

Count Vectorizer wrapper class

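Notes: Sometimes the transform method returns a dataframe where the target column is itself a dataframe (instead of a series). When this occurs, processing halts since we no longer have access to series methods such as value_counts(). (A likely cause, to be confirmed, is a duplicated column label: if a vocabulary term is also named `target`, selecting that label returns every matching column as a dataframe.)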

This can be checked by inspecting the type of the dataframe's target column.

Example of a usable dataframe being returned:
type(lemm_v_tweets.target)

Out[13]: pandas.core.series.Series

Example of an unusable dataframe being returned:
type(lemm_v_tweets.target)

Out[19]: pandas.core.frame.DataFrame

Future updates - use fit_transform method in this wrapper