In [3]:
import pandas as pd
import numpy as np
import tweepy
import pickle
import json
import string
import re
from pymongo import MongoClient
# from nlp_pipeline import nlp_preprocessor
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize.casual import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
# -*- coding: utf-8 -*-

from sklearn.feature_extraction.text import CountVectorizer
import pickle


class nlp_preprocessor:
    """
    Pipeline that cleans a collection of documents and vectorizes the result.

    The user supplies the individual tools (vectorizer, tokenizer, cleaning
    function, stemmer); this class wires them together for fitting,
    transforming and saving/loading the fitted state.
    """

    # Extension used for pipelines written by save_pipe / read by load_pipe.
    _EXTENSION = '.mdl'

    def __init__(self, vectorizer=None, tokenizer=None,
                 cleaning_function=None, stemmer=None):
        """
        A class for pipelining our data in NLP problems. The user provides a series of
        tools, and this class manages all of the training, transforming, and modification
        of the text data.
        ---
        Inputs:
        vectorizer: the model to use for vectorization of text data. If None, a
            fresh CountVectorizer() is created for this instance.
        tokenizer: the tokenizer to use; if None, defaults to splitting on spaces
        cleaning_function: callable(text, tokenizer, stemmer) returning a list of
            cleaned strings; if None, defaults to the built-in clean_text method
        stemmer: optional object exposing .stem(word); passed to the cleaning function
        """
        # Build the default vectorizer per instance. A default written in the
        # signature is evaluated once and would be shared (and re-fit) by every
        # pipeline created without an explicit vectorizer.
        if vectorizer is None:
            vectorizer = CountVectorizer()
        self.stemmer = stemmer
        self.tokenizer = tokenizer or self.splitter
        self.cleaning_function = cleaning_function or self.clean_text
        self.vectorizer = vectorizer
        self._is_fit = False
        self.words = None

    def splitter(self, text):
        """
        Default tokenizer that splits on spaces naively
        """
        return text.split(' ')

    def clean_text(self, text, tokenizer, stemmer):
        """
        Default cleaning function: lowercases every token and optionally stems it.

        text: iterable of posts; each post is indexed with the key "tweet" to
            obtain the raw string
        tokenizer: callable turning one raw string into an iterable of tokens
        stemmer: optional object exposing .stem(word); skipped when falsy

        Returns a list with one space-joined cleaned string per post.
        """
        cleaned_text = []
        for post in text:
            cleaned_words = []
            for word in tokenizer(post["tweet"]):
                low_word = word.lower()
                if stemmer:
                    low_word = stemmer.stem(low_word)
                cleaned_words.append(low_word)
            cleaned_text.append(' '.join(cleaned_words))
        return cleaned_text

    def _feature_names(self):
        """
        Return the fitted vocabulary as a plain list of str.
        """
        # scikit-learn renamed get_feature_names -> get_feature_names_out and
        # later removed the old name; prefer the new one, fall back to the old.
        getter = getattr(self.vectorizer, "get_feature_names_out", None)
        if getter is None:
            getter = self.vectorizer.get_feature_names
        return [str(name) for name in getter()]

    def fit(self, text):
        """
        Cleans the data and then fits the vectorizer with
        the user provided text
        """
        clean_text = self.cleaning_function(text, self.tokenizer, self.stemmer)
        self.vectorizer.fit(clean_text)
        # Read the vocabulary before flipping the flag so a failure here does
        # not leave the pipeline marked as fit with no words.
        self.words = self._feature_names()
        self._is_fit = True

    def get_words(self):
        """
        Returns the fitted vocabulary, or the string "Not yet fit" before fit().
        """
        if not self._is_fit:
            return "Not yet fit"
        return self.words

    def transform(self, text):
        """
        Cleans any provided data and then transforms the data into
        a vectorized format based on the fit function. Returns the
        vectorized form of the data.

        Raises ValueError if fit() has not been called yet.
        """
        if not self._is_fit:
            raise ValueError("Must fit the models before transforming!")
        clean_text = self.cleaning_function(text, self.tokenizer, self.stemmer)
        return self.vectorizer.transform(clean_text)

    @classmethod
    def _with_extension(cls, filename):
        """
        Validate filename and append '.mdl' unless it already ends with it.
        """
        if not isinstance(filename, str):
            raise TypeError("filename must be a string")
        if filename.endswith(cls._EXTENSION):
            return filename
        return filename + cls._EXTENSION

    def save_pipe(self, filename):
        """
        Writes the attributes of the pipeline to a file
        allowing a pipeline to be loaded later with the
        pre-trained pieces in place.

        '.mdl' is appended only when filename does not already end with it, so
        the same name can be handed to load_pipe. Raises TypeError if filename
        is not a string.
        """
        with open(self._with_extension(filename), 'wb') as handle:
            pickle.dump(self.__dict__, handle)

    def load_pipe(self, filename):
        """
        Reads the attributes of a pipeline previously written by save_pipe
        and replaces this instance's attributes with them.

        '.mdl' is appended when filename does not already end with it.
        Raises TypeError if filename is not a string.
        """
        # NOTE(security): unpickling can execute arbitrary code; only load
        # pipeline files that come from a trusted source.
        with open(self._with_extension(filename), 'rb') as handle:
            self.__dict__ = pickle.load(handle)

In [3]:
#Connect to Mongo DB
# NOTE(review): the user name and password are embedded in the URI below;
# keep real credentials out of the notebook before sharing or committing it.
IP = "" #Put this in quotes as a string

client = MongoClient("mongodb://user_name:pass_word@%s/project4" % IP) # defaults to port 27017

db = client.project4

# Fetch at most 1000 documents with a single query. Indexing the cursor
# (cursor[i]) inside a loop runs a separate skip/limit query per index and
# raises IndexError when the collection holds fewer documents than requested.
cursor = db.project_collection.find().limit(1000)

data = list(cursor)

In [None]:
# Display the first 100 loaded documents.
data[:100]

In [None]:
# Pipeline using the tweet-aware tokenizer together with the class's built-in
# cleaning function (cleaning_function=None) and no stemming.
test = nlp_preprocessor(
    tokenizer=TweetTokenizer().tokenize,
    cleaning_function=None,
    stemmer=None,
)

In [None]:
# Fit on the first 100 documents, then vectorize those same documents into a
# dense array.
first_hundred = data[:100]
test.fit(first_hundred)
vectorized_docs = test.transform(first_hundred).toarray()

In [None]:
# Column sums of the vectorized matrix; show the first five columns.
pd.DataFrame(vectorized_docs).sum(axis=0).head(5)

In [None]:
# Show the vocabulary stored by fit() (the string "Not yet fit" if fit() has not run).
test.get_words()

In [None]:
# Display the first 10 loaded documents.
data[:10]

In [6]:
# Load the tweets from the local pickle file. The context manager closes the
# file handle once loading finishes.
# NOTE(security): unpickling can execute arbitrary code; only load files that
# come from a trusted source.
with open("journalism_tweets.pkl", "rb") as pkl_file:
    data = pickle.load(pkl_file)

In [5]:
def clean_text(text, tokenizer, stemmer, stopwords = set(stopwords.words('english'))):
    """
    Clean a collection of posts: drop URLs, one-character tokens, tokens that
    do not start with an alphabetic piece, and stopwords; lowercase the rest.

    text: iterable of posts; each post is indexed with the key "tweet"
    tokenizer: callable turning one raw string into an iterable of tokens
    stemmer: optional object exposing .stem(word); when provided it is applied
        to every kept word (after stopword filtering). Pass None to skip.
    stopwords: collection of lowercase words to drop; defaults to NLTK's
        English stopword list, built once when the function is defined

    Returns a list with one space-joined cleaned string per post.
    """
    # Compile once instead of re-parsing the pattern for every token.
    url_pattern = re.compile(r'^https?:\/\/.*[\r\n]*', flags=re.MULTILINE)

    cleaned_text = []
    for post in text:
        cleaned_words = []
        # The first two characters of each tweet are skipped.
        # NOTE(review): presumably this removes a leading "b'" left over from
        # a bytes repr - confirm against how the tweets were stored.
        for word in tokenizer(post["tweet"][2:]):
            word = url_pattern.sub('', word)
            if len(word) <= 1:
                continue
            low_word = word.lower()
            # word_tokenize returns an empty list for whitespace-only input,
            # which a naive tokenizer can produce; skip instead of indexing.
            pieces = word_tokenize(low_word)
            if not pieces or not pieces[0].isalpha():
                continue
            if low_word in stopwords:
                continue
            # Honor the stemmer argument instead of silently ignoring it.
            if stemmer:
                low_word = stemmer.stem(low_word)
            cleaned_words.append(low_word)
        cleaned_text.append(' '.join(cleaned_words))
    return cleaned_text

In [None]:
# Sanity-check the cleaning step on the first 100 documents. This calls the
# notebook's clean_text helper rather than repeating its loop inline, and the
# per-word debug printing has been dropped; only the final result is shown.
tokenizer = TweetTokenizer().tokenize
stop_words = set(stopwords.words('english'))

cleaned_text = clean_text(data[0:100], tokenizer, None, stop_words)
print(cleaned_text)

In [7]:
# Rebuild the pipeline, this time cleaning with the notebook-level clean_text
# function instead of the class's built-in cleaner.
test = nlp_preprocessor(
    tokenizer=TweetTokenizer().tokenize,
    cleaning_function=clean_text,
    stemmer=None,
)

In [8]:
# Fit on the first 100 documents, then vectorize those same documents into a
# dense array.
first_hundred = data[:100]
test.fit(first_hundred)
vectorized_docs2 = test.transform(first_hundred).toarray()

In [10]:
# Column sums of the vectorized matrix; show the first five columns.
pd.DataFrame(vectorized_docs2).sum(axis=0).head(5)

0    2
1    1
2    1
3    2
4    1
dtype: int64

In [26]:
# Show the vocabulary stored by fit() (the string "Not yet fit" if fit() has not run).
test.get_words()

['able',
 'abortionist',
 'abortions',
 'abuse',
 'abused',
 'acceptance',
 'accuser',
 'acid',
 'actually',
 'administrator',
 'admits',
 'advice',
 'alex',
 'allegations',
 'allegiance',
 'alone',
 'already',
 'also',
 'amendment',
 'america',
 'americans',
 'andrew',
 'angeles',
 'angry',
 'annog',
 'anthem',
 'anyone',
 'areas',
 'aretha',
 'article',
 'asked',
 'asks',
 'assault',
 'assisted',
 'attack',
 'attacks',
 'audience',
 'australia',
 'back',
 'backward',
 'bad',
 'ban',
 'banned',
 'barbara',
 'barring',
 'ben',
 'big',
 'bill',
 'bin',
 'biologically',
 'birth',
 'birthday',
 'bishops',
 'blasting',
 'body',
 'boycott',
 'breaking',
 'breaks',
 'brennan',
 'cakeshop',
 'calls',
 'campus',
 'candidate',
 'captor',
 'cars',
 'cash',
 'catholic',
 'causing',
 'celebrates',
 'chambers',
 'chicago',
 'child',
 'chose',
 'church',
 'cia',
 'classified',
 'clearance',
 'clinton',
 'close',
 'closest',
 'club',
 'collusion',
 'comes',
 'comey',
 'coming',
 'commander',
 'commen