In [22]:
import pandas as pd
import numpy as np
import pickle 
import re 
from datetime import datetime
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from nltk.stem import PorterStemmer 
from collections import Counter
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer

For our web application, the model can only understand the text if we first clean, preprocess, and transform the user input. The most efficient way to do this is to build a preprocessing pipeline. The steps are:
* Regex Preprocessing
* Tokenization 
* Vectorization

In [64]:
# Shared NLP helpers for the text-normalization step.
porter = PorterStemmer()  # NOTE(review): not referenced by any cell visible here — confirm it is still needed
stop_words = stopwords.words('english')  # NLTK's English stop-word list
wpt = nltk.WordPunctTokenizer()  # tokenizer used by normalize_doc

In [86]:
def normalize_doc(doc):
    """Clean a single document and return it as a space-joined string.

    Steps, in order:
      1. Delete every character that is not an ASCII letter or whitespace.
      2. Lowercase the text and strip leading/trailing whitespace.
      3. Tokenize with the module-level ``wpt`` tokenizer.
      4. Drop tokens that appear in the module-level ``stop_words`` list.

    Parameters
    ----------
    doc : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', doc)
    cleaned = letters_only.lower().strip()

    kept_tokens = []
    for word in wpt.tokenize(cleaned):
        if word not in stop_words:
            kept_tokens.append(word)

    return ' '.join(kept_tokens)


# Element-wise wrapper so the cleaner can be called on a list/array of documents.
normalize_corpus = np.vectorize(normalize_doc)

In [93]:
# One-element batch holding the example input text to classify.
sample_text = [
    'ginger escobedo needs access legal splash page',
]

In [94]:
# Clean the sample input. `normalize_corpus` is the np.vectorize wrapper already
# created right after `normalize_doc`, so it is not redefined here.
norm_text = normalize_corpus(sample_text)

In [95]:
# Show the normalized text that will be fed to the vectorizer.
norm_text

array(['ginger escobedo needs access legal splash page'], dtype='<U46')

## Final Step - Text Transformation

In [96]:
# Restore the saved pipeline from disk ('rb' = read in binary mode).
# NOTE(review): pickle.load can execute arbitrary code — only load model files from a trusted source.
with open('final_model.pickle', 'rb') as model_file:
    final_model = pickle.load(model_file)

In [97]:
# Pull the individual stages out of the loaded pipeline by their step names.
pipeline_steps = final_model.named_steps
vect = pipeline_steps['tfidf']  # text -> feature matrix
clf = pipeline_steps['clf']     # classifier applied to those features

In [98]:
# Display both pipeline stages to check their configuration.
vect, clf

(TfidfVectorizer(max_df=0.75, max_features=10000, min_df=2, ngram_range=(1, 2),
                 stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                             'ourselves', 'you', "you're", "you've", "you'll",
                             "you'd", 'your', 'yours', 'yourself', 'yourselves',
                             'he', 'him', 'his', 'himself', 'she', "she's",
                             'her', 'hers', 'herself', 'it', "it's", 'its',
                             'itself', ...]),
 LogisticRegression(C=10, class_weight='balanced', random_state=42,
                    solver='liblinear'))

In [100]:
# Vectorize the cleaned text with the loaded vectorizer, then classify it.
tfidf_matrix = vect.transform(norm_text)
trans_text = tfidf_matrix.toarray()  # dense copy of the transformed features
predicted_label = clf.predict(trans_text)
print(predicted_label)

[2]
