# Testing the Pipeline from Class

In [348]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import PorterStemmer

from pipeline import NLPPipe, tweet_clean1

from helper_functions import txt_to_df

import pickle
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [314]:
# Load the raw tweets from the text file into a DataFrame via the project
# helper `txt_to_df` (presumably one row per tweet -- confirm in helper_functions).
tweets = txt_to_df('twitter_data.txt')

In [315]:
# Pull the `long_text` column out as a plain Python list so the
# pipeline receives a simple sequence of documents to work with.
corpus_list = tweets['long_text'].tolist()

In [316]:
# Baseline pipeline: default bag-of-words vectorizer, Treebank tokenizer and
# Porter stemmer. No cleaning function is passed, so NLPPipe's default applies.
nlp = NLPPipe(
    vectorizer=CountVectorizer(),
    tokenizer=TreebankWordTokenizer().tokenize,
    stemmer=PorterStemmer(),
)

In [318]:
# Fit the pipeline on the corpus, then transform the same corpus.
# The transform result is not stored here; the trailing `;` only
# suppresses the cell's output.
nlp.fit(corpus_list)
nlp.transform(corpus_list);

In [320]:
# Preview the document-term matrix: one row per tweet, one column per vocabulary term.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2, so use
# `get_feature_names_out()` when available and fall back only on older installs.
vectorizer = nlp.vectorizer
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
pd.DataFrame(nlp.transform(corpus_list).toarray(), columns=feature_names).head()
# We have a basic document-term matrix, but most of the terms shown look wrong
# (numbers, URL fragments, random-looking IDs).
# A good first step would be to remove strings containing numbers; let's also
# check whether anything else seems off.

Unnamed: 0,00,000,00pm,01,0102,03,0j7uuyruyq,10,100,1000,...,zoodles,zor18iugb2,zuelnb5v4a,zuhxsuhozo,zvynuecunq,zwdug9pbsp,zyjxeglhci,شرم_الشيخ,فخر_العرب,فلاح
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [333]:
# Look up 'the' in the fitted vocabulary. The lookup succeeding (instead of
# raising KeyError) shows that stop words are still present: English stop words
# were not passed to the CountVectorizer, so adding them is a good next step.
nlp.vectorizer.vocabulary_['the']

2827

In [373]:
# Rebuild the pipeline (rebinding `nlp`): the vectorizer now drops English
# stop words, and `tweet_clean1` is used as the cleaning function.
nlp = NLPPipe(
    vectorizer=CountVectorizer(stop_words='english'),
    tokenizer=TreebankWordTokenizer().tokenize,
    stemmer=PorterStemmer(),
    cleaning_function=tweet_clean1,
)

In [374]:
# Fit the pipeline currently bound to `nlp` on the corpus, then transform it.
# The transform result is not stored here; the trailing `;` only
# suppresses the cell's output.
nlp.fit(corpus_list)
nlp.transform(corpus_list);

In [375]:
# Preview the document-term matrix produced by the pipeline currently bound to `nlp`.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2, so use
# `get_feature_names_out()` when available and fall back only on older installs.
vectorizer = nlp.vectorizer
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
pd.DataFrame(nlp.transform(corpus_list).toarray(), columns=feature_names).head()

Unnamed: 0,aaliyahchanel,ab,aberdeen,abhor,abl,absinth,absolut,abstract,accent,accept,...,zero,zerowast,zerowastecak,zipthevegan,zipup,zodiac,zoodl,شرمالشيخ,فخرالعرب,فلاح
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
