# Testing the Pipeline from Class

In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import PorterStemmer

from pipeline import NLPPipe, tweet_clean1

from helper_functions import txt_to_df

import pickle
%load_ext autoreload
%autoreload 2

In [2]:
tweets = txt_to_df('twitter_data.txt')
# Let's take all of our tweets and turn it into a DataFrame

In [6]:
corpus_list = tweets['long_text'].tolist()
# Let's turn all of the tweets into a list, so our Pipeline
# can work with our data better

In [316]:
nlp = NLPPipe(vectorizer=CountVectorizer(), 
              tokenizer=TreebankWordTokenizer().tokenize, 
              stemmer=PorterStemmer())
# Assign our Pipeline to a variable

In [318]:
nlp.fit(corpus_list)
nlp.transform(corpus_list);
# Fit the corpus and transform the corpus

In [320]:
pd.DataFrame(nlp.transform(corpus_list).toarray(), columns=nlp.vectorizer.get_feature_names()).head()
# It looks like we have a basic Document Term matrix, but all of the terms shown seem pretty wrong.
# A good first step would be to take out strings with number, but let's see if there is anything that seems off.

Unnamed: 0,00,000,00pm,01,0102,03,0j7uuyruyq,10,100,1000,...,zoodles,zor18iugb2,zuelnb5v4a,zuhxsuhozo,zvynuecunq,zwdug9pbsp,zyjxeglhci,شرم_الشيخ,فخر_العرب,فلاح
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [333]:
nlp.vectorizer.vocabulary_['the']
# I didn't pass english stop words into the CountVectorizer, so that could be a good step to help out as well.

2827

In [7]:
nlp = NLPPipe(vectorizer=CountVectorizer(stop_words='english'), 
              tokenizer=TreebankWordTokenizer().tokenize, 
              stemmer=PorterStemmer(),
              cleaning_function=tweet_clean1)
# Now let's add those stopwords in there, and change the cleaning function.

In [8]:
nlp.fit(corpus_list)
nlp.transform(corpus_list);
# Fit the corpus and transform the corpus

In [9]:
pd.DataFrame(nlp.transform(corpus_list).toarray(), columns=nlp.vectorizer.get_feature_names()).head()

Unnamed: 0,aaliyahchanel,ab,aberdeen,abhor,abl,absinth,absolut,abstract,accent,accept,...,zero,zerowast,zerowastecak,zipthevegan,zipup,zodiac,zoodl,شرمالشيخ,فخرالعرب,فلاح
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
nlp.vectorizer.vocabulary_

{'possibl': 1799,
 'organ': 1659,
 'polit': 1786,
 'revolut': 1949,
 'chang': 393,
 'dietari': 641,
 'habit': 1027,
 'sacrific': 1977,
 'social': 2127,
 'tast': 2266,
 'bud': 315,
 'vegan': 2487,
 'step': 2185,
 'mani': 1427,
 'peopl': 1719,
 'realiz': 1889,
 'power': 1806,
 'individu': 1180,
 'action': 19,
 'hold': 1109,
 'agre': 45,
 'thi': 2370,
 'confus': 490,
 'wit': 2610,
 'stand': 2172,
 'wrt': 2637,
 'question': 1861,
 'animalactivist': 85,
 'amp': 76,
 'say': 2002,
 'ok': 1635,
 'cut': 574,
 'endangeredspeciesact': 745,
 'drill': 695,
 'alaska': 58,
 'wildlif': 2603,
 'refug': 1911,
 'vote': 2552,
 'trump': 2435,
 'anim': 84,
 'notmypresid': 1606,
 'votehimout': 2553,
 'http': 1136,
 'tcoyzjeuorq': 2333,
 'mainstream': 1416,
 'veri': 2535,
 'misconstru': 1512,
 'fact': 803,
 'matter': 1450,
 'futur': 936,
 'veganismlif': 2508,
 'planet': 1762,
 'sustain': 2242,
 'human': 1139,
 'switch': 2249,
 'mostli': 1539,
 'plantbas': 1767,
 'eat': 713,
 'improv': 1169,
 'societi': 2129,
