In [89]:
import pandas as pd
import numpy as np
import nltk


In [90]:
# We will be looking at newsgroups text data from sklearn
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train', categories=['rec.sport.baseball', 'rec.sport.hockey'])
newsgroups_test = fetch_20newsgroups(subset='test')


In [91]:
# sklearn sends us the info in a data frame. The text is in data, the labels in target
newsgroups_train.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [92]:
newsgroups_train['target_names']

['rec.sport.baseball', 'rec.sport.hockey']

In [93]:
documents = newsgroups_train['data']

In [94]:
# That is some messy stuff that will need a lot of cleaning
documents[0]

"From: dougb@comm.mot.com (Doug Bank)\nSubject: Re: Info needed for Cleveland tickets\nReply-To: dougb@ecs.comm.mot.com\nOrganization: Motorola Land Mobile Products Sector\nDistribution: usa\nNntp-Posting-Host: 145.1.146.35\nLines: 17\n\nIn article <1993Apr1.234031.4950@leland.Stanford.EDU>, bohnert@leland.Stanford.EDU (matthew bohnert) writes:\n\n|> I'm going to be in Cleveland Thursday, April 15 to Sunday, April 18.\n|> Does anybody know if the Tribe will be in town on those dates, and\n|> if so, who're they playing and if tickets are available?\n\nThe tribe will be in town from April 16 to the 19th.\nThere are ALWAYS tickets available! (Though they are playing Toronto,\nand many Toronto fans make the trip to Cleveland as it is easier to\nget tickets in Cleveland than in Toronto.  Either way, I seriously\ndoubt they will sell out until the end of the season.)\n\n-- \nDoug Bank                       Private Systems Division\ndougb@ecs.comm.mot.com          Motorola Communications Sect

# Preprocessing

The goal is to tokenize semantically valuable words. So, if we were going through this step by step, 
we would:
- remove Punctuation, Numbers, and stopwords
- make everything lowercase
- lem or stem
<br>

Luckily, there are some tricks to take car of a lot of that for us


In [95]:
# A regular expression removes punctuation and numbers for us
from nltk.tokenize import regexp_tokenize
regexp = "[a-zA-Z]+(?:'[a-z]+)?" 

In [96]:
# Apply the tokenizer to each doc individually to return a new set of words
token_docs = [regexp_tokenize(doc, regexp) for doc in documents]
token_docs[1]

['From',
 'gld',
 'cunixb',
 'cc',
 'columbia',
 'edu',
 'Gary',
 'L',
 'Dare',
 'Subject',
 'Re',
 'Flames',
 'Truly',
 'Brutal',
 'in',
 'Loss',
 'Nntp',
 'Posting',
 'Host',
 'cunixb',
 'cc',
 'columbia',
 'edu',
 'Reply',
 'To',
 'gld',
 'cunixb',
 'cc',
 'columbia',
 'edu',
 'Gary',
 'L',
 'Dare',
 'Organization',
 'PhDs',
 'In',
 'The',
 'Hall',
 'Distribution',
 'na',
 'Lines',
 'This',
 'game',
 'would',
 'have',
 'been',
 'great',
 'as',
 'part',
 'of',
 'a',
 'double',
 'header',
 'on',
 'ABC',
 'or',
 'ESPN',
 'the',
 'league',
 'would',
 'have',
 'been',
 'able',
 'to',
 'push',
 'back',
 'to',
 'back',
 'wins',
 'by',
 'Le',
 'Magnifique',
 'and',
 'The',
 'Great',
 'One',
 'Unfortunately',
 'the',
 'only',
 'network',
 'that',
 'would',
 'have',
 'done',
 'that',
 'was',
 'SCA',
 'seen',
 'in',
 'few',
 'areas',
 'and',
 'hard',
 'to',
 'justify',
 'as',
 'a',
 'pay',
 'channel',
 'gld',
 'Je',
 'me',
 'souviens',
 'Gary',
 'L',
 'Dare',
 'gld',
 'columbia',
 'EDU',
 'GO'

In [97]:
# There are a lot of stopwords in there. Stopwords are semantically insignificant words that appear very often in a text
# NLTK has a list of English stopwords

from nltk.corpus import stopwords
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [88]:
# Remove the built in stop words from the text
no_stop_tokens = []
for doc in token_docs:
    remove_stop = [word.lower()  for word in doc if word.lower() not in stopwords.words('english')]
    no_stop_tokens.append(remove_stop)


KeyboardInterrupt: 

In [61]:
no_stop_tokens

[['dougb',
  'comm',
  'mot',
  'com',
  'doug',
  'bank',
  'subject',
  'info',
  'needed',
  'cleveland',
  'tickets',
  'reply',
  'dougb',
  'ecs',
  'comm',
  'mot',
  'com',
  'organization',
  'motorola',
  'land',
  'mobile',
  'products',
  'sector',
  'distribution',
  'usa',
  'nntp',
  'posting',
  'host',
  'lines',
  'article',
  'apr',
  'leland',
  'stanford',
  'edu',
  'bohnert',
  'leland',
  'stanford',
  'edu',
  'matthew',
  'bohnert',
  'writes',
  "i'm",
  'going',
  'cleveland',
  'thursday',
  'april',
  'sunday',
  'april',
  'anybody',
  'know',
  'tribe',
  'town',
  'dates',
  "who're",
  'playing',
  'tickets',
  'available',
  'tribe',
  'town',
  'april',
  'th',
  'always',
  'tickets',
  'available',
  'though',
  'playing',
  'toronto',
  'many',
  'toronto',
  'fans',
  'make',
  'trip',
  'cleveland',
  'easier',
  'get',
  'tickets',
  'cleveland',
  'toronto',
  'either',
  'way',
  'seriously',
  'doubt',
  'sell',
  'end',
  'season',
  'doug'

In [65]:
# Let's check out the Frequency distribution now to see if there are any more stopwords to remove
from nltk.probability import FreqDist
token_list_for_dist = [item for sublist in no_stop_tokens for item in sublist]
token_list_for_dist_join = ' '.join(token_list_for_dist)

FreqDist(token_list_for_dist).most_common(15)

[('edu', 2633),
 ('subject', 1253),
 ('lines', 1232),
 ('organization', 1213),
 ('team', 989),
 ('game', 918),
 ('writes', 883),
 ('article', 805),
 ('ca', 801),
 ('year', 780),
 ('would', 747),
 ('university', 724),
 ('com', 711),
 ('one', 679),
 ('hockey', 654)]

In [98]:
# let's add a few of those to our stopwords list
stop_words = stopwords.words('english') + ['edu', 'subject', 'lines', 'article', 'ca','com' ]


In [110]:
# Remove the built in stop words from the text
no_stop_tokens = []
for doc in token_docs:
    remove_stop = [word.lower()  for word in doc if word.lower() not in stop_words]
    no_stop_tokens.append(remove_stop)
no_stop_tokens[0][:10]

['dougb',
 'comm',
 'mot',
 'doug',
 'bank',
 'info',
 'needed',
 'cleveland',
 'tickets',
 'reply']

In [117]:
# now that we have removed stopwords, made lowercase, and removed punctuation/numbers, 
# we can do stuff like stem or lem the words
# Let's lem. 
# before we do that, we need to tag our words with POS
from nltk.corpus import wordnet

from nltk import pos_tag

pos_tag_tokens = []
for doc in no_stop_tokens:
    ' '.join(doc)
    pos_tokens = pos_tag(doc)
    pos_tag_tokens.append(pos_tokens)


In [118]:
pos_tag_tokens[0]

[('dougb', 'NN'),
 ('comm', 'NN'),
 ('mot', 'NN'),
 ('doug', 'VBD'),
 ('bank', 'NN'),
 ('info', 'NN'),
 ('needed', 'VBN'),
 ('cleveland', 'NN'),
 ('tickets', 'NNS'),
 ('reply', 'VBP'),
 ('dougb', 'JJ'),
 ('ecs', 'NN'),
 ('comm', 'NN'),
 ('mot', 'NN'),
 ('organization', 'NN'),
 ('motorola', 'NN'),
 ('land', 'VBP'),
 ('mobile', 'JJ'),
 ('products', 'NNS'),
 ('sector', 'NN'),
 ('distribution', 'NN'),
 ('usa', 'JJ'),
 ('nntp', 'NN'),
 ('posting', 'VBG'),
 ('host', 'NN'),
 ('apr', 'NN'),
 ('leland', 'NN'),
 ('stanford', 'NN'),
 ('bohnert', 'NN'),
 ('leland', 'NN'),
 ('stanford', 'NN'),
 ('matthew', 'NN'),
 ('bohnert', 'NN'),
 ('writes', 'VBZ'),
 ("i'm", 'IN'),
 ('going', 'VBG'),
 ('cleveland', 'NN'),
 ('thursday', 'NN'),
 ('april', 'IN'),
 ('sunday', 'JJ'),
 ('april', 'IN'),
 ('anybody', 'NN'),
 ('know', 'VBP'),
 ('tribe', 'JJ'),
 ('town', 'NN'),
 ('dates', 'NNS'),
 ("who're", 'VBP'),
 ('playing', 'VBG'),
 ('tickets', 'NNS'),
 ('available', 'JJ'),
 ('tribe', 'JJ'),
 ('town', 'NN'),
 ('april

In [122]:
# use the function below 
def get_wordnet_pos(treebank_tag):
    '''
    Translate nltk POS to wordnet tags
    '''
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

text_for_lem = []
for doc in pos_tag_tokens:
    text_for_lem.append([(word[0], get_wordnet_pos(word[1])) for word in doc])

In [125]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
lemmed_words =[]
for doc in text_for_lem:
    lemmed_words.append([wordnet_lemmatizer.lemmatize(word[0],word[1]) for word in doc])

In [128]:
lemmed_words[1]

['gld',
 'cunixb',
 'cc',
 'columbia',
 'gary',
 'l',
 'dare',
 'flame',
 'truly',
 'brutal',
 'loss',
 'nntp',
 'post',
 'host',
 'cunixb',
 'cc',
 'columbia',
 'reply',
 'gld',
 'cunixb',
 'cc',
 'columbia',
 'gary',
 'l',
 'dare',
 'organization',
 'phd',
 'hall',
 'distribution',
 'na',
 'game',
 'would',
 'great',
 'part',
 'double',
 'header',
 'abc',
 'espn',
 'league',
 'would',
 'able',
 'push',
 'back',
 'back',
 'win',
 'le',
 'magnifique',
 'great',
 'one',
 'unfortunately',
 'network',
 'would',
 'do',
 'sca',
 'see',
 'area',
 'hard',
 'justify',
 'pay',
 'channel',
 'gld',
 'je',
 'souviens',
 'gary',
 'l',
 'dare',
 'gld',
 'columbia',
 'go',
 'winnipeg',
 'jet',
 'go',
 'gld',
 'cunixc',
 'bitnet',
 'selanne',
 'domi',
 'stanley']

# CV and TFIDF

### Term Frequency (TF)


$\begin{align}
tf(w) = \dfrac{single\ word\ count}{total\ number\ of\ words\ in\ document}
\end{align} $

### Inverse Document Frequency (IDF)

$\begin{align}
idf(w) = \log \dfrac{Number\ of\ docs}{Number\ of\ docs\ word\ is\ found}
\end{align} $

$\begin{align}
tfidf = {tf}*{idf}
\end{align} $

In [132]:
# these vectorizers expect strings, so once again, let's create documents with strings in them
joined_lems = []
for doc in lemmed_words:
    joined_lems.append(' '.join(doc))
joined_lems[0]

"dougb comm mot doug bank info need cleveland ticket reply dougb ec comm mot organization motorola land mobile product sector distribution usa nntp post host apr leland stanford bohnert leland stanford matthew bohnert write i'm go cleveland thursday april sunday april anybody know tribe town date who're play ticket available tribe town april th always ticket available though play toronto many toronto fan make trip cleveland easy get ticket cleveland toronto either way seriously doubt sell end season doug bank private system division dougb ecs comm mot motorola communication sector dougb nwu schaumburg illinois dougb casbah acns nwu"

In [138]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# in CountVectorizer, we see that we have parameters for stopwords, lowercase, token_pattern, and ngram_range
cv = CountVectorizer(ngram_range = (1,2))

print('that is a huge sparse matrix')
cv.fit_transform(joined_lems)


that is a huge sparse matrix


<1197x110191 sparse matrix of type '<class 'numpy.int64'>'
	with 274477 stored elements in Compressed Sparse Row format>

In [140]:
# let's look at the vocabulary
cv.vocabulary_

{'dougb': 25234,
 'comm': 17917,
 'mot': 60650,
 'doug': 25207,
 'bank': 6879,
 'info': 44889,
 'need': 62300,
 'cleveland': 16639,
 'ticket': 97717,
 'reply': 79306,
 'ec': 26324,
 'organization': 66992,
 'motorola': 60680,
 'land': 50148,
 'mobile': 59904,
 'product': 75218,
 'sector': 84976,
 'distribution': 24469,
 'usa': 102085,
 'nntp': 64011,
 'post': 73760,
 'host': 42513,
 'apr': 4172,
 'leland': 51779,
 'stanford': 90708,
 'bohnert': 10132,
 'matthew': 56918,
 'write': 107898,
 'go': 36589,
 'thursday': 97664,
 'april': 4308,
 'sunday': 93239,
 'anybody': 3628,
 'know': 49107,
 'tribe': 99664,
 'town': 99273,
 'date': 21519,
 'who': 105841,
 're': 77585,
 'play': 71829,
 'available': 5665,
 'th': 96069,
 'always': 2667,
 'though': 97252,
 'toronto': 98917,
 'many': 56143,
 'fan': 30122,
 'make': 55547,
 'trip': 99716,
 'easy': 26273,
 'get': 35456,
 'either': 26794,
 'way': 104521,
 'seriously': 85975,
 'doubt': 25169,
 'sell': 85488,
 'end': 27295,
 'season': 84384,
 'privat

In [142]:
# let's try with tfidf
tfidf = TfidfVectorizer(ngram_range=(1,2))
tfidf.fit_transform(joined_lems)

<1197x110191 sparse matrix of type '<class 'numpy.float64'>'
	with 274477 stored elements in Compressed Sparse Row format>

In [143]:
# same vocab
tfidf.vocabulary_

{'dougb': 25234,
 'comm': 17917,
 'mot': 60650,
 'doug': 25207,
 'bank': 6879,
 'info': 44889,
 'need': 62300,
 'cleveland': 16639,
 'ticket': 97717,
 'reply': 79306,
 'ec': 26324,
 'organization': 66992,
 'motorola': 60680,
 'land': 50148,
 'mobile': 59904,
 'product': 75218,
 'sector': 84976,
 'distribution': 24469,
 'usa': 102085,
 'nntp': 64011,
 'post': 73760,
 'host': 42513,
 'apr': 4172,
 'leland': 51779,
 'stanford': 90708,
 'bohnert': 10132,
 'matthew': 56918,
 'write': 107898,
 'go': 36589,
 'thursday': 97664,
 'april': 4308,
 'sunday': 93239,
 'anybody': 3628,
 'know': 49107,
 'tribe': 99664,
 'town': 99273,
 'date': 21519,
 'who': 105841,
 're': 77585,
 'play': 71829,
 'available': 5665,
 'th': 96069,
 'always': 2667,
 'though': 97252,
 'toronto': 98917,
 'many': 56143,
 'fan': 30122,
 'make': 55547,
 'trip': 99716,
 'easy': 26273,
 'get': 35456,
 'either': 26794,
 'way': 104521,
 'seriously': 85975,
 'doubt': 25169,
 'sell': 85488,
 'end': 27295,
 'season': 84384,
 'privat

In [147]:
# but differnet values in the matrix
tf_df= pd.DataFrame(tfidf.fit_transform(joined_lems).toarray())
tf_df.columns = tfidf.vocabulary_

In [153]:
for item, index in zip(tf_df.iloc[1,:], tf_df.iloc[1,:].index):
    if item != 0:
        print(item, index)

0.05900808846009515 ticket cleveland
0.09746089129466987 illinois dougb
0.04969632029238826 would great
0.09746089129466987 justify pay
0.059740434050687935 improve
0.09746089129466987 fedorov
0.072254283202437 jokerit perhaps
0.08095094179922148 player play
0.0883260228225424 fact majority
0.05702754493765269 series anybody
0.0685040433399859 least game
0.08298246731732788 renberg rundqvist
0.0883260228225424 rundqvist play
0.13974297871147556 previously
0.1880436145648997 introduce
0.07181607332709401 injury player
0.09746089129466987 really recovered
0.2319225864075256 nd rd
0.12834948853946052 also finish
0.0685040433399859 second though
0.06779150065235873 play season
0.19252423280919076 handle backhander
0.19252423280919076 backhander worry
0.0685040433399859 worry rebound
0.0685040433399859 rebound barnaby
0.19097358354124372 yvon ramsey
0.09746089129466987 hot cold
0.0685040433399859 cold streak
0.0685040433399859 ok guy
0.03521789631504628 wkuvx
0.04551545691216771 sportchanne

In [154]:
tf_df.shape

(1197, 110191)

In [159]:
# these dataframes are now ready for our modeling techniques
from sklearn.model_selection import train_test_split

y = newsgroups_train['target']

X_train, X_test, y_train, y_test = train_test_split(tf_df,y, test_size=.3, random_state=42 )

from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
mnb.fit(X_train, y_train)
mnb.score(X_test, y_test)

0.9666666666666667