# Understanding How Machines Read

## Creating a corpus

In [1]:
# Toy corpus: three short sentences with overlapping vocabulary.
corpus = [
    'The quick brown fox jumps over the lazy dog.',
    'My dog is quick and can jump over fences.',
    'Your dog is so lazy that it sleeps all the day.',
]
# Keep the individual sentence names bound as well.
text_1, text_2, text_3 = corpus

## Performing feature extraction

In [2]:
from sklearn.feature_extraction import text
# Learn the vocabulary first, then encode every sentence as a
# presence/absence vector (binary=True is passed to the vectorizer).
vectorizer = text.CountVectorizer(binary=True)
vectorizer.fit(corpus)
vectorized_text = vectorizer.transform(corpus)
print(vectorized_text.todense())

[[0 0 1 0 0 1 0 1 0 0 0 1 1 0 1 1 0 0 0 1 0]
 [0 1 0 1 0 1 1 0 1 0 1 0 0 1 1 1 0 0 0 0 0]
 [1 0 0 0 1 1 0 0 1 1 0 0 1 0 0 0 1 1 1 1 1]]


## Understanding the BoW

In [3]:
# Show the fitted token -> column-index mapping. The dict is printed in its
# own order, which is not sorted by column index.
print(vectorizer.vocabulary_)

{'the': 19, 'quick': 15, 'brown': 2, 'fox': 7, 'jumps': 11, 'over': 14, 'lazy': 12, 'dog': 5, 'my': 13, 'is': 8, 'and': 1, 'can': 3, 'jump': 10, 'fences': 6, 'your': 20, 'so': 17, 'that': 18, 'it': 9, 'sleeps': 16, 'all': 0, 'day': 4}


In [4]:
from collections import OrderedDict
# Re-list the vocabulary sorted by column index (the second item of each
# pair) so it reads in the same order as the matrix columns.
ordered = OrderedDict(
    sorted(vectorizer.vocabulary_.items(), key=lambda pair: pair[1])
)
print(dict(ordered))

{'all': 0, 'and': 1, 'brown': 2, 'can': 3, 'day': 4, 'dog': 5, 'fences': 6, 'fox': 7, 'is': 8, 'it': 9, 'jump': 10, 'jumps': 11, 'lazy': 12, 'my': 13, 'over': 14, 'quick': 15, 'sleeps': 16, 'so': 17, 'that': 18, 'the': 19, 'your': 20}


## Processing and enhancing text

### Performing word counting

In [5]:
text_4 = 'A black dog just passed by but my dog is brown.'
# Guard the append so that re-running this cell does not add the sentence a
# second time (which would change the vocabulary counts and the last row).
if text_4 not in corpus:
    corpus.append(text_4)

# No binary=True here: repeated words are counted rather than flagged.
vectorizer = text.CountVectorizer().fit(corpus)
vectorized_text = vectorizer.transform(corpus)
# Print only the last row of the dense matrix.
print(vectorized_text.todense()[-1])

[[0 0 1 1 1 1 0 0 2 0 0 1 0 0 0 1 0 1 0 1 0 0 0 0 0 0]]


### Changing weights using TF-IDF

In [6]:
# Re-weight the raw counts with TF-IDF; norm='l1' is requested, and the loop
# below sums the non-zero weights of one phrase.
TfidF = text.TfidfTransformer(norm='l1')
tfidf = TfidF.fit_transform(vectorized_text)

phrase = 3 # choose a number from 0 to 3
# Densify the selected row once; the original rebuilt the whole dense matrix
# and copied the row into a list on every loop iteration.
phrase_weights = tfidf.toarray()[phrase]
total = 0
for word, pos in vectorizer.vocabulary_.items():
    value = phrase_weights[pos]
    # Only report words that actually occur in the chosen phrase
    if value != 0:
        print ("%10s: %0.3f" % (word, value))
        total += value
print ('\nSummed values of a phrase: %0.1f' % total)

     brown: 0.095
       dog: 0.126
        my: 0.095
        is: 0.077
     black: 0.121
      just: 0.121
    passed: 0.121
        by: 0.121
       but: 0.121

Summed values of a phrase: 1.0


### Maintaining order using n-grams

In [7]:
# ngram_range=(2,2) restricts the vocabulary to two-word sequences.
bigrams = text.CountVectorizer(ngram_range=(2,2))
bigrams.fit(corpus)
# List the bigram vocabulary sorted by column index.
ord_bigrams = OrderedDict(
    sorted(bigrams.vocabulary_.items(), key=lambda pair: pair[1])
)
print(dict(ord_bigrams))

{'all the': 0, 'and can': 1, 'black dog': 2, 'brown fox': 3, 'but my': 4, 'by but': 5, 'can jump': 6, 'dog is': 7, 'dog just': 8, 'fox jumps': 9, 'is brown': 10, 'is quick': 11, 'is so': 12, 'it sleeps': 13, 'jump over': 14, 'jumps over': 15, 'just passed': 16, 'lazy dog': 17, 'lazy that': 18, 'my dog': 19, 'over fences': 20, 'over the': 21, 'passed by': 22, 'quick and': 23, 'quick brown': 24, 'sleeps all': 25, 'so lazy': 26, 'that it': 27, 'the day': 28, 'the lazy': 29, 'the quick': 30, 'your dog': 31}


## Stemming and removing stop words

In [8]:
from sklearn.feature_extraction import text

import nltk
from nltk import word_tokenize          
from nltk.stem.porter import PorterStemmer
# Fetch the 'punkt' NLTK data package (presumably the model word_tokenize
# relies on -- confirm against the NLTK documentation).
nltk.download('punkt')

# Single Porter stemmer instance, read as a global by tokenize() below.
stemmer = PorterStemmer()

def stem_tokens(tokens, stemmer):
    """Return a new list holding ``stemmer.stem(token)`` for each token, in order."""
    return [stemmer.stem(token) for token in tokens]

def tokenize(text):
    """Split ``text`` with ``word_tokenize`` and return the stemmed tokens.

    Uses the module-level ``stemmer``. The parameter name shadows the
    imported sklearn ``text`` module, but only inside this function.
    """
    return stem_tokens(word_tokenize(text), stemmer)

# Fit a vectorizer that stems tokens (custom tokenizer) and drops English
# stop words, then encode a new sentence against that vocabulary.
vocab = ['Sam loves swimming so he swims all the time']
vect = text.CountVectorizer(tokenizer=tokenize, 
                           stop_words='english')
vec = vect.fit(vocab)

sentence1 = vec.transform(['George loves swimming too!'])

# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# when present, converting to a list of plain str so the printed form matches
# the list that the older method returned.
if hasattr(vec, 'get_feature_names_out'):
    stem_features = [str(name) for name in vec.get_feature_names_out()]
else:
    stem_features = vec.get_feature_names()
print (stem_features)
print (sentence1.toarray())

['love', 'sam', 'swim', 'time']
[[1 0 1 0]]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\John\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Scraping Textual Datasets from the Web

### Using Beautiful Soup in your code

In [9]:
from bs4 import BeautifulSoup
import pandas as pd
try:
    import urllib2 # Python 2.7.x
except ImportError:
    # Only a missing module should trigger the fallback; a bare ``except``
    # would also swallow unrelated errors such as KeyboardInterrupt.
    import urllib.request as urllib2 # Python 3.x

# Target page, written as two adjacent literals that concatenate to one URL.
wiki = ("https://en.wikipedia.org/wiki/"
        "List_of_United_States_cities_by_population")
# Send an explicit User-Agent header with the request.
header = {'User-Agent': 'Mozilla/5.0'}
query = urllib2.Request(wiki, headers=header)
page = urllib2.urlopen(query)
# Parse the response with the lxml parser.
soup = BeautifulSoup(page, "lxml")

In [10]:
# Locate the table and collect selected cells of every data row.
table = soup.find("table", 
    { "class" : "wikitable sortable" })
if table is None:
    # Fail with a clear message instead of an AttributeError on None below
    raise ValueError("No 'wikitable sortable' table found in the page")
final_table = list()
for row in table.findAll('tr'):
    cells = row.findAll("td")
    # cells[6] is read below, so a row needs at least 7 <td> cells; the
    # previous ">= 6" guard raised IndexError on a row with exactly six.
    if len(cells) >= 7:
        v1 = cells[1].find(text=True)
        v2 = cells[2].find(text=True)
        v3 = cells[3].find(text=True)
        v4 = cells[4].find(text=True)
        # findAll (not find): this column keeps every text node as a list
        v5 = cells[6].findAll(text=True)
        final_table.append([v1, v2, v3, v4, v5])
cols = ['City','State','Population_2017','Census_2010'
        ,'Land_Area_km2']
df = pd.DataFrame(final_table, columns=cols)

print(df[['City', 'Population_2017']])

                 City Population_2017
0       New York City      8,622,698

1         Los Angeles      3,999,759

2             Chicago      2,716,450

3             Houston      2,312,717

4             Phoenix      1,626,078

5        Philadelphia      1,580,863

6         San Antonio      1,511,946

7           San Diego      1,419,516

8              Dallas      1,341,075

9            San Jose      1,035,317

10             Austin        950,715

11       Jacksonville        892,062

12      San Francisco        884,363

13           Columbus        879,170

14         Fort Worth        874,168

15       Indianapolis        863,002

16          Charlotte        859,035

17            Seattle        724,745

18             Denver        704,621

19   Washington, D.C.        693,972

20             Boston        685,094

21            El Paso        683,577

22            Detroit        673,104

23          Nashville        667,560

24            Memphis        652,236

25          

## Handling problems with raw text

### Dealing with encoding

In [11]:
import sys
# Name of the default string encoding reported by the interpreter.
sys.getdefaultencoding()

'utf-8'

In [12]:
# Encode a str to bytes using the UTF-7 codec and show the resulting type.
utf8_string = "Hello there!"
utf7_string = utf8_string.encode('utf7')
print(utf7_string, type(utf7_string))

# Encode with UTF-7 but decode with UTF-8. NOTE(review): this mismatch only
# appears to round-trip because the text is plain ASCII; the winking-face
# cell further down shows it failing for a non-ASCII character.
utf7_string = "This is a new string!".encode("utf7")
utf8_string = utf7_string.decode('utf8')
print(utf8_string, type(utf8_string))

b'Hello there!' <class 'bytes'>
This is a new string! <class 'str'>


In [13]:
# Encode the same text with UTF-32 to compare the byte representation with
# the UTF-7 result above.
utf8_string = "Hello there!"
utf32_string = utf8_string.encode('utf32')
print(utf32_string, type(utf32_string))

b'\xff\xfe\x00\x00H\x00\x00\x00e\x00\x00\x00l\x00\x00\x00l\x00\x00\x00o\x00\x00\x00 \x00\x00\x00t\x00\x00\x00h\x00\x00\x00e\x00\x00\x00r\x00\x00\x00e\x00\x00\x00!\x00\x00\x00' <class 'bytes'>


In [14]:
import locale
# Encoding preferred by the current locale; the value depends on the machine
# the notebook runs on.
locale.getpreferredencoding()

'cp1252'

### Considering Unicode

In [15]:
# Three ways of writing an escape in a str literal.
# \N{...} selects the character by its Unicode name.
uString1 = "This is a winking face: \N{WINKING FACE}"
print(uString1)

# \U takes exactly eight hex digits, enough for code point U+1F609.
uString2 = "This is a winking face: \U0001F609"
print(uString2)

# \u takes exactly four hex digits, so this is U+1F60 followed by a literal
# '9' -- deliberately NOT the winking face.
uString3 = "This is not a winking face: \u1F609"
print(uString3)

This is a winking face: 😉
This is a winking face: 😉
This is not a winking face: ὠ9


In [16]:
# Encode the emoji string with UTF-7 ...
utf7_string = uString1.encode('utf7')
print(utf7_string)

# ... then decode those bytes with the wrong codec (UTF-8). No error is
# raised, but the emoji is not recovered: the UTF-7 escape text is kept as-is.
uString4 = utf7_string.decode('utf8')
print(uString4)

b'This is a winking face: +2D3eCQ-'
This is a winking face: +2D3eCQ-


In [17]:
# Encode to ASCII with the 'namereplace' error handler, which substitutes a
# \N{...} escape for characters ASCII cannot represent.
# NOTE(review): the variable is still called utf7_string although it now
# holds ASCII-encoded bytes.
utf7_string = uString1.encode('ascii', 'namereplace')
print(utf7_string)

# Decode with the 'replace' error handler.
uString4 = utf7_string.decode('utf8', 'replace')
print(uString4)

b'This is a winking face: \\N{WINKING FACE}'
This is a winking face: \N{WINKING FACE}


## Storing processed text data in sparse matrices

### Creating a sparse matrix

In [18]:
from scipy.sparse import csr_matrix

# Dense view of the count matrix produced by the last vectorizer.transform().
full_matrix = vectorized_text.todense()
print(full_matrix)

# Build a CSR sparse matrix from the dense one; printing it lists
# (row, column) coordinates with their values.
sparse_matrix = csr_matrix(full_matrix)
print(sparse_matrix)

[[0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 1 0 1 0 1 0 0 0 2 0]
 [0 1 0 0 0 0 1 0 1 1 0 1 0 1 0 0 0 1 1 0 1 0 0 0 0 0]
 [1 0 0 0 0 0 0 1 1 0 0 1 1 0 0 0 1 0 0 0 0 1 1 1 1 1]
 [0 0 1 1 1 1 0 0 2 0 0 1 0 0 0 1 0 1 0 1 0 0 0 0 0 0]]
  (0, 3)	1
  (0, 8)	1
  (0, 10)	1
  (0, 14)	1
  (0, 16)	1
  (0, 18)	1
  (0, 20)	1
  (0, 24)	2
  (1, 1)	1
  (1, 6)	1
  (1, 8)	1
  (1, 9)	1
  (1, 11)	1
  (1, 13)	1
  (1, 17)	1
  (1, 18)	1
  (1, 20)	1
  (2, 0)	1
  (2, 7)	1
  (2, 8)	1
  (2, 11)	1
  (2, 12)	1
  (2, 16)	1
  (2, 21)	1
  (2, 22)	1
  (2, 23)	1
  (2, 24)	1
  (2, 25)	1
  (3, 2)	1
  (3, 3)	1
  (3, 4)	1
  (3, 5)	1
  (3, 8)	2
  (3, 11)	1
  (3, 15)	1
  (3, 17)	1
  (3, 19)	1


### Using the MovieLens sparse matrix

In [19]:
import urllib.request
import os.path
from zipfile import ZipFile

filename = "ml-20m.zip"
# Download the archive only when no local copy exists (reuse the variable
# rather than repeating the literal, so the two cannot drift apart).
if not os.path.exists(filename):
    url = "http://files.grouplens.org/datasets/\
movielens/ml-20m.zip"
    urllib.request.urlretrieve(url, filename)

# The context manager closes the archive's file handle once extraction is
# done; the original left the ZipFile open.
with ZipFile(filename) as archive:
    archive.extractall()

In [20]:
# Load the ratings file from the extracted ml-20m folder, then show its
# dimensions and first rows.
ratings = pd.read_csv("ml-20m/ratings.csv")
print(ratings.shape)
print(ratings.head())

(20000263, 4)
   userId  movieId  rating   timestamp
0       1        2     3.5  1112486027
1       1       29     3.5  1112484676
2       1       32     3.5  1112484819
3       1       47     3.5  1112484727
4       1       50     3.5  1112484580


In [21]:
# Load the movies file from the extracted ml-20m folder, then show its
# dimensions and first rows.
names = pd.read_csv("ml-20m/movies.csv")
print(names.shape)
print(names.head())

(27278, 3)
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [22]:
# Join the movie rows to the rating rows on their shared movieId column.
movie_data = pd.merge(names, ratings, on="movieId")
print(movie_data.shape)
print(movie_data.head())

(20000263, 6)
   movieId             title                                       genres  \
0        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
1        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
2        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
3        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
4        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   

   userId  rating   timestamp  
0       3     4.0   944919407  
1       6     5.0   858275452  
2       8     4.0   833981871  
3      10     4.0   943497887  
4      11     4.5  1230858821  


In [23]:
# Mean rating per title, sorted ascending; show the five lowest.
print(
    movie_data
    .groupby('title')['rating']
    .mean()
    .sort_values()
    .head()
)

title
Magic Christmas Tree, The (1964)               0.5
Vampir (Cuadecuc, vampir) (1971)               0.5
Prisoner of Zenda, The (1979)                  0.5
Late Great Planet Earth, The (1979)            0.5
Last Warrior, The (Last Patrol, The) (2000)    0.5
Name: rating, dtype: float64


# Using Scoring and Classification

## Performing classification tasks

In [24]:
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings from the libraries used below.
warnings.filterwarnings("ignore")
from sklearn.datasets import fetch_20newsgroups
# Fetch only the misc.forsale posts, with headers, footers and quotes
# removed; random_state is fixed for the shuffle.
dataset = fetch_20newsgroups(shuffle=True, 
    categories = ['misc.forsale'],
     remove=('headers', 'footers', 'quotes'), random_state=101)
print ('Posts: %i' % len(dataset.data))

Posts: 585


In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features for the posts, with max_df/min_df bounds and English
# stop-word removal. This rebinds the `vectorizer` and `tfidf` names used by
# earlier cells.
vectorizer = TfidfVectorizer(max_df=0.95, 
            min_df=2, stop_words='english')
tfidf = vectorizer.fit_transform(dataset.data)
from sklearn.decomposition import NMF
# Factorize the TF-IDF matrix into n_topics components; random_state is fixed.
n_topics = 5
nmf = NMF(n_components=n_topics, random_state=101).fit(tfidf)

In [26]:
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# when available and fall back for older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
n_top_words = 15
for topic_idx, topic in enumerate(nmf.components_):
    print ("Topic #%d:" % (topic_idx+1))
    # argsort is ascending, so this reversed slice yields the indices of the
    # n_top_words largest weights, largest first.
    top_word_ids = topic.argsort()[:-n_top_words - 1:-1]
    print (" ".join([feature_names[i] for i in top_word_ids]))

Topic #1:
condition excellent asking offer best car old new sale 10 miles 000 tape cd power
Topic #2:
00 50 dos 20 10 15 cover 1st new 25 price man 40 shipping comics
Topic #3:
drive hard card floppy monitor meg ram disk motherboard vga modem brand scsi color internal
Topic #4:
email looking game games send interested mail thanks like edu good want package price list
Topic #5:
shipping vcr works stereo obo included amp plus great volume unc mathes gibbs radley remotes


In [27]:
# Vocabulary indices of the n_top_words largest weights in the first NMF
# component, largest first.
print (nmf.components_[0,:].argsort()[:-n_top_words-1:-1])

[1075 1459  632 2463  740  888 2476 2415 2987   10 2305    1 3349  923
 2680]


In [28]:
# Look up one vocabulary entry by index. Reuses the feature_names sequence
# built in the topic-printing cell instead of calling get_feature_names()
# again (that method was removed in scikit-learn 1.2).
print (feature_names[1337])

drive


## Analyzing reviews from e-commerce

In [29]:
import urllib.request as urllib2
import requests, io, os, zipfile

UCI_url = 'https://archive.ics.uci.edu/ml/\
machine-learning-databases/00331/sentiment%20\
labelled%20sentences.zip'

response = requests.get(UCI_url)
# Fail loudly on an HTTP error instead of passing an error page to ZipFile
response.raise_for_status()
compressed_file = io.BytesIO(response.content)
print ('Extracting in %s' %  os.getcwd())
# The context manager closes the archive once every member has been written
with zipfile.ZipFile(compressed_file) as z:
    for name in z.namelist():
        # Keep only the base name so files land flat in the working directory
        filename = name.split('/')[-1]
        # Skip macOS metadata entries
        nameOK = ('MACOSX' not in name and '.DS' not in name)
        # An empty filename means the entry is a directory
        if filename and nameOK:
            newfile = os.path.join(os.getcwd(), 
                               os.path.basename(filename))
            with open(newfile, 'wb') as f:
                f.write(z.read(name))
            print ('\tunzipping %s' % newfile)

Extracting in C:\Users\John\DSPD
	unzipping C:\Users\John\DSPD\amazon_cells_labelled.txt
	unzipping C:\Users\John\DSPD\imdb_labelled.txt
	unzipping C:\Users\John\DSPD\readme.txt
	unzipping C:\Users\John\DSPD\yelp_labelled.txt


In [30]:
import numpy as np
import pandas as pd
# `dataset` is rebound here from the newsgroups object to a file name.
dataset = 'imdb_labelled.txt'
# Tab-separated file without a header row, read with the Python parser engine
# and a raw-string separator.
# NOTE(review): this combination may be deliberate, to avoid quote handling
# in the review text -- confirm before simplifying to sep='\t'.
data = pd.read_csv(dataset, header=None, sep=r"\t",
                   engine='python')
data.columns = ['review','sentiment']

In [31]:
# train_test_split lives in sklearn.model_selection (available since 0.18);
# the sklearn.cross_validation module was removed in 0.20.
from sklearn.model_selection import train_test_split
# DataFrame.ix was removed in pandas 1.0; .iloc selects the same two columns
# by position. Note that `corpus` is rebound here to the training reviews.
corpus, test_corpus, y, yt = train_test_split(
    data.iloc[:,0], data.iloc[:,1], 
    test_size=0.25, random_state=101)

In [32]:
from sklearn.feature_extraction import text
# Count unigrams and bigrams (English stop words removed); the vocabulary is
# learned from the training reviews only.
vectorizer = text.CountVectorizer(ngram_range=(1,2), stop_words='english')
vectorizer.fit(corpus)
TfidF = text.TfidfTransformer()
# Fit the TF-IDF weighting on the training counts, then apply that same
# fitted weighting to the test counts.
X = TfidF.fit_transform(
    vectorizer.transform(corpus)
)
Xt = TfidF.transform(
    vectorizer.transform(test_corpus)
)

In [33]:
from sklearn.svm import LinearSVC
# GridSearchCV lives in sklearn.model_selection (available since 0.18); the
# sklearn.grid_search module was removed in 0.20.
from sklearn.model_selection import GridSearchCV
# Candidate values for the C parameter of the linear SVM.
param_grid = {'C': [0.01, 0.1, 1.0, 10.0, 100.0]}
# cv=3 pins the number of folds to the historical default (newer releases
# default to 5), keeping the search comparable to the original run.
clf = GridSearchCV(LinearSVC(loss='hinge', 
                    random_state=101), param_grid, cv=3)
clf = clf.fit(X, y)
print ("Best parameters: %s" % clf.best_params_)

Best parameters: {'C': 1.0}


In [34]:
from sklearn.metrics import accuracy_score
# Predict on the held-out reviews and compare with their true labels.
solution = clf.predict(Xt)
print("Achieved accuracy: %0.3f" % 
      accuracy_score(yt, solution))

Achieved accuracy: 0.816


In [35]:
# Show the held-out reviews whose predicted label differs from the true one.
print(test_corpus[yt!=solution])

601    There is simply no excuse for something this p...
32     This is the kind of money that is wasted prope...
887    At any rate this film stinks, its not funny, a...
668    Speaking of the music, it is unbearably predic...
408         It really created a unique feeling though.  
413         The camera really likes her in this movie.  
138    I saw "Mirrormask" last night and it was an un...
132    This was a poor remake of "My Best Friends Wed...
291                               Rating: 1 out of 10.  
904    I'm so sorry but I really can't recommend it t...
410    A world better than 95% of the garbage in the ...
55     But I recommend waiting for their future effor...
826    The film deserves strong kudos for taking this...
100            I don't think you will be disappointed.  
352                                    It is shameful.  
171    This movie now joins Revenge of the Boogeyman ...
814    You share General Loewenhielm's exquisite joy ...
218    It's this pandering to t