# Feature Creation and Text Mining in Python


### Links: [local](http://localhost:8888/notebooks/classes/12-text-mining/intro-text-python.ipynb) [github](https://github.com/AnalyticsDojo/materials/blob/master/analyticsdojo/classes/12-text-mining/intro-text-python.ipynb) [slides](http://nbviewer.jupyter.org/format/slides/github/AnalyticsDojo/materials/blob/master/analyticsdojo/classes/12-text-mining/intro-text-python.ipynb#/)

# Agenda
- Simple text fields through matching 
- Processing and using a text corpus
- Word Vectors with word2vec

In [1]:
import numpy as np
import pandas as pd



In [15]:
import csv
import urllib2

# Titanic training and test sets, hosted on GitHub.
url = 'https://raw.githubusercontent.com/RPI-Analytics/MGMT6963-2015/master/data/titanic/train.csv'
url2 = 'https://raw.githubusercontent.com/RPI-Analytics/MGMT6963-2015/master/data/titanic/test.csv'

# pandas can read a CSV straight from a URL and manages the connection itself,
# so no manually opened response object is left unclosed.
# Age is read as float64; its missing entries show up as NaN.
train = pd.read_csv(url, dtype={"Age": np.float64})
test = pd.read_csv(url2, dtype={"Age": np.float64})

# Preview the first rows only rather than rendering the whole frame.
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2,3,1,349909,21.0750,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14,1,0,237736,30.0708,,C


In [16]:
# Print a heading, then display the first five rows of the training data as the cell's result.
print("\n\nTop of the training data:")
train.head()









Top of the training data:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
# The heading previously read "Top of the training data:", copied from the
# cell above, which mislabelled the describe() output.
print("\n\nSummary statistics for the training data:")
train.describe()



Top of the training data:


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [18]:
# List every column next to its pandas dtype.
print("Show data types")
for col in train:
    # A single pre-formatted argument makes print(...) produce the same line
    # on Python 2 and Python 3, matching the print(...) calls used elsewhere.
    print("%s %s" % (col, train[col].dtypes))

Show data types
PassengerId int64
Survived int64
Pclass int64
Name object
Sex object
Age float64
SibSp int64
Parch int64
Ticket object
Fare float64
Cabin object
Embarked object


In [75]:
# Let's look at the Age field. Missing values are shown as "NaN".
train["Age"]

0      22
1      38
2      26
3      35
4      35
5     NaN
6      54
7       2
8      27
9      14
10      4
11     58
12     20
13     39
14     14
15     55
16      2
17    NaN
18     31
19    NaN
20     35
21     34
22     15
23     28
24      8
25     38
26    NaN
27     19
28    NaN
29    NaN
       ..
861    21
862    48
863   NaN
864    24
865    42
866    27
867    31
868   NaN
869     4
870    26
871    47
872    33
873    47
874    28
875    15
876    20
877    19
878   NaN
879    56
880    25
881    33
882    22
883    28
884    25
885    39
886    27
887    19
888   NaN
889    26
890    32
Name: Age, dtype: float64

In [19]:
# Now let's recode: replace the missing ages with the median age.
medianAge = train["Age"].median()
# One pre-formatted argument keeps the printed line identical on Python 2 and 3
# (the two spaces before "years" reproduce the original output exactly).
print("The Median age is: %s  years old." % medianAge)
train["Age"] = train["Age"].fillna(medianAge)

# Option 2: all in one shot!
# This is the compact alternative to the lines above. Because the gaps were
# already filled by Option 1, running it here changes nothing.
train["Age"] = train["Age"].fillna(train["Age"].median())
train["Age"]




The Median age is: 28.0  years old.


0      22
1      38
2      26
3      35
4      35
5      28
6      54
7       2
8      27
9      14
10      4
11     58
12     20
13     39
14     14
15     55
16      2
17     28
18     31
19     28
20     35
21     34
22     15
23     28
24      8
25     38
26     28
27     19
28     28
29     28
       ..
861    21
862    48
863    28
864    24
865    42
866    27
867    31
868    28
869     4
870    26
871    47
872    33
873    47
874    28
875    15
876    20
877    19
878    28
879    56
880    25
881    33
882    22
883    28
884    25
885    39
886    27
887    19
888    28
889    26
890    32
Name: Age, dtype: float64

In [20]:
# Recode the port of embarkation as a number, using boolean row selection with .loc.
# Passengers with no recorded port are assigned "S" first.
train["Embarked"] = train["Embarked"].fillna("S")
for port, code in (("S", 0), ("C", 1), ("Q", 2)):
    is_port = train["Embarked"] == port
    train.loc[is_port, "EmbarkedRecode"] = code

In [33]:
# A lambda is a small anonymous function; Series.map applies it to every value.
# You can read more about lambda functions here:
# http://www.python-course.eu/lambda.php
gender_fn = lambda sex: 1 if sex != 'male' else 0
train['Gender'] = train['Sex'].map(gender_fn)

# The function can also be handed to map() directly, in one shot.
train['NameLength'] = train['Name'].map(len)
train['Age2'] = train['Age'].map(lambda age: age * age)
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,EmbarkedRecode,Gender,NameLength,Age2,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.2500,,S,0,0,23,484,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C,1,1,51,1444,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.9250,,S,0,1,22,676,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1000,C123,S,0,1,44,1225,1
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.0500,,S,0,0,24,1225,1
5,6,0,3,"Moran, Mr. James",male,28,0,0,330877,8.4583,,Q,2,0,16,784,1
6,7,0,1,"McCarthy, Mr. Timothy J",male,54,0,0,17463,51.8625,E46,S,0,0,23,2916,1
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2,3,1,349909,21.0750,,S,0,0,30,4,0
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27,0,2,347742,11.1333,,S,0,1,49,729,1
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14,1,0,237736,30.0708,,C,1,1,35,196,1


In [32]:

#We can start to create small functions that look for a string.
def has_title(name, titles=('Mr.', 'Mrs.', 'Miss.', 'Dr.', 'Sir.')):
    """Return True if `name` contains any of the given title strings.

    name   -- the string to search (for example a passenger name).
    titles -- the substrings to look for. The default is the list this
              function used to hard-code, so existing one-argument calls
              behave exactly as before.

    The check is a case-sensitive substring match.
    """
    return any(title in name for title in titles)

#Wrap has_title so the result is the integer 1 or 0 instead of True or False.
title_fn = lambda passenger_name: int(has_title(passenger_name))
#Apply it to every value of the Name column and store the result as Title.
train['Title'] = train['Name'].map(title_fn)
train


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,EmbarkedRecode,Gender,NameLength,Age2,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.2500,,S,0,0,23,484,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C,1,1,51,1444,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.9250,,S,0,1,22,676,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1000,C123,S,0,1,44,1225,1
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.0500,,S,0,0,24,1225,1
5,6,0,3,"Moran, Mr. James",male,28,0,0,330877,8.4583,,Q,2,0,16,784,1
6,7,0,1,"McCarthy, Mr. Timothy J",male,54,0,0,17463,51.8625,E46,S,0,0,23,2916,1
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2,3,1,349909,21.0750,,S,0,0,30,4,0
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27,0,2,347742,11.1333,,S,0,1,49,729,1
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14,1,0,237736,30.0708,,C,1,1,35,196,1


In [12]:
#Writing to File
# reindex(columns=...) keeps just the listed columns and fills any that the
# frame lacks with NaN. It replaces the deprecated .ix indexer used here before.
# NOTE(review): presumably the test set has no 'Survived' column, in which case
# the file is written with empty predictions -- assign test['Survived'] first.
submission = test.reindex(columns=['PassengerId', 'Survived'])

# Relative path: the file is written to the current working directory.
submission.to_csv('submission.csv', index=False)

In [13]:
#We can see the file here.
!ls


1_twitter.ipynb				 Untitled3.ipynb
BeautifulSoup.ipynb			 _Appendix B - OAuth Primer.ipynb
Chapter 0 - Preface.ipynb		 data
Chapter 1 - Mining Twitter.ipynb	 downjason.ipynb.json
Chapter 4 - Mining Google+.ipynb	 example.Rmd
Chapter 9 - Twitter Cookbook.ipynb	 example.html
Class 3 More Python Basics. .ipynb	 index.html
Lab 3 - Twitter-Copy1.ipynb		 index.html.1
Lab 3 - Twitter.ipynb			 install.sh
Lab2-webmining.ipynb			 lab2solution.ipynb
Lab2.ipynb				 lab4.Rmd
Lab3_Twitter_solution.ipynb		 lab4.html
Lab4-Solution.ipynb			 model-figure
Lab4.ipynb				 model.Rpres
Lab6.Rmd				 model.md
Lab7 - Feature Creation in python.ipynb  nestedforloop.R
R					 spark_mooc_version
Titanic.ipynb				 spark_notebook.py
Untitled.ipynb				 submission.csv
Untitled1.ipynb				 titantic_train.csv
Untitled2.ipynb				 titantic_train.csv.1



1. Create a function that recodes the data for Name if there is a 'Mc' in it.  

## Introduction to Text Mining in Python
These exercises were adapted from Mining the Social Web, 2nd Edition ([see the original here](https://github.com/ptwobrussell/Mining-the-Social-Web-2nd-Edition/)). 
The Simplified BSD License governs its use.


In [35]:
 corpus = { 
 'a' : "Mr. Green killed Colonel Mustard in the study with the candlestick. \
Mr. Green is not a very nice fellow.",
 'b' : "Professor Plum has a green plant in his study.",
 'c' : "Miss Scarlett watered Professor Plum's green plant while he was away \
from his office last week."
}

#This will separate the documents (sentences) into terms
terms = {
 'a' : [ i.lower() for i in corpus['a'].split() ],
 'b' : [ i.lower() for i in corpus['b'].split() ],
 'c' : [ i.lower() for i in corpus['c'].split() ]
 }
terms

{'a': ['mr.',
  'green',
  'killed',
  'colonel',
  'mustard',
  'in',
  'the',
  'study',
  'with',
  'the',
  'candlestick.',
  'mr.',
  'green',
  'is',
  'not',
  'a',
  'very',
  'nice',
  'fellow.'],
 'b': ['professor',
  'plum',
  'has',
  'a',
  'green',
  'plant',
  'in',
  'his',
  'study.'],
 'c': ['miss',
  'scarlett',
  'watered',
  'professor',
  "plum's",
  'green',
  'plant',
  'while',
  'he',
  'was',
  'away',
  'from',
  'his',
  'office',
  'last',
  'week.']}

In [41]:
from math import log

# XXX: Replace these with any query terms that occur in the corpus variable

#These are the terms we would like to search for.
QUERY_TERMS = ['mr.', 'green']

#This calculates the term frequency, optionally normalized by the document length.
def tf(term, doc, normalize):
    """Count how often `term` occurs in `doc`.

    term      -- the word to look for; compared case-insensitively.
    doc       -- the document as one string. It is lower-cased and split on
                 whitespace, so punctuation stays attached to its word.
    normalize -- if true, divide the count by the number of words in `doc`.

    Returns a float. With `normalize` set, an empty document yields 0.0
    instead of raising ZeroDivisionError.
    """
    words = doc.lower().split()
    count = float(words.count(term.lower()))
    if normalize:
        # Guard the division: a document with no words has no term frequency.
        return count / len(words) if words else 0.0
    return count

# Show the corpus, one document per line.
for (k, v) in sorted(corpus.items()):
    print('%s : %s' % (k, v))
print('')

# Accumulator for the cumulative tf-idf score of each document. It is only
# initialised here; nothing in this cell updates it.
query_scores = dict.fromkeys(corpus, 0)


def show_tf(normalize):
    """Print the term frequency of every query term in every document."""
    #This starts the search for each query term
    for term in [t.lower() for t in QUERY_TERMS]:
        #This starts the search for each document in the corpus
        for doc in sorted(corpus):
            print('TF(%s): %s %s' % (doc, term, tf(term, corpus[doc], normalize)))


show_tf(True)

print('')
print("This does the same thing but unnormalized.")
show_tf(False)
    
  


    

a : Mr. Green killed Colonel Mustard in the study with the candlestick. Mr. Green is not a very nice fellow.
b : Professor Plum has a green plant in his study.
c : Miss Scarlett watered Professor Plum's green plant while he was away from his office last week.

TF(a): mr. 0.105263157895
TF(b): mr. 0.0
TF(c): mr. 0.0
TF(a): green 0.105263157895
TF(b): green 0.111111111111
TF(c): green 0.0625

This does the same thing but unnormalized
TF(a): mr. 2.0
TF(b): mr. 0.0
TF(c): mr. 0.0
TF(a): green 2.0
TF(b): green 1.0
TF(c): green 1.0


In [43]:
def idf(term, corpus):
    """Inverse document frequency of `term`, offset by 1.

    term   -- the word to look for; compared case-insensitively.
    corpus -- a sized collection of document strings (this notebook passes
              corpus.values()). Each one is lower-cased and split on
              whitespace before the lookup.

    Returns 1.0 + log(N / n), where N is the number of documents and n is the
    number of them that contain `term`. Returns 1.0 when no document does.
    """
    needle = term.lower()
    num_texts_with_term = sum(1 for text in corpus
                              if needle in text.lower().split())

    # tf-idf multiplies this value by a normalized tf, which is at most 1.
    # Multiplying two values below 1 gives a result smaller than either of
    # them, so the 1.0 offset keeps idf at 1 or above for consistent scoring.
    if num_texts_with_term == 0:
        # No document contains the term: use the neutral weight instead of
        # dividing by zero.
        return 1.0
    return 1.0 + log(float(len(corpus)) / num_texts_with_term)

#This prints the IDF of every query term, computed over all the documents.
for term in [t.lower() for t in QUERY_TERMS]:
    # One pre-formatted argument prints the same line on Python 2 and 3.
    print('IDF: %s %s' % (term, idf(term, corpus.values())))
        




IDF: mr. 2.09861228867
IDF: green 1.0


In [46]:

#TF-IDF is the product of the two measures
def tf_idf(term, doc, corpus):
    """Score `term` for one document: its normalized tf times its idf over `corpus`."""
    term_frequency = tf(term, doc, True)
    inverse_doc_frequency = idf(term, corpus)
    return term_frequency * inverse_doc_frequency

# Cumulative tf-idf score per document, summed over all the query terms.
# The keys come from corpus, so every document gets an entry.
query_scores = dict.fromkeys(corpus, 0)
for term in [t.lower() for t in QUERY_TERMS]:
    # The two components first: tf per document, then the corpus-wide idf.
    for doc in sorted(corpus):
        print('TF(%s): %s %s' % (doc, term, tf(term, corpus[doc], True)))
    print('IDF: %s %s' % (term, idf(term, corpus.values())))
    print('')

    # Their product, added to the running total of the document.
    for doc in sorted(corpus):
        score = tf_idf(term, corpus[doc], corpus.values())
        print('TF-IDF(%s): %s %s' % (doc, term, score))
        query_scores[doc] += score
    print('')

print("Overall TF-IDF scores for query '%s'" % (' '.join(QUERY_TERMS), ))
for (doc, score) in sorted(query_scores.items()):
    print('%s %s' % (doc, score))

TF(a): mr. 0.105263157895
TF(b): mr. 0.0
TF(c): mr. 0.0
IDF: mr. 2.09861228867

TF-IDF(a): mr. 0.220906556702
TF-IDF(b): mr. 0.0
TF-IDF(c): mr. 0.0

TF(a): green 0.105263157895
TF(b): green 0.111111111111
TF(c): green 0.0625
IDF: green 1.0

TF-IDF(a): green 0.105263157895
TF-IDF(b): green 0.111111111111
TF-IDF(c): green 0.0625

Overall TF-IDF scores for query 'mr. green'
a 0.326169714597
b 0.111111111111
c 0.0625
