Challenge: Build your own NLP model

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [2]:
import nltk

For this challenge, you will need to choose a corpus of data from nltk or another source that includes categories you can predict and create an analysis pipeline that includes the following steps:

1. Data cleaning / processing / language parsing

In [3]:
# Import the data for movie reviews and look at its contents
# then we will clean the data
from nltk.corpus import movie_reviews, stopwords

In [4]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [5]:
print ('total reviews', len(movie_reviews.fileids()))
print ('categories', movie_reviews.categories())
print ('positive count', len(movie_reviews.fileids('pos')))
print ('negative count', len(movie_reviews.fileids('neg')))

total reviews 2000
categories ['neg', 'pos']
positive count 1000
negative count 1000


In [6]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))
 
print(len(documents))
print('')
print(documents[0])

2000

(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg')


In [7]:
documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [8]:
from random import shuffle 
shuffle(documents)

In [9]:
all_words = [word.lower() for word in movie_reviews.words()]

In [10]:
all_words[0:5]

['plot', ':', 'two', 'teen', 'couples']

In [11]:
from nltk import FreqDist
 
all_words_frequency = FreqDist(all_words)
 
print (all_words_frequency)
print('')
# print 10 most frequently occurring words
print (all_words_frequency.most_common(10))

<FreqDist with 39768 samples and 1583820 outcomes>

[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822)]


In [12]:
stopwords_english = stopwords.words('english')
# create a new list of words by removing stopwords from all_words
all_words_clean = [word for word in all_words if word not in stopwords_english]
 
# print the first 10 words
print(all_words_clean[:10])

['plot', ':', 'two', 'teen', 'couples', 'go', 'church', 'party', ',', 'drink']


In [13]:
import string
# create a new list of words by removing punctuation from all_words
all_words_clean = [word for word in all_words_clean if word not in string.punctuation]
 
# print the first 10 words
print (all_words_clean[:10])

['plot', 'two', 'teen', 'couples', 'go', 'church', 'party', 'drink', 'drive', 'get']


In [14]:
all_words_frequency = FreqDist(all_words_clean)
print (all_words_frequency)

<FreqDist with 39586 samples and 710578 outcomes>


In [15]:
most_common_words = all_words_frequency.most_common(2000)
print (most_common_words[:10])
print('')
print (most_common_words[1990:])
print('')
word_features = [item[0] for item in most_common_words]
print (word_features[:10])

[('film', 9517), ('one', 5852), ('movie', 5771), ('like', 3690), ('even', 2565), ('good', 2411), ('time', 2411), ('story', 2169), ('would', 2109), ('much', 2049)]

[('asking', 64), ('niro', 64), ('path', 64), ('aware', 64), ('remain', 64), ('rain', 64), ('exact', 64), ('international', 64), ('moved', 64), ('anna', 64)]

['film', 'one', 'movie', 'like', 'even', 'good', 'time', 'story', 'would', 'much']


In [16]:
import spacy

In [17]:
nlp = spacy.load('en')

In [18]:
sentences = [nlp(str(parag[0])) for parag in documents]

In [19]:
pos_ = [parag[1] for parag in documents]

In [20]:
sentences = [(nlp(str(parag[0])), parag[1]) for parag in documents]

In [21]:
l = []
for parag in documents:
    p = str(parag[0])
    l.append(nlp(''.join(p)))

In [22]:
sentences = [sent for sent in l for sub in sent.sents]

In [23]:
len(sentences)

2002

In [24]:
len(pos_)

2000

In [25]:
sentences = sentences[:len(sentences)-2]

In [26]:
len(sentences)

2000

2. Create features using two different NLP methods: For example, BoW vs tf-idf.
3. Use the features to fit supervised learning models for each feature set to predict the category outcomes.

In [27]:
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences['sentences']
    df['text_source'] = sentences['sentiment']
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 500 == 0:
            print("Processing row {}".format(i))
            
    return df

In [28]:
from collections import Counter
# Set up the bags.
allwords = [bag_of_words(sent) for sent in sentences]

In [29]:
allwords[0:10]

[['dicillo', 'direct', 'superficial', 'tom'],
 ['little', 'actually', 'saint'],
 ['s', 'line', 'thin'],
 ['film', 'see', 'worth'],
 ['year', '14', 'lampoon', 'ago', 'national'],
 ['fiction', 'realm', 'science'],
 ['good', 'formula', 'feel'],
 ['reflect', 'bedazzle'],
 ['douglas', 'kirk', 'rare'],
 ['thing']]

In [30]:
common_word = set([sent for words in allwords for sent in words])

In [31]:
sentences_df = pd.DataFrame({'sentences': sentences, 'sentiment':pos_})

In [32]:
len(common_word)

2444

In [33]:
len(most_common_words)

2000

In [34]:
clean_common_word = [word for word in common_word if word not in stopwords_english]
clean_common_word = [word for word in clean_common_word if word not in string.punctuation]
len(clean_common_word)                   

2423

In [35]:
word_counts = bow_features(sentences_df, clean_common_word)
word_counts.head()

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500


Unnamed: 0,attitude,travel,inch,miracle,coincide,rosenthal,mess,nicolas,tod,describe,...,trekkie,triumphant,chew,ghost,beauty,highly,lange,kar,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"([, ', tom, ', ,, ', dicillo, ', ,, ', directs...",neg
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"([, ', the, ', ,, ', saint, ', ,, ', was, ', ,...",pos
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"([, ', there, ', ,, "", ', "", ,, ', s, ', ,, ',...",pos
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"([, ', this, ', ,, ', film, ', ,, ', is, ', ,,...",neg
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"([, ', 14, ', ,, ', years, ', ,, ', ago, ', ,,...",neg


In [36]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split

rfc = ensemble.RandomForestClassifier()
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

Training set score: 0.9575

Test set score: 0.52


In [37]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(1200, 2423) (1200,)
Training set score: 0.9108333333333334

Test set score: 0.5125


In [38]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.6916666666666667

Test set score: 0.4875


In [48]:
# random forest had the highest test score
from sklearn.model_selection import cross_val_score
print(cross_val_score(rfc.fit(X, Y), X, Y).mean())

0.5044985104865345


In [40]:
sentences_df = sentences_df.astype('str')

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test = train_test_split(sentences_df, test_size=0.4, random_state=0)

vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )

In [42]:
#Applying the vectorizer
tfidf = vectorizer.fit_transform(sentences_df['sentences'])
print("Number of features: %d" % tfidf.get_shape()[1])

Number of features: 848


In [43]:
#splitting into training and test sets
X_train2, X_test2 = train_test_split(tfidf, test_size=0.4, random_state=0)


#Reshapes the vectorizer output into something people can read
X_train_csr = X_train2.tocsr()

#number of paragraphs
n = X_train_csr.shape[0]
#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]
#List of features
terms = vectorizer.get_feature_names()
#for each paragraph, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_csr[i, j]

In [44]:
#Keep in mind that the log base 2 of 1 is 0, so a tf-idf score of 0 indicates that the word was present once in that sentence.
print('Original sentence:', X_train.loc[891])

Original sentence: sentences    ['in', '"', 'magic', 'town', '"', ',', 'jimmy'...
sentiment                                                  pos
Name: 891, dtype: object


In [45]:
print('Tf_idf vector:', tfidf_bypara[891])

Tf_idf vector: {'review': 0.3529432108644767, 'entertainment': 0.5261192114969359, 'fully': 0.5470967305689717, 'loaded': 0.5470967305689717}


In [46]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

#Our SVD data reducer.  We are going to reduce the feature space from 1379 to 130.
svd= TruncatedSVD(130)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train2)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#Looking at what sorts of paragraphs our solution considers similar, for the first five identified topics
paras_by_component=pd.DataFrame(X_train_lsa,index=X_train)
for i in range(5):
    print('Component {}:'.format(i))
    print(paras_by_component.loc[:,i].sort_values(ascending=False)[0:10])

Percent variance captured by all components: 50.379780636707395
Component 0:
(['any', 'movie', 'about', 'the', 'underground', ...], pos)            0.986503
(['as', 'with', 'any', 'gen', '-', 'x', 'mtv', 'movie', ...], pos)     0.986503
(['ugh', '.', 'that', 'about', 'sums', 'this', 'movie', ...], neg)     0.986503
(['this', 'is', 'the', 'movie', 'not', 'the', 'perfume', ...], pos)    0.986503
(['this', 'is', 'the', 'movie', 'that', 'could', ...], neg)            0.986503
(['here', 'is', 'a', 'movie', 'that', 'sadly', ...], neg)              0.986503
(['"', 'gordy', '"', 'is', 'not', 'a', 'movie', ',', ...], pos)        0.986503
(['i', 'looked', 'at', 'the', '"', 'internet', 'movie', ...], neg)     0.986503
(['a', 'movie', 'about', 'divorce', 'and', 'custody', ...], neg)       0.986503
(['a', 'movie', 'that', "'", 's', 'been', 'as', ...], pos)             0.986503
Name: 0, dtype: float64
Component 1:
(['every', 'once', 'in', 'a', 'while', ',', 'a', 'film', ...], pos)    0.988513
(['eve

5. Pick one of the models and try to increase accuracy by at least 5 percentage points.

In [49]:
# BOW vastly overfit and performer worse. lets try to improve idf
vectorizer = TfidfVectorizer(max_df=0.35, # drop words that occur in more than half the paragraphs
                             min_df=1, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )

tfidf = vectorizer.fit_transform(sentences_df['sentences'])
print("Number of features: %d" % tfidf.get_shape()[1])

#splitting into training and test sets
X_train2, X_test2 = train_test_split(tfidf, test_size=0.4, random_state=0)


#Reshapes the vectorizer output into something people can read
X_train_csr = X_train2.tocsr()

#number of paragraphs
n = X_train_csr.shape[0]
#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]
#List of features
terms = vectorizer.get_feature_names()
#for each paragraph, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_csr[i, j]

Number of features: 2702


In [50]:
#Our SVD data reducer.  We are going to reduce the feature space from 1379 to 130.
svd= TruncatedSVD(130)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train2)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

Percent variance captured by all components: 33.93463522067122


In [56]:
vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=3, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )

tfidf = vectorizer.fit_transform(sentences_df['sentences'])
print("Number of features: %d" % tfidf.get_shape()[1])

#splitting into training and test sets
X_train2, X_test2 = train_test_split(tfidf, test_size=0.4, random_state=0)


#Reshapes the vectorizer output into something people can read
X_train_csr = X_train2.tocsr()

#number of paragraphs
n = X_train_csr.shape[0]
#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]
#List of features
terms = vectorizer.get_feature_names()
#for each paragraph, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_csr[i, j]

Number of features: 432


In [57]:
#Our SVD data reducer.  We are going to reduce the feature space from 1379 to 130.
svd= TruncatedSVD(130)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train2)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

Percent variance captured by all components: 65.74068325145156
