In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import brown
import os

from nltk.corpus import stopwords
from collections import Counter
import nltk
#nltk.download('brown')
#nltk.download('punkt')

import gensim
from gensim.models import word2vec

In [2]:
# Million word corpus of text from 500 different sources. 
brown.raw()[:1000]

"\n\n\tThe/at Fulton/np-tl County/nn-tl Grand/jj-tl Jury/nn-tl said/vbd Friday/nr an/at investigation/nn of/in Atlanta's/np$ recent/jj primary/nn election/nn produced/vbd ``/`` no/at evidence/nn ''/'' that/cs any/dti irregularities/nns took/vbd place/nn ./.\n\n\n\tThe/at jury/nn further/rbr said/vbd in/in term-end/nn presentments/nns that/cs the/at City/nn-tl Executive/jj-tl Committee/nn-tl ,/, which/wdt had/hvd over-all/jj charge/nn of/in the/at election/nn ,/, ``/`` deserves/vbz the/at praise/nn and/cc thanks/nns of/in the/at City/nn-tl of/in-tl Atlanta/np-tl ''/'' for/in the/at manner/nn in/in which/wdt the/at election/nn was/bedz conducted/vbn ./.\n\n\n\tThe/at September-October/np term/nn jury/nn had/hvd been/ben charged/vbn by/in Fulton/np-tl Superior/jj-tl Court/nn-tl Judge/nn-tl Durwood/np Pye/np to/to investigate/vb reports/nns of/in possible/jj ``/`` irregularities/nns ''/'' in/in the/at hard-fought/jj primary/nn which/wdt was/bedz won/vbn by/in Mayor-nominate/nn-tl Ivan/np A

In [3]:
def text_cleaner(text):
    
    text = re.sub(r'--',' ',text)
    text = re.sub('\/\S+ ', ' ', text)
    text = re.sub(' \.\/\.', '.', text)
    text = re.sub('`` ', '\"', text)
    text = re.sub(' \\\'\\\'', '\"', text)
    text = ' '.join(text.split())
    
    return text

def bow_features(data, common_words):
    
    bow = []
    bow.append(list(data.iloc[:, 1]))
    bow.append(list(data.iloc[:, 0]))
    
    for i in range(len(common_words)):
        bow.append(list(np.zeros(len(bow[0]))))
    
    for i, text in enumerate(bow[0]):
        
        for word in text.split():
            for word2 in range(len(common_words)):
                if word == common_words[word2]:
                    bow[word2 + 2][i] += 1
    
    return bow

In [4]:
brown_clean = text_cleaner(brown.raw())
print(brown_clean[:1000])
print('\nLength of Brown:', len(brown_clean))

The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced "no evidence" that any irregularities took place. The jury further said in term-end presentments that the City Executive Committee , which had over-all charge of the election , "deserves the praise and thanks of the City of Atlanta" for the manner in which the election was conducted. The September-October term jury had been charged by Fulton Superior Court Judge Durwood Pye to investigate reports of possible "irregularities" in the hard-fought primary which was won by Mayor-nominate Ivan Allen Jr.. "Only a relative handful of such reports was received" , the jury said , "considering the widespread interest in the election , the number of voters and the size of this city". The jury said it did find that many of Georgia's registration and election laws "are outmoded or inadequate and often ambiguous". It recommended that Fulton legislators act "to have these laws studied and revised to

In [5]:
nlp = spacy.load('en')
nlp.max_length=6500000
brown_doc = nlp(brown_clean)

In [6]:
sentences = []
for sentence in brown_doc.sents:
    sentence = [
        token.lemma_.lower()
        for token in sentence
        if not token.is_punct
        # Keeping stops in
    ]
    sentences.append(sentence)


print(sentences[20])
print('We have {} sentences and {} tokens.'.format(len(sentences), len(brown_clean)))

['regard', 'atlanta', "'s", 'new', 'multi', 'million', 'dollar', 'airport', 'the', 'jury', 'recommend', 'that', 'when', 'the', 'new', 'management', 'take', 'charge', 'january', '1', 'the', 'airport', 'be', 'operate', 'in', 'a', 'manner', 'that', 'will', 'eliminate', 'political', 'influence']
We have 57952 sentences and 6052357 tokens.


In [7]:
vector_size = 100

model = word2vec.Word2Vec(
    sentences,
    workers=3,     
    min_count=10,  
    window=8,      
    sg=0,          # Use CBOW because our corpus is small.
    sample=.001 ,  # Penalize frequent words.
    size=vector_size,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

print('done!')

done!


In [19]:
reviews = pd.read_csv('/Data Science/word2vec-nlp-tutorial/labeledTrainData.tsv', sep='\t')
reviews.drop('id', 1, inplace=True)

sentence_vector_list = []
for sentence in reviews.review:
    split = sentence.split()
    total_vec = np.zeros(vector_size)
    word_count = 0
    for word in split:
        try:
            total_vec += model.wv.get_vector(word.lower())
            word_count += 1
        except:
            None
    total_vec /= word_count
    sentence_vector_list.append(total_vec)

In [20]:
review_df = pd.DataFrame()
review_df['Sentiment'] = reviews.sentiment
for i in range(vector_size):
    vector_element_list = []
    for review in range(len(review_df)):
        vector_element_list.append(sentence_vector_list[review][i])
    review_df['Element {}'.format(i)] = vector_element_list
    
review_df['RawReview'] = reviews.review

In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X = review_df.iloc[:, 1:-1]
Y = review_df.Sentiment

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size= 0.2)

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc.score(X_test, y_test)



0.6304

In [46]:
num_common_words = 100
common_text = []
for text in reviews.review:
    for word in text.split():
        common_text.append(word)
common_words = [item[0] for item in Counter(common_text).most_common(num_common_words)]

word_counts = bow_features(reviews, common_words)

bow_reviews = pd.DataFrame()
bow_reviews['Sentiment'] = review_df.Sentiment
for feature in range(num_common_words):
    bow_reviews[common_words[feature]] = word_counts[feature + 2]
bow_reviews['Text'] = review_df.RawReview

In [49]:
X = bow_reviews.iloc[:, 1:-1]
Y = bow_reviews.Sentiment

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size= 0.2)

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc.score(X_test, y_test)



0.6504

In [50]:
# Larger vector size and more common words and skip gram instead of cbow

vector_size = 200

model = word2vec.Word2Vec(
    sentences,
    workers=3,     
    min_count=10,  
    window=8,      
    sg=1,          # Use CBOW because our corpus is small.
    sample=.001 ,  # Penalize frequent words.
    size=vector_size,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

print('done!')

done!


In [52]:
num_common_words = 500
common_text = []
for text in reviews.review:
    for word in text.split():
        common_text.append(word)
common_words = [item[0] for item in Counter(common_text).most_common(num_common_words)]

word_counts = bow_features(reviews, common_words)

bow_reviews = pd.DataFrame()
bow_reviews['Sentiment'] = review_df.Sentiment
for feature in range(num_common_words):
    bow_reviews[common_words[feature]] = word_counts[feature + 2]
bow_reviews['Text'] = review_df.RawReview

In [53]:
X = bow_reviews.iloc[:, 1:-1]
Y = bow_reviews.Sentiment

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size= 0.2)

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc.score(X_test, y_test)



0.7062

In [None]:
# Able to improve BOW model.