In [0]:
import math
import numpy as np
import pandas as pd
import requests
import json
import re
import nltk
from nltk.tokenize import word_tokenize, MWETokenizer
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources used further down; packages that are already
# installed are reported as up-to-date.
nltk.download('punkt')  # tokenizer models needed by word_tokenize
nltk.download('wordnet')  # lexical database needed by WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')  # tagger model needed by nltk.pos_tag
# nltk.download('all')  # alternative: fetch every NLTK package instead

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [0]:
# Load the gzipped, line-delimited JSON review file straight from its URL
# (one JSON record per line, hence lines=True).
mydata = pd.read_json(
    "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Office_Products_5.json.gz",
    lines=True,
)

In [0]:
# Keep only the review text of the first 10 reviews.
# The column is selected by name instead of by position 4 so that this cell can
# be re-run safely: after the first run `mydata` holds a single column, and a
# positional lookup at index 4 would raise IndexError.
# .copy() gives an independent frame so later column assignments do not touch
# (or warn about) the originally loaded data.
mydata = mydata[["reviewText"]].head(10).copy()

In [0]:
# Preview the first rows of the reduced frame (only the reviewText column remains).
mydata.head()

Unnamed: 0,reviewText
0,"I bought my first HP12C in about 1984 or so, a..."
1,WHY THIS BELATED REVIEW? I feel very obliged t...
2,I have an HP 48GX that has been kicking for mo...
3,I've started doing more finance stuff recently...
4,For simple calculations and discounted cash fl...


In [0]:
def wordtoken(string):
  """Tokenize a review and derive simple lexical features from it.

  Parameters
  ----------
  string : str
    Raw review text. Any value that is not a ``str`` (for example ``None``
    or the float ``NaN`` pandas uses for missing text) and any blank string
    is treated as "no text" instead of raising.

  Returns
  -------
  dict
    A new dictionary with these keys:

    * ``"words"``      - list of tokens produced by ``word_tokenize``.
    * ``"mwe words"``  - list returned by ``MWETokenizer().tokenize`` for
      the whitespace-split text.
    * ``"POS"``        - list of ``(token, tag)`` pairs from
      ``nltk.pos_tag`` applied to ``"words"``.
    * ``"root words"`` - a single space-joined string of the
      whitespace-split words, each lemmatized as a verb.
    * ``"adjectives"`` - the ``(token, tag)`` pairs from ``"POS"`` whose tag
      starts with ``JJ``.

    For "no text" input every list is empty and ``"root words"`` is ``""``.
  """
  # Missing reviews reach this function as None/NaN when it is applied over a
  # pandas column; return the empty result shape rather than crashing inside
  # the tokenizer.  Blank strings produce the same empty result.
  if not isinstance(string, str) or not string.strip():
    return {
      "words": [],
      "mwe words": [],
      "POS": [],
      "root words": "",
      "adjectives": [],
    }

  # Split on whitespace once; both the MWE tokenizer and the lemmatizer use it.
  whitespace_tokens = string.split()

  # Tokens using NLTK
  words = word_tokenize(string)

  # Tokens using MWE Tokenizer (no multi-word expressions are registered here)
  mwetoken = MWETokenizer().tokenize(whitespace_tokens)

  # Parts of Speech
  pos = nltk.pos_tag(words)

  # Root Words
  # NOTE(review): lemmatization runs on the whitespace-split text, not on
  # `words`, so a word with attached punctuation is handed to the lemmatizer
  # together with that punctuation.  Kept as-is to preserve existing output.
  lem = WordNetLemmatizer()
  roots = ' '.join(lem.lemmatize(word, 'v') for word in whitespace_tokens)

  # Adjectives: every tag beginning with 'JJ'
  adj = [tagged for tagged in pos if tagged[1][:2] == 'JJ']

  # Returning Dictionary
  return {
    "words": words,
    "mwe words": mwetoken,
    "POS": pos,
    "root words": roots,
    "adjectives": adj,
  }

In [0]:
# Run wordtoken over every review and attach the resulting dictionaries as a
# new "word_tokenizer" column.
mydata = mydata.assign(word_tokenizer=mydata["reviewText"].map(wordtoken))

In [0]:
# Preview the frame again to check the new word_tokenizer column next to reviewText.
mydata.head()

Unnamed: 0,reviewText,word_tokenizer
0,"I bought my first HP12C in about 1984 or so, a...","{'words': ['I', 'bought', 'my', 'first', 'HP12..."
1,WHY THIS BELATED REVIEW? I feel very obliged t...,"{'words': ['WHY', 'THIS', 'BELATED', 'REVIEW',..."
2,I have an HP 48GX that has been kicking for mo...,"{'words': ['I', 'have', 'an', 'HP', '48GX', 't..."
3,I've started doing more finance stuff recently...,"{'words': ['I', ''ve', 'started', 'doing', 'mo..."
4,For simple calculations and discounted cash fl...,"{'words': ['For', 'simple', 'calculations', 'a..."


In [0]:
# Show the full feature dictionary of the first review.  The column is picked
# by name rather than by position 1 so the cell keeps working if columns are
# added or reordered.
print(mydata["word_tokenizer"].iloc[0])

{'words': ['I', 'bought', 'my', 'first', 'HP12C', 'in', 'about', '1984', 'or', 'so', ',', 'and', 'it', 'served', 'me', 'faithfully', 'until', '2002', 'when', 'I', 'lost', 'it', 'while', 'travelling', '.', 'I', 'searched', 'for', 'another', 'one', 'to', 'replace', 'it', ',', 'but', 'found', 'one', 'difficult', 'to', 'come', 'by', 'in', 'my', 'area', '.', 'So', ',', 'I', 'decided', 'to', 'buy', 'up', 'and', 'purchased', 'an', 'HP', '49G', '.', 'What', 'a', 'mistake', '!', 'I', 'know', 'that', 'many', 'people', 'view', 'the', 'HP', '49G', '(', 'now', '49G+', ')', 'as', 'the', 'flagship', 'of', 'the', 'HP', 'line', ',', 'but', 'for', 'me', 'that', 'was', 'a', 'disaster.The', '49G', 'may', 'be', 'powerful', ',', 'but', 'using', 'it', 'is', 'extremely', 'counterintuitive', '...', 'and', 'the', 'manual', 'was', 'sketchy', 'at', 'best', '.', 'The', '12C', ',', 'on', 'the', 'other', 'hand', ',', 'does', 'what', 'I', 'need', 'in', 'a', 'way', 'that', 'makes', 'good', 'sense', 'to', 'me.If', 'you