In [1]:
# All notebook imports live in this cell: standard library, then third-party, then local.
import gzip
import json

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

import nlp_lib as nl

## Data Loading and Cleaning

In [2]:
# Generator over the raw gzipped JSON-lines file; each yielded item is one decoded record
def parse(path):
  """Lazily yield one decoded JSON object per non-empty line of a gzipped file.

  Args:
    path: Location of a gzip-compressed file holding one JSON document per line.

  Yields:
    The object produced by json.loads() for each non-blank line, in file order.

  Raises:
    OSError: If the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # The context manager guarantees the handle is closed once the generator is
  # exhausted, closed, or garbage collected, instead of leaking it.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # Skip blank lines (e.g. a trailing newline) rather than crash in json.loads
      if not l.strip():
        continue
      yield json.loads(l)

In [3]:
# Drain the parse() generator and wrap every record in an nl.Reviews object
def get_reviews(path):
  """Return a list holding one nl.Reviews per record yielded by parse(path).

  Each nl.Reviews is constructed from the record's 'reviewText' and 'overall'
  fields, in the order the records appear in the file.
  """
  return [nl.Reviews(record['reviewText'], record['overall']) for record in parse(path)]

In [4]:
# Load the whole gzipped review file into memory as a list (the return value of get_reviews)
reviews = get_reviews("data/Appliances_5.json.gz")

## Create "Bag of Words" Feature Matrix for Training/Test Data

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
# Use sklearn's train_test_split to divide the reviews into training and test sets.
# train_size=0.6 places 60% of the reviews in the training set; the rest form the test set.
# random_state=9 seeds the shuffle so the exact same split can be recreated later.
review_train_set, review_test_set = train_test_split(reviews, train_size=0.6, random_state=9)

In [16]:
# Separate each split into model inputs (x: the review text) and class labels (y: the sentiment)
review_train_feat_x = [review.review_text for review in review_train_set]
review_test_feat_x = [review.review_text for review in review_test_set]

review_train_feat_y = [review.sentiment for review in review_train_set]
review_test_feat_y = [review.sentiment for review in review_test_set]

In [27]:
# Create a new CountVectorizer, learn its vocabulary from the training reviews only,
# and build the training bag-of-words matrix in the same pass. fit_transform() is
# equivalent to fit() followed by transform() on the same data, but it avoids
# tokenizing the training text twice.
review_vectorizer = CountVectorizer()
review_train_mat_x = review_vectorizer.fit_transform(review_train_feat_x)

# Encode the test reviews with the vocabulary learned from the training data
# (transform only -- the vectorizer is never fit on the test set).
# row dim = # of examples in that split; col dim = size of vocabulary (unique word per column)
review_test_mat_x = review_vectorizer.transform(review_test_feat_x)