## Read in the Data, Select Subset

http://jmcauley.ucsd.edu/data/amazon/links.html

In [3]:
import pandas as pd
import gzip
import random
import numpy as np

import re


In [2]:
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

def dataSubsets(PDFrames, lower = 1.0, upper = 5.0, num_reviews = 50000, num_test = 10000):
    newDF_upper = PDFrames.loc[PDFrames['overall']>=upper]
    newDF_lower = PDFrames.loc[PDFrames['overall']<=lower]
    
    check1 = newDF_upper['overall'].count()
    #print(check1)
    check2 = newDF_lower['overall'].count()
    #print(check2)

    total_data = num_reviews + num_test
    #print(total_data)
    
    #Check that there is enough data in both postive and negative reviews
    if (check1>=total_data) & (check2>=total_data):
        newDF_upper_sub = newDF_upper.sample(n=total_data, replace=False, random_state=1043)
        newDF_upper_sub['sentiment'] = 1
        newDF_lower_sub = newDF_lower.sample(n=total_data, replace=False, random_state=1043)
        newDF_lower_sub['sentiment'] = 0
        newDF_train = pd.concat([newDF_upper_sub[:num_reviews],newDF_lower_sub[:num_reviews]]).sample(frac=1)
        newDF_test = pd.concat([newDF_upper_sub[-num_test:],newDF_lower_sub[-num_test:]]).sample(frac=1)
    
    #Elif check that there is enough data in positive reviews
    elif(check1>=total_data):
        newDF_lower = PDFrames.loc[PDFrames['overall']<=(lower+1.0)]
        check2 = newDF_lower['overall'].count()
        
        #Check that there is enough data if we make reviews 
        if(check2>=total_data):
            newDF_upper_sub = newDF_upper.sample(n=total_data, replace=False, random_state=1043)
            newDF_upper_sub['sentiment'] = 1
            newDF_lower_sub = newDF_lower.sample(n=total_data, replace=False, random_state=1043)
            newDF_lower_sub['sentiment'] = 0
            newDF_train = pd.concat([newDF_upper_sub[:num_reviews],newDF_lower_sub[:num_reviews]]).sample(frac=1)
            newDF_test = pd.concat([newDF_upper_sub[-num_test:],newDF_lower_sub[-num_test:]]).sample(frac=1)
        else:
            print(f'Not enough data...\nData for Upper = {check1}\nData for Lower = {check2}')
    
    #Elif check that there is enough data in negative reviews
    elif(check2>=total_data):
        newDF_upper = PDFrames.loc[PDFrames['overall']>=(upper-1.0)]
        check1 = newDF_upper['overall'].count()
        
        #See if adding 4.0 gives enough data
        if(check1>=total_data):
            newDF_upper_sub = newDF_upper.sample(n=total_data, replace=False, random_state=1043)
            newDF_upper_sub['sentiment'] = 1
            newDF_lower_sub = newDF_lower.sample(n=total_data, replace=False, random_state=1043)
            newDF_lower_sub['sentiment'] = 0
            newDF_train = pd.concat([newDF_upper_sub[:num_reviews],newDF_lower_sub[:num_reviews]]).sample(frac=1)
            newDF_test = pd.concat([newDF_upper_sub[-num_test:],newDF_lower_sub[-num_test:]]).sample(frac=1)
        else:
            print(f'Not enough data...\nData for Upper = {check1}\nData for Lower = {check2}')
    #Otherwise lets append data (4+, and 2 or 1) to see if there is enough data for our analysis
    else:
        newDF_upper = PDFrames.loc[PDFrames['overall']>=(upper-1.0)]
        check1 = newDF_upper['overall'].count()
        
        newDF_lower = PDFrames.loc[PDFrames['overall']<=(lower+1.0)]
        check2 = newDF_lower['overall'].count()
        
        if (check1>=total_data) & (check2>=total_data):
            newDF_upper_sub = newDF_upper.sample(n=total_data, replace=False, random_state=1043)
            newDF_upper_sub['sentiment'] = 1
            newDF_lower_sub = newDF_lower.sample(n=total_data, replace=False, random_state=1043)
            newDF_lower_sub['sentiment'] = 0
            newDF_train = pd.concat([newDF_upper_sub[:num_reviews],newDF_lower_sub[:num_reviews]]).sample(frac=1)
            newDF_test = pd.concat([newDF_upper_sub[-num_test:],newDF_lower_sub[-num_test:]]).sample(frac=1)
        
        else:
            print(f'Not enough data...\nData for Upper = {check1}\nData for Lower = {check2}')
    
    return newDF_train, newDF_test

def SaveData (trainDirectory, testDirectory, trainDF, testDF):
    export_csv1 = trainDF.to_csv(trainDirectory, index = None, header=True)
    export_csv2 = testDF.to_csv(testDirectory, index = None, header=True)

In [1]:
files = [#'reviews_Video_Games.json.gz', 
#          'reviews_Toys_and_Games.json.gz', 
#          'reviews_Sports_and_Outdoors.json.gz', 
#          'reviews_Movies_and_TV.json.gz',
#          'reviews_Kindle_Store.json.gz', 
#          'reviews_Home_and_Kitchen.json.gz',
#          'reviews_Health_and_Personal_Care.json.gz', 
#          'reviews_Electronics.json.gz',
#          'reviews_Clothing_Shoes_and_Jewelry.json.gz',
#          'reviews_Cell_Phones_and_Accessories.json.gz', 
#          'reviews_CDs_and_Vinyl.json.gz',
#          'reviews_Books.json.gz', 
#         'reviews_Amazon_Instant_Video_5.json.gz']
filesNames = [#'reviews_Video_Games', 
#               'reviews_Toys_and_Games',
#               'reviews_Sports_and_Outdoors', 
#               'reviews_Movies_and_TV',
#               'reviews_Kindle_Store', 
#               'reviews_Home_and_Kitchen',
#               'reviews_Health_and_Personal_Care', 
#               'reviews_Electronics',
#               'reviews_Clothing_Shoes_and_Jewelry',
#               'reviews_Cell_Phones_and_Accessories', 
#               'reviews_CDs_and_Vinyl',
#               'reviews_Books', 
#              'reviews_Amazon_Instant_Video_5']

In [4]:
random.seed(1027)
for fileName, writeName in zip(files, filesNames):
    print('_______________________________________________________')
    print('Working with %s'%(fileName))
    
    df = getDF('../Data/%s'%(fileName))
    print('Completed Parsing %s'%(fileName))
    
    train, test = dataSubsets(df)
    print('Completed Subsetting %s'%(fileName))
    
    SaveData(r'../CleanedData/%s_train.csv'%(writeName), 
         r'../CleanedData/%s_test.csv'%(writeName), 
         train, test)
    print('Wrote train and test data for %s\n\n'%(fileName))

_______________________________________________________
Working with reviews_Amazon_Instant_Video_5.json.gz
Completed Parsing reviews_Amazon_Instant_Video_5.json.gz
Not enough data...
Data for Upper = 29336
Data for Lower = 3603


UnboundLocalError: local variable 'newDF_train' referenced before assignment

In [16]:
files = ['reviews_Video_Games.json.gz', 
         'reviews_Toys_and_Games.json.gz', 
         'reviews_Sports_and_Outdoors.json.gz', 
         'reviews_Movies_and_TV.json.gz',
         'reviews_Kindle_Store.json.gz', 
         'reviews_Home_and_Kitchen.json.gz',
         'reviews_Health_and_Personal_Care.json.gz', 
         'reviews_Electronics.json.gz',
         'reviews_Clothing_Shoes_and_Jewelry.json.gz',
         'reviews_Cell_Phones_and_Accessories.json.gz', 
         'reviews_CDs_and_Vinyl.json.gz',
         'reviews_Books.json.gz']
filesNames = ['reviews_Video_Games', 
              'reviews_Toys_and_Games',
              'reviews_Sports_and_Outdoors', 
              'reviews_Movies_and_TV',
              'reviews_Kindle_Store', 
              'reviews_Home_and_Kitchen',
              'reviews_Health_and_Personal_Care', 
              'reviews_Electronics',
              'reviews_Clothing_Shoes_and_Jewelry',
              'reviews_Cell_Phones_and_Accessories', 
              'reviews_CDs_and_Vinyl',
              'reviews_Books']

def dataFullSets(original, concatData, name):
    currentData = original
    appendData = pd.read_csv('../CleanedData/%s'%(concatData))
    appendData['Product'] = name
    newDF = pd.concat([currentData,appendData])
    return(newDF)

train_data = pd.read_csv('../CleanedData/%s_train.csv'%(filesNames[0]))
train_data['Product'] = filesNames[0]
test_data = pd.read_csv('../CleanedData/%s_test.csv'%(filesNames[0]))
test_data['Product'] = filesNames[0]
                        
for fileName in filesNames[1:]:
    concatName_train = str(fileName+'_train.csv')
    print(concatName_train)
    concatName_test = fileName+'_test.csv'
                        
    train_data = dataFullSets(train_data, concatName_train, fileName)
    test_data = dataFullSets(test_data, concatName_test, fileName)
    
SaveData(r'../CleanedData/train_data.csv', 
         r'../CleanedData/test_data.csv', 
         train_data, test_data)

reviews_Toys_and_Games_train.csv
reviews_Sports_and_Outdoors_train.csv
reviews_Movies_and_TV_train.csv
reviews_Kindle_Store_train.csv


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




reviews_Home_and_Kitchen_train.csv
reviews_Health_and_Personal_Care_train.csv
reviews_Electronics_train.csv
reviews_Clothing_Shoes_and_Jewelry_train.csv
reviews_Cell_Phones_and_Accessories_train.csv
reviews_CDs_and_Vinyl_train.csv
reviews_Books_train.csv


In [17]:
train_data

Unnamed: 0,Product,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,sentiment,summary,unixReviewTime
0,reviews_Video_Games,B0054JGGGY,"[0, 0]",5.0,it is working perfect Best mouse for hand perf...,"02 23, 2014",AAIYP75T4H0L5,Ahmet Yildirim,1,perfect,1393113600
1,reviews_Video_Games,B00008XL0A,"[0, 0]",1.0,If your going to buy a hits game save yourself...,"01 3, 2014",A2141HXMIBTPG4,Drew Danuser,0,Terrible,1388707200
2,reviews_Video_Games,B007ZRZF8I,"[4, 5]",1.0,Pay 2 play? Please! Bad idea.Go look up OpenF...,"06 7, 2012",A3B43K6X4NFXG0,checkedout,0,Never trust OpenFeint. Ever !,1339027200
3,reviews_Video_Games,B00009IM29,"[0, 0]",5.0,I have been buying Madden for the past--oh a l...,"11 7, 2003",AZQ2UJF0H3TWF,Paul,1,Nice Touches,1068163200
4,reviews_Video_Games,B0009VXAM0,"[14, 104]",5.0,This is the best system ever!!! Buy it asap. ...,"11 17, 2006",A39XNRYKE61XZY,"Nate ""Nate""",1,ROCKIN' FUN,1163721600
5,reviews_Video_Games,B00CX9L30W,"[1, 12]",1.0,SE is doing really bad job at maintaining serv...,"09 1, 2013",A3L1I85NHMBHH0,JxK718,0,Might be great game but very bad at maintaining,1377993600
6,reviews_Video_Games,B000B43OY4,"[6, 11]",1.0,I bought the Xbox 360 premium system 3 weekend...,"10 18, 2006",A1BCY2DJGNBUQM,Freakin Sledge,0,Totally not what I expected,1161129600
7,reviews_Video_Games,B004FYEZMQ,"[8, 28]",5.0,I understand there's a controversy on the endi...,"06 6, 2012",A39WJNB2K0L73B,Amazon Customer,1,Massively Effective,1338940800
8,reviews_Video_Games,B007FTE2VW,"[2, 2]",1.0,"So far, all that is able to be done with this ...","03 6, 2013",A3GHELK2AA6CLE,,0,Do not purchase this game!,1362528000
9,reviews_Video_Games,B003ZHQH02,"[16, 18]",5.0,As soon as i saw this game i had to get it. Af...,"11 20, 2010",A2UT0AFICK4XIS,A. Bettencourt,1,Mario Vs DK has still got class!,1290211200


In [74]:
test

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,sentiment
2236870,A16O9V0N14TQXL,B00HR8JMZM,Jasper,"[0, 0]",My son loved it!,5.0,Five Stars,1405123200,"07 12, 2014",1
1192676,A38B2QF2M0BX8G,B003JSSAHU,sam,"[0, 0]","These birds were very pretty, but it was hard ...",1.0,little warble in pretty birds,1310860800,"07 17, 2011",0
2229673,A2P6ZQRMHTM5TR,B00GZ1GUSY,Aaron Edwards,"[0, 0]","I owned the PS3 version for a little while, tr...",5.0,Great Game,1395705600,"03 25, 2014",1
1039177,A2CIJ9CZ5YUB0Q,B002V1H0WW,Kimberly,"[2, 2]",This float is durable and very roomy for two p...,5.0,Durable and roomy,1305504000,"05 16, 2011",1
2185422,A1113QZDZX7ADU,B00F3TKLRA,Hanna Mh,"[10, 11]",I just received these bands and they smell hor...,1.0,Horrible Discusting Smell!,1391385600,"02 3, 2014",0
145153,ABJY7LAG572T8,B00007L12O,A. Coleman,"[7, 33]","I ordered this tool set for my grandson, age 2...",1.0,Kids Garden Tool Set,1210723200,"05 14, 2008",0
2228585,AB9KWWWADOKG1,B00GWTGFNO,Julio S.,"[0, 0]",@Broadway Lumber Supplies we value every singl...,5.0,Beautiful material worth every penney,1396569600,"04 4, 2014",1
2146802,A38CJRYE1A9WNR,B00DQ99NT6,Blake Herman,"[0, 2]",Ours lasted about 10 minutes and then died. W...,1.0,Works great when it works,1388188800,"12 28, 2013",0
484810,A34H8TMO9Z0DNL,B000JNE658,Victoria C. El-Zarif,"[0, 1]",The purchase was smooth and the item posted pr...,5.0,Fun!!,1309478400,"07 1, 2011",1
1838367,A3PDW0BHDVII1S,B008C2EY6O,veronica,"[0, 0]",I got this to use as a little cake topper for ...,1.0,Don t waste your money,1404086400,"06 30, 2014",0


## Loading Zhang et al. Yelp Polarity Data

In [9]:
yelp_train = pd.read_csv('../../Yelp-Zhang-train.csv', header = None)
yelp_train.columns = ['sentiment','reviewText']
yelp_train['sentiment'] = np.where(yelp_train['sentiment']==2,1,0)



yelp_test = pd.read_csv('../../Yelp-Zhang-test.csv', header = None)
yelp_test.columns = ['sentiment','reviewText']
yelp_test['sentiment'] = np.where(yelp_test['sentiment']==2,1,0)

In [10]:
yelp_train.head(3)

Unnamed: 0,sentiment,reviewText
0,0,"Unfortunately, the frustration of being Dr. Go..."
1,1,Been going to Dr. Goldberg for over 10 years. ...
2,0,I don't know what Dr. Goldberg was like before...


In [11]:
yelp_test.head(3)

Unnamed: 0,sentiment,reviewText
0,1,"Contrary to other reviews, I have zero complai..."
1,0,Last summer I had an appointment to get new ti...
2,1,"Friendly staff, same starbucks fair you get an..."


In [12]:
SaveData(r'../../yelp_zhang_pol_train.csv', 
         r'../../yelp_zhang_pol_test.csv', 
         yelp_train, yelp_test)

In [8]:
Amazon_Zhang_Polarity_test = pd.read_csv('../../Amazon-Zhang-Polarity-test.csv', header = None)

In [9]:
Amazon_Zhang_Polarity_test.head()

Unnamed: 0,0,1,2
0,2,Great CD,My lovely Pat has one of the GREAT voices of h...
1,2,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...
2,1,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...
3,2,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...
4,2,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...


In [10]:
Amazon_Zhang_Polarity_test.columns = ['sentiment','reviewTitle','reviewText']
Amazon_Zhang_Polarity_test['sentiment'] = np.where(Amazon_Zhang_Polarity_test['sentiment']==2,1,0)

In [11]:
Amazon_Zhang_Polarity_test.to_csv(r'../../Amazon_Zhang_Polarity_test.csv', index = None, header=True)

In [12]:
max_length = 200

def loadData(loadDat = True, #test_data = test_data, train_data = train_data, 
             #yelp_zhang_train = yelp_zhang_train, yelp_zhang_test = yelp_zhang_test, 
             amazon_zhang_test = Amazon_Zhang_Polarity_test):
    
    if loadDat == True:
#         train_data_name = '../../train_data_cleaned.pkl'
#         train_data = pd.read_pickle(train_data_name)
        
#         test_data_name = '../../train_data_cleaned.pkl'
#         test_data = pd.read_pickle(test_data_name)
        
#         play_data_name = '../../play_data_cleaned.pkl'
#         play_data = pd.read_pickle(play_data_name)
        
#         twitter_reviews_name = '../../twitter_reviews_cleaned.pkl'
#         twitter_reviews = pd.read_pickle(twitter_reviews_name)
        
#         imdb_test_name = '../../imdb_test_cleaned.pkl'
#         imdb_test = pd.read_pickle(imdb_test_name)
#         imdb_train_name = '../../imdb_train_cleaned.pkl'
#         imdb_train = pd.read_pickle(imdb_train_name)
        
#         yelp_reviews_name = '../../yelp_reviews_cleaned.pkl'
#         yelp_reviews = pd.read_pickle(yelp_reviews_name)
        
#         yelp_zhang_train_name = '../../yelp_zhang_pol_train_cleaned.pkl'
#         yelp_zhang_train = pd.read_pickle(yelp_zhang_train_name)
#         yelp_zhang_test_name = '../../yelp_zhang_pol_test_cleaned.pkl'
#         yelp_zhang_test = pd.read_pickle(yelp_zhang_test_name)
        
#         sst_test_name = '../../sst_test.pkl'
#         sst_test = pd.read_pickle(sst_test_name)
        
        amazon_zhang_test_name = '../../amazon_zhang_pol_test_cleaned.pkl'
        amazon_zhang_test = pd.read_pickle(amazon_zhang_test_name)
        
#         print('Successfully opened pickled data')
        
#         return train_data, test_data, play_data, twitter_reviews, imdb_test, imdb_train, yelp_reviews, yelp_zhang_train, yelp_zhang_test, sst_test, amazon_zhang_test
        return amazon_zhang_test
    else:

        #play_data = test_data[:10]

        def decontracted(phrase):
            phrase = re.sub(r"won\'t", "will not", phrase)
            phrase = re.sub(r"can\'t", "can not", phrase)
            phrase = re.sub(r"n\'t", " not", phrase)
            phrase = re.sub(r"\'re", " are", phrase)
            phrase = re.sub(r"\'s", " is", phrase)
            phrase = re.sub(r"\'d", " would", phrase)
            phrase = re.sub(r"\'ll", " will", phrase)
            phrase = re.sub(r"\'t", " not", phrase)
            phrase = re.sub(r"\'ve", " have", phrase)
            phrase = re.sub(r"\'m", " am", phrase)
            return phrase

        def prepReview(data):
            data_prep = str(data['reviewText'])
            data_prep = re.sub("[!?]", ".", data_prep)
            data_prep = re.sub("[^a-zA-Z']", " ", data_prep).lower()
            data_prep = re.sub("[\\s]+", " ", data_prep)
            data_prep = decontracted(data_prep)
            return data_prep

        #function to process review text and split into words
        def splitWords(data):
            return prepReview(data).split()[:max_length]

        #function to process review text and split into sentences
        def splitSentences(data):
            interim = prepReview(data).split()
            reviewTrunc = interim[:max_length]
            return ' '.join(reviewTrunc)


        #list of words from review into column
#         sentences = play_data.apply(splitWords, axis=1)
#         play_data.insert(loc = 11, column = 'sentenceWords', value = sentences)

#         sentences = train_data.apply(splitWords, axis=1)
#         train_data.insert(loc = 11,column = 'sentenceWords', value = sentences)

#         sentences = test_data.apply(splitWords, axis=1)
#         test_data.insert(loc = 11,column = 'sentenceWords', value = sentences)

#         #IMDB additions
#         sentences = imdb_test.apply(splitWords, axis=1)
#         imdb_test.insert(loc = 6,column = 'sentenceWords', value = sentences)
        
#         sentences = imdb_train.apply(splitWords, axis=1)
#         imdb_train.insert(loc = 6,column = 'sentenceWords', value = sentences)
        
#         sentences = twitter_reviews.apply(splitWords, axis=1)
#         twitter_reviews.insert(loc = 2,column = 'sentenceWords', value = sentences)
        
#         sentences = yelp_reviews.apply(splitWords, axis=1)
#         yelp_reviews.insert(loc = 2,column = 'sentenceWords', value = sentences)
        
#         sentences = yelp_zhang_train.apply(splitWords, axis=1)
#         yelp_zhang_train.insert(loc = 2,column = 'sentenceWords', value = sentences)
        
#         sentences = yelp_zhang_test.apply(splitWords, axis=1)
#         yelp_zhang_test.insert(loc = 2,column = 'sentenceWords', value = sentences)
        
#         sentences = sst_test.apply(splitWords, axis=1)
#         sst_test.insert(loc = 2,column = 'sentenceWords', value = sentences)
        
        sentences = amazon_zhang_test.apply(splitWords, axis=1)
        amazon_zhang_test.insert(loc = 2,column = 'sentenceWords', value = sentences)
        
        print('Finished working through the words for each sentences\nOn to the sentences..')

        #list of sentences from review into column
#         sentences_split = play_data.apply(splitSentences, axis=1)
#         play_data.insert(loc = 12, column = 'sentences', value = sentences_split)

#         sentences_split = train_data.apply(splitSentences, axis=1)
#         train_data.insert(loc = 12,column = 'sentences', value = sentences_split)

#         sentences_split = test_data.apply(splitSentences, axis=1)
#         test_data.insert(loc = 12,column = 'sentences', value = sentences_split)
        
#         #IMDB additions
#         sentences_split = imdb_test.apply(splitSentences, axis=1)
#         imdb_test.insert(loc = 7,column = 'sentences', value = sentences_split)

#         sentences_split = imdb_train.apply(splitSentences, axis=1)
#         imdb_train.insert(loc = 7,column = 'sentences', value = sentences_split)
        
#         sentences_split = twitter_reviews.apply(splitSentences, axis=1)
#         twitter_reviews.insert(loc = 3,column = 'sentences', value = sentences_split)
        
#         sentences_split = yelp_reviews.apply(splitSentences, axis=1)
#         yelp_reviews.insert(loc = 3,column = 'sentences', value = sentences_split)
        
#         sentences_split = yelp_zhang_train.apply(splitSentences, axis=1)
#         yelp_zhang_train.insert(loc = 2,column = 'sentences', value = sentences_split)
        
#         sentences_split = yelp_zhang_test.apply(splitSentences, axis=1)
#         yelp_zhang_test.insert(loc = 2,column = 'sentences', value = sentences_split)
        
#         sentences_split = sst_test.apply(splitSentences, axis=1)
#         sst_test.insert(loc = 3,column = 'sentences', value = sentences_split)
        
        sentences_split = amazon_zhang_test.apply(splitSentences, axis=1)
        amazon_zhang_test.insert(loc = 2,column = 'sentences', value = sentences_split)
        
        print('Finished sentences...\nNow moving on to pickling the data')

#         train_data_name = '../../train_data_cleaned.pkl'
#         train_data.to_pickle(train_data_name)
        
#         test_data_name = '../../test_data_cleaned.pkl'
#         test_data.to_pickle(test_data_name)
        
#         play_data_name = '../../play_data_cleaned.pkl'
#         play_data.to_pickle(play_data_name)
        
#         twitter_reviews_name = '../../twitter_reviews_cleaned.pkl'
#         twitter_reviews.to_pickle(twitter_reviews_name)
        
#         imdb_test_name = '../../imdb_test_cleaned.pkl'
#         imdb_test.to_pickle(imdb_test_name)
#         imdb_train_name = '../../imdb_train_cleaned.pkl'
#         imdb_train.to_pickle(imdb_train_name)
        
#         yelp_reviews_name = '../../yelp_reviews_cleaned.pkl'
#         yelp_reviews.to_pickle(yelp_reviews_name)

#         yelp_zhang_train_name = '../../yelp_zhang_pol_train_cleaned.pkl'
#         yelp_zhang_train.to_pickle(yelp_zhang_train_name)
#         yelp_zhang_test_name = '../../yelp_zhang_pol_test_cleaned.pkl'
#         yelp_zhang_test.to_pickle(yelp_zhang_test_name)
        
#         sst_test_name = '../../sst_test.pkl'
#         sst_test.to_pickle(sst_test_name)
        
        amazon_zhang_test_name = '../../amazon_zhang_pol_test_cleaned.pkl'
        amazon_zhang_test.to_pickle(amazon_zhang_test_name)
        
        print('Finished pickling for future use.')
        
        return amazon_zhang_test
        
amazon_zhang_test = loadData(loadDat = True)
        
#         return train_data, test_data, play_data, twitter_reviews, imdb_test, imdb_train, yelp_reviews, yelp_zhang_train, yelp_zhang_test, sst_test
        
# train_data, test_data, play_data, twitter_reviews, imdb_test, imdb_train, yelp_reviews, yelp_zhang_train, yelp_zhang_test, sst_test = loadData(loadDat = True)

In [13]:
amazon_zhang_test

Unnamed: 0,sentiment,reviewTitle,sentences,sentenceWords,reviewText
0,1,Great CD,my lovely pat has one of the great voices of h...,"[my, lovely, pat, has, one, of, the, great, vo...",My lovely Pat has one of the GREAT voices of h...
1,1,One of the best game music soundtracks - for a...,despite the fact that i have only played a sma...,"[despite, the, fact, that, i, have, only, play...",Despite the fact that I have only played a sma...
2,0,Batteries died within a year ...,i bought this charger in jul and it worked ok ...,"[i, bought, this, charger, in, jul, and, it, w...",I bought this charger in Jul 2003 and it worke...
3,1,"works fine, but Maha Energy is better",check out maha energy is website their powerex...,"[check, out, maha, energy, is, website, their,...",Check out Maha Energy's website. Their Powerex...
4,1,Great for the non-audiophile,reviewed quite a bit of the combo players and ...,"[reviewed, quite, a, bit, of, the, combo, play...",Reviewed quite a bit of the combo players and ...
5,0,DVD Player crapped out after one year,i also began having the incorrect disc problem...,"[i, also, began, having, the, incorrect, disc,...",I also began having the incorrect disc problem...
6,0,Incorrect Disc,i love the style of this but after a couple ye...,"[i, love, the, style, of, this, but, after, a,...","I love the style of this, but after a couple y..."
7,0,DVD menu select problems,i cannot scroll through a dvd menu that is set...,"[i, cannot, scroll, through, a, dvd, menu, tha...",I cannot scroll through a DVD menu that is set...
8,1,Unique Weird Orientalia from the 1930's,exotic tales of the orient from the is dr shen...,"[exotic, tales, of, the, orient, from, the, is...","Exotic tales of the Orient from the 1930's. ""D..."
9,0,"Not an ""ultimate guide""",firstly i enjoyed the format and tone of the b...,"[firstly, i, enjoyed, the, format, and, tone, ...","Firstly,I enjoyed the format and tone of the b..."


In [14]:
max_sents = 10
import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english')) 

def addSents(loadDat = True, #stop_words=stop_words, test_data = test_data, train_data = train_data, 
             #yelp_zhang_train = yelp_zhang_train, yelp_zhang_test = yelp_zhang_test,
             #imdb_test = imdb_test, imdb_train = imdb_train, twitter_reviews = twitter_reviews,
             #yelp_reviews = yelp_reviews, sst_test = sst_test,
             amazon_zhang_test = amazon_zhang_test):
    
    if loadDat == True:
#         train_data_name = '../../train_data_cleaned_sents.pkl'
#         train_data = pd.read_pickle(train_data_name)
        
#         test_data_name = '../../train_data_cleaned_sents.pkl'
#         test_data = pd.read_pickle(test_data_name)
        
#         play_data_name = '../../play_data_cleaned_sents.pkl'
#         play_data = pd.read_pickle(play_data_name)
        
#         twitter_reviews_name = '../../twitter_reviews_cleaned_sents.pkl'
#         twitter_reviews = pd.read_pickle(twitter_reviews_name)
        
#         imdb_test_name = '../../imdb_test_cleaned_sents.pkl'
#         imdb_test = pd.read_pickle(imdb_test_name)
#         imdb_train_name = '../../imdb_train_cleaned_sents.pkl'
#         imdb_train = pd.read_pickle(imdb_train_name)
        
#         yelp_reviews_name = '../../yelp_reviews_cleaned_sents.pkl'
#         yelp_reviews = pd.read_pickle(yelp_reviews_name)
        
#         yelp_zhang_train_name = '../../yelp_zhang_pol_train_cleaned_sents.pkl'
#         yelp_zhang_train = pd.read_pickle(yelp_zhang_train_name)
#         yelp_zhang_test_name = '../../yelp_zhang_pol_test_cleaned_sents.pkl'
#         yelp_zhang_test = pd.read_pickle(yelp_zhang_test_name)
        
#         sst_test_name = '../../sst_test_sents.pkl'
#         sst_test = pd.read_pickle(sst_test_name)
        
        amazon_zhang_test_name = '../../amazon_zhang_test_pol_test_cleaned_sents.pkl'
        amazon_zhang_test = pd.read_pickle(amazon_zhang_test_name)
        
        print('Successfully opened pickled data')
        
        return train_data, test_data, play_data, twitter_reviews, imdb_test, imdb_train, yelp_reviews, yelp_zhang_train, yelp_zhang_test, sst_test
    
    else:

        #play_data = test_data[:10]

        def decontracted(phrase):
            phrase = re.sub(r"won\'t", "will not", phrase)
            phrase = re.sub(r"can\'t", "can not", phrase)
            phrase = re.sub(r"n\'t", " not", phrase)
            phrase = re.sub(r"\'re", " are", phrase)
            phrase = re.sub(r"\'s", " is", phrase)
            phrase = re.sub(r"\'d", " would", phrase)
            phrase = re.sub(r"\'ll", " will", phrase)
            phrase = re.sub(r"\'t", " not", phrase)
            phrase = re.sub(r"\'ve", " have", phrase)
            phrase = re.sub(r"\'m", " am", phrase)
            return phrase

        def prepReview(data):
            data_prep = str(data['reviewText'])
            data_prep = re.sub("[!?]", ".", data_prep)
            data_prep = re.sub("[^a-zA-Z'.]", " ", data_prep).lower()
            data_prep = re.sub("\.+", " . ", data_prep)
            data_prep = re.sub("\s+", " ", data_prep)
            data_prep = decontracted(data_prep)
            return data_prep

        #function to process review text and split into words
        def splitWords(data):
            
            words = []
            for word in prepReview(data).split():
                if word not in stop_words:
                    if (word != '.' )| (word != ' .'):
                        words.append(word)
            return words[:max_length]

        #function to process review text and split into sentences
        def splitSentences(data):
            interim = prepReview(data).split('.')
            return interim


#         #list of words from review into column
#         sentences = play_data.apply(splitWords, axis=1)
#         play_data.insert(loc = 11, column = 'sentenceWords_Stops', value = sentences)

#         sentences = train_data.apply(splitWords, axis=1)
#         train_data.insert(loc = 11,column = 'sentenceWords_Stops', value = sentences)

#         sentences = test_data.apply(splitWords, axis=1)
#         test_data.insert(loc = 11,column = 'sentenceWords_Stops', value = sentences)

#         #IMDB additions
#         sentences = imdb_test.apply(splitWords, axis=1)
#         imdb_test.insert(loc = 6,column = 'sentenceWords_Stops', value = sentences)
        
#         sentences = imdb_train.apply(splitWords, axis=1)
#         imdb_train.insert(loc = 6,column = 'sentenceWords_Stops', value = sentences)
        
#         sentences = twitter_reviews.apply(splitWords, axis=1)
#         twitter_reviews.insert(loc = 2,column = 'sentenceWords_Stops', value = sentences)
        
#         sentences = yelp_reviews.apply(splitWords, axis=1)
#         yelp_reviews.insert(loc = 2,column = 'sentenceWords_Stops', value = sentences)
        
#         sentences = yelp_zhang_train.apply(splitWords, axis=1)
#         yelp_zhang_train.insert(loc = 2,column = 'sentenceWords_Stops', value = sentences)
        
#         sentences = yelp_zhang_test.apply(splitWords, axis=1)
#         yelp_zhang_test.insert(loc = 2,column = 'sentenceWords_Stops', value = sentences)
        
        
        sentences = amazon_zhang_test.apply(splitWords, axis=1)
        amazon_zhang_test.insert(loc = 2,column = 'sentenceWords_Stops', value = sentences)
        
        print('Finished working through the words for each sentences\nOn to the sentences..')

#         #list of sentences from review into column
#         sentences_split = play_data.apply(splitSentences, axis=1)
#         play_data.insert(loc = 12, column = 'sentences_indiv', value = sentences_split)

#         sentences_split = train_data.apply(splitSentences, axis=1)
#         train_data.insert(loc = 12,column = 'sentences_indiv', value = sentences_split)

#         sentences_split = test_data.apply(splitSentences, axis=1)
#         test_data.insert(loc = 12,column = 'sentences_indiv', value = sentences_split)
        
#         #IMDB additions
#         sentences_split = imdb_test.apply(splitSentences, axis=1)
#         imdb_test.insert(loc = 7,column = 'sentences_indiv', value = sentences_split)

#         sentences_split = imdb_train.apply(splitSentences, axis=1)
#         imdb_train.insert(loc = 7,column = 'sentences_indiv', value = sentences_split)
        
#         sentences_split = twitter_reviews.apply(splitSentences, axis=1)
#         twitter_reviews.insert(loc = 3,column = 'sentences_indiv', value = sentences_split)
        
#         sentences_split = yelp_reviews.apply(splitSentences, axis=1)
#         yelp_reviews.insert(loc = 3,column = 'sentences_indiv', value = sentences_split)
        
#         sentences_split = yelp_zhang_train.apply(splitSentences, axis=1)
#         yelp_zhang_train.insert(loc = 2,column = 'sentences_indiv', value = sentences_split)
        
        sentences_split = amazon_zhang_test.apply(splitSentences, axis=1)
        amazon_zhang_test.insert(loc = 2,column = 'sentences_indiv', value = sentences_split)
        
        print('Finished sentences...\nNow moving on to pickling the data')

#         train_data_name = '../../train_data_cleaned_sents.pkl'
#         train_data.to_pickle(train_data_name)
        
#         test_data_name = '../../test_data_cleaned_sents.pkl'
#         test_data.to_pickle(test_data_name)
        
#         play_data_name = '../../play_data_cleaned_sents.pkl'
#         play_data.to_pickle(play_data_name)
        
#         twitter_reviews_name = '../../twitter_reviews_cleaned_sents.pkl'
#         twitter_reviews.to_pickle(twitter_reviews_name)
        
#         imdb_test_name = '../../imdb_test_cleaned_sents.pkl'
#         imdb_test.to_pickle(imdb_test_name)
#         imdb_train_name = '../../imdb_train_cleaned_sents.pkl'
#         imdb_train.to_pickle(imdb_train_name)
        
#         yelp_reviews_name = '../../yelp_reviews_cleaned_sents.pkl'
#         yelp_reviews.to_pickle(yelp_reviews_name)

#         yelp_zhang_train_name = '../../yelp_zhang_pol_train_cleaned_sents.pkl'
#         yelp_zhang_train.to_pickle(yelp_zhang_train_name)
#         yelp_zhang_test_name = '../../yelp_zhang_pol_test_cleaned_sents.pkl'
#         yelp_zhang_test.to_pickle(yelp_zhang_test_name)
        
#         sst_test_name = '../../sst_test_sents.pkl'
#         sst_test.to_pickle(sst_test_name)
        
        amazon_zhang_test_name = '../../amazon_zhang_test_pol_test_cleaned_sents.pkl'
        amazon_zhang_test.to_pickle(amazon_zhang_test_name)
        
        print('Finished pickling for future use.')
        
#         return train_data, test_data, play_data, twitter_reviews, imdb_test, imdb_train, yelp_reviews, yelp_zhang_train, yelp_zhang_test, sst_test
        
# train_data, test_data, play_data, twitter_reviews, imdb_test, imdb_train, yelp_reviews, yelp_zhang_train, yelp_zhang_test, sst_test = addSents(loadDat = True)

        return amazon_zhang_test
        
amazon_zhang_test = addSents(loadDat = True)


NameError: name 'yelp_zhang_test_name' is not defined

In [2]:
amazon_zhang_test

NameError: name 'amazon_zhang_test' is not defined