In [169]:
from __future__ import division, unicode_literals 
import pandas as pd
from urllib import parse, request
import json
from json import loads
import psycopg2 
from sqlalchemy import create_engine, text
import pysolr
import math
from textblob import TextBlob as tb

In [344]:
# AsterixDBConnection
class QueryResponse:
    def __init__(self, raw_response):
        self._json = loads(raw_response)

        self.requestID = self._json['requestID'] if 'requestID' in self._json else None
        self.clientContextID = self._json['clientContextID'] if 'clientContextID' in self._json else None
        self.signature = self._json['signature'] if 'signature' in self._json else None
        self.results = self._json['results'] if 'results' in self. _json else None
        self.metrics = self._json['metrics'] if 'metrics' in self._json else None

class AsterixConnection:
    def __init__(self, server = 'http://45.79.91.219', port = 19002):
        self._server = server
        self._port = port
        self._url_base = self._server +':'+ str(port)

    def query(self, statement, pretty=False, client_context_id=None):
        endpoint = '/query/service'

        url = self._url_base + endpoint

        payload = {
            'statement': statement,
            'pretty': pretty
        }

        if client_context_id:
            payload['client_context_id'] = client_context_id

        data = parse.urlencode(payload).encode("utf-8")
        req = request.Request(url, data)
        response = request.urlopen(req).read()

        return QueryResponse(response)

In [345]:
# Running query to bring the nodeids for all categories # this will be parameter based for API
if __name__ == '__main__':
    asterix_conn = AsterixConnection()
    response = asterix_conn.query('''
        use bookstore_dp;
        SELECT * from ClassificationInfo;''')    

In [346]:
#constructing Dataframe from Asterix data
df=pd.DataFrame(response.results)
df.head()

Unnamed: 0,ClassificationInfo
0,"{'classification': 'Arts & Photography', 'node..."
1,"{'classification': 'Architecture', 'nodeID': 1..."
2,"{'classification': 'Architectural Buildings', ..."
3,"{'classification': 'General', 'nodeID': 720870..."
4,"{'classification': 'Landmarks & Monuments', 'n..."


In [356]:
#collecting nodeids for given category
node_id=[]
for i in range(df.shape[0]):
    a=df.ClassificationInfo[i]['nodeID']
    node_id.append(a)    

In [357]:
node_id=[str(x) for x in node_id]

In [358]:
node_id=set(node_id) # Set removes duplicates

In [360]:
#connecting to postgres
conn_string = "host='45.79.91.219' dbname='MyBookStore' user='postgres' password=''"
print ("Connecting to database\n	->%s"% (conn_string))
conn = psycopg2.connect(conn_string)
cursor = conn.cursor()
print ("Connected!\n")
 


Connecting to database
	->host='45.79.91.219' dbname='MyBookStore' user='postgres' password=''
Connected!



In [362]:
# execute our Query finding all the sales of products; This will be parameterized for API with month as input data
sql="SELECT DISTINCT o.productid, o.billdate, o.numunits, p.asin,p.nodeid\
    FROM orderlines o, products p\
    WHERE o.productid=p.productid;"
cursor.execute(sql)

# retrieve the records from the database
records = cursor.fetchall()

In [363]:
#Find Column Names for dataframe headers
colnames = [desc[0] for desc in cursor.description]

In [364]:
#constructing dataframe
df1=pd.DataFrame(records, columns=colnames)

In [365]:
df1.tail()

Unnamed: 0,productid,billdate,numunits,asin,nodeid
96327,14039,2016-09-14,0,0007426224,4468
96328,14039,2016-09-14,1,0007426224,4468
96329,14039,2016-09-15,0,0007426224,4468
96330,14039,2016-09-16,0,0007426224,4468
96331,14040,2016-09-17,0,000742695X,8417576011


In [366]:
#construct dataframe only for existing categories i.e. only for nodes in node_id SET constructed above
df2=df1[df1['nodeid'].isin(node_id)]

In [367]:
df2.shape

(96332, 5)

In [368]:
#sales grouped by billdate and units sold per day are summed together
df3=df2.groupby(['billdate','productid','asin','nodeid'],as_index=False)['numunits'].sum()

In [369]:
print(df3.shape)
df3.tail()

(87334, 5)


Unnamed: 0,billdate,productid,asin,nodeid,numunits
87329,2016-09-21,13951,7376103,173578,0
87330,2016-09-21,13973,7388160,5041,1
87331,2016-09-21,13993,7410956,4870,1
87332,2016-09-21,13999,7412622,6343223011,21
87333,2016-09-21,14028,7423632,271590011,1


In [385]:
#Wrapper from Query Team to connect to Solr
def solrWrap(core,params):
    
    query_string='http://45.79.91.219:8983/solr/'+core+'/select?' # connecting to our linode server
    for key in params:
        query_string=query_string+key+'='+params[key]+'&'
        print (query_string)
    solrcon = pysolr.Solr(query_string, timeout=10)
    results = solrcon.search('*:*')
    docs=pd.DataFrame(results.docs)
    return docs

In [386]:
#bringing all the data rows for our sentiment polarity analysis; number of reviews will passed as parameter for API
d3 = {'q': '*:*','rows':'77165' } 
d_res3=solrWrap('bookstore',d3)
d_res3.tail()

http://45.79.91.219:8983/solr/bookstore/select?q=*:*&
http://45.79.91.219:8983/solr/bookstore/select?q=*:*&rows=77165&


Unnamed: 0,_version_,asin,asin_str,id,reviewText,reviewText_str,reviewerID,reviewerID_str
77160,1582297513738633219,[0007426224],[0007426224],3608d8ad-e0dc-42d1-9070-0b5937279a22,[John stack s books are an excellent and absor...,[John stack s books are an excellent and absor...,[A3681XFZ9T6BRO],[A3681XFZ9T6BRO]
77161,1582297513738633220,[0007426224],[0007426224],4fb1a8ff-860b-48bd-aeeb-5b0364bfa987,"[Great series, set in a very interesting time ...","[Great series, set in a very interesting time ...",[A3DAORPYFXTZMS],[A3DAORPYFXTZMS]
77162,1582297513738633221,[0007426224],[0007426224],a9f3122b-6bdc-4231-bf00-cc35afb844e8,[I really like John Stack and this series but ...,[I really like John Stack and this series but ...,[A1VKROSQK55S18],[A1VKROSQK55S18]
77163,1582297513738633222,[0007426224],[0007426224],4a866465-047a-4d3b-8955-edf3e101c0b2,[This is a good series. You can read each book...,[This is a good series. You can read each book...,[A2S36A7V2RWJO6],[A2S36A7V2RWJO6]
77164,1582297513738633223,[000742695X],[000742695X],abf98785-4af2-41d8-94b4-7ddb57867d3c,[This is an incredibly useful book and one tha...,[This is an incredibly useful book and one tha...,[A3N4EDXJTN8IP8],[A3N4EDXJTN8IP8]


In [387]:
# calculating sentiment polarity, the values ranges from -1 to 1
polarity_measure=[]
for i in range(d_res3.shape[0]):
    str1 = str(d_res3.reviewText[i])
    blob=tb(str1)
    polarity_measure.append(blob.sentiment.polarity)

In [391]:
d_res3.head()

Unnamed: 0,_version_,asin,asin_str,id,reviewText,reviewText_str,reviewerID,reviewerID_str,Sentiment_polarity
0,1582297498213416960,[000047715X],[000047715X],0eb6691f-ab54-44c6-9b93-ac5db453640f,[Very thorough review of MKSAP and a great com...,[Very thorough review of MKSAP and a great com...,[A2CAVTNQA2Y3IJ],[A2CAVTNQA2Y3IJ],0.177273
1,1582297498339246080,[0000000116],[0000000116],82634ca3-81a4-468d-bf25-1e055c8a974d,[Interesting Grisham tale of a lawyer that tak...,[Interesting Grisham tale of a lawyer that tak...,[AH2L9G3DQHHAJ],[AH2L9G3DQHHAJ],0.25
2,1582297498346586112,[0000000116],[0000000116],7851bb38-4811-45ce-a4e0-199e7f1f3c95,[The thumbnail is a shirt. The product shown ...,[The thumbnail is a shirt. The product shown ...,[A2IIIDRK3PRRZY],[A2IIIDRK3PRRZY],-0.4
3,1582297498348683264,[0000000868],[0000000868],77c7f516-10b9-4bc3-a7c3-abecf6be3710,[I'll be honest. I work for a large online ret...,[I'll be honest. I work for a large online ret...,[A1TADCM7YWPQ8M],[A1TADCM7YWPQ8M],0.07076
4,1582297498358120448,[0000013714],[0000013714],60aa1298-4ddb-4439-aae0-18f59c02e94d,[It had all the songs I wanted but I had order...,[It had all the songs I wanted but I had order...,[AWGH7V0BDOJKB],[AWGH7V0BDOJKB],0.071429


In [388]:
se = pd.Series(polarity_measure)
d_res3['Sentiment_polarity'] = se.values

In [404]:
#converting list values to string for groupby operation and formatting string for uniformity to match
d_res3['asin']=d_res3['asin_str'].apply(lambda x:''+str(x)[2:-2]+'') 

In [405]:
#average sentiment polarity per product
df_sentiment=d_res3.groupby(['asin'],as_index=False)['Sentiment_polarity'].mean()

In [408]:
df_sentiment.head()

Unnamed: 0,asin,Sentiment_polarity
0,116,-0.075
1,868,0.07076
2,13714,0.409443
3,15393,0.185
4,29831,0.048483


In [409]:
df_sentiment.shape

(4040, 2)

In [420]:
#Matching sales data with reviews data
result=pd.merge(df3, df_sentiment, on='asin', how='inner')

In [421]:
result.shape

(87334, 6)

In [422]:
result

Unnamed: 0,billdate,productid,asin,nodeid,numunits,Sentiment_polarity
0,2009-10-23,11070,0005064309,3185,1,0.390000
1,2009-10-28,11070,0005064309,3185,1,0.390000
2,2009-11-05,11070,0005064309,3185,1,0.390000
3,2009-11-06,11070,0005064309,3185,6,0.390000
4,2009-11-07,11070,0005064309,3185,3,0.390000
5,2009-11-10,11070,0005064309,3185,1,0.390000
6,2009-11-11,11070,0005064309,3185,10,0.390000
7,2009-11-12,11070,0005064309,3185,2,0.390000
8,2009-11-17,11070,0005064309,3185,6,0.390000
9,2009-11-18,11070,0005064309,3185,5,0.390000
