In [266]:
#Purpose: Class Project that covers the material learned in the "Python & Advanced Modeling" class, 
#Following CRISP-DM methodology for data science projects:  1) Business Understanding, 2) Data Understanding/Preparation,
#3) Model Building & Evaluation,  4) Deployment 
import requests  #GET/POST/PUT API requests
from contextlib import closing  #utilities for common tasks involving the "with" statement.
from bs4 import BeautifulSoup #BeautifulSoup4 - HTML Web Scraping #Scrapy
import csv #write stock information to csv file
import json #create json payload and/or parse json response set from API
import numpy as np
import pandas as pd
from random import * #random number generation

In [267]:
#CRISP-DM Step 2a) Obtain Data - Use your favorite Pandas library to "wrangle" your data for analysis!
#API Get Request
def API_get(view_url):
    #set url for json request, then obtain json response/payload
    #ex: view_url = 'https://jsonplaceholder.typicode.com/todos'
    myResponse = requests.get(view_url)
    #print (myResponse.status_code)
    # For successful API call, response code will be 200 (OK)
    if(myResponse.ok):
        # Loads (Load String) takes a Json file and converts into python data structure 
        # (dict or list, depending on JSON structure and number of records returned
        jData = myResponse.content 
        # Loading the response data into a dict variable
        jData = json.loads(jData) 
        print("The API Get Request Was Successful")
        print("\n")
        return jData
        
    else:
        # If response code is not ok (200), print the resulting http error code with description
        print('API Error')
        return ''
    
#Web Scrape experimentation - https://realpython.com/blog/
def screen_scrape(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    try:
        with closing(requests.get(url, stream=True)) as resp:
            if is_good_response(resp):
                print('http request successful')
                return resp.content
            else:
                return None

    except RequestException as e:
        print('Error during requests to {0} : {1}'.format(url, str(e)))
        return None

def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)

#Create functions to Read/Write CSV & Excel Data using Pandas 
def file_read_csv(path):
    print("Pandas File I/O Example - CSV Read")
    #load csv file into Pandas dataframe object
    data=pd.read_csv(path)
    return data

def file_read_excel(path,sheet):
    print("Pandas File I/O Example - Excel Read")
    #load csv file into Pandas dataframe object
    xlsx = pd.ExcelFile(path)
    data = pd.read_excel(xlsx, sheet)
    return data

In [268]:
#CRISP-DM Step 2b & 2c - transform data to place in Pandas dataset (Step 2a), 
#for further understanding and model prep/usage (Step 2b)
#"get" data from API endpoint 
data = API_get('https://jsonplaceholder.typicode.com/todos')
print("Check Response - Print the title of the 10th entry in returned dictionary: {}".format(data[10]['title']))

#place python dictionary in pandas dataframe  
data_API = pd.DataFrame(data)
print(data_API.head())

#Step 2c - Evaluate API data for furter text (or other) analysis
print ("Row axis labels and column axis labels are:")
print(data_API.axes)
print ("The data types of each column are:")
print(data_API.dtypes)
print ("The shape of the object is: r x c")
print(data_API.shape)
print(data_API.describe())  #numerically describe columns in dataset

#Visually Graph & discover in another tool such as tableau - export transformed/cleaned data

The API Get Request Was Successful


Check Response - Print the title of the 10th entry in returned dictionary: vero rerum temporibus dolor
  completed  id                                              title  userId
0     False   1                                 delectus aut autem       1
1     False   2                 quis ut nam facilis et officia qui       1
2     False   3                                fugiat veniam minus       1
3      True   4                                   et porro tempora       1
4     False   5  laboriosam mollitia et enim quasi adipisci qui...       1
Row axis labels and column axis labels are:
[RangeIndex(start=0, stop=200, step=1), Index(['completed', 'id', 'title', 'userId'], dtype='object')]
The data types of each column are:
completed      bool
id            int64
title        object
userId        int64
dtype: object
The shape of the object is: r x c
(200, 4)
               id      userId
count  200.000000  200.000000
mean   100.500000    5.500000
s

In [269]:
#CRISP-DM Step 2b & 2c - transform data to place in Pandas dataset (Step 2a), 
#for further understanding and model prep/usage (Step 2b)
#Web Scrape experimentation - https://realpython.com/python-web-scraping-practical-introduction/
raw_html = screen_scrape('https://finance.yahoo.com/quote/AAPL/history?p=AAPL')
#using BeautifulSoap to parse html elements and scrape web page
html = BeautifulSoup(raw_html, 'html.parser')
# Take out the <div> of name and get the stock's value
stock_date = html.findAll("td", attrs={"class": "Py(10px) Ta(start) Pend(10px)"})

#iterate through elements to get stock dates
stock_date = []
for i, text in enumerate(html.findAll("td", attrs={"class": "Py(10px) Ta(start) Pend(10px)"})):
    stock_date.insert(i,text.span.string)
#print(stock_date)

#create list to store other stock data
stock_data = []
for i, text in enumerate(html.findAll("td", attrs={"class": "Py(10px) Pstart(10px)"})):
    stock_data.insert(i,text.span.string)
#print(stock_data)

#combine date and stock information into dictionary data structure
i = 0
stock_info = {}
print(range(len(stock_date)))
for i in range(len(stock_date)-2):  #get stock info for every day
    stock_info[i] = {"date": stock_date[i], "open": stock_data[(i*6)+0], "close": stock_data[(i*6)+3], "volume": stock_data[(i*6)+5]}
    #print(stock_info[i])
    
#place python dictionary in pandas dataframe
data_scrape = pd.DataFrame(stock_info)
print(data_scrape.head())  #probably not the row x column format we want!
data_scrape2 = data_scrape.T #transpose dataset to get column names "on top" of dataframe
print(data_scrape2.head())
print(data_scrape2.date) #get all dates in dataset

#Step 2c - Evaluate scraped data for furter text (or other) analysis
print ("Row axis labels and column axis labels are:")
print(data_scrape.axes)
print ("The data types of each column are:")
print(data_scrape.dtypes)
print ("The shape of the object is: r x c")
print(data_scrape.shape)
print(data_scrape.describe())  #numerically describe columns in dataset

#Visually Graph & discover in another tool such as tableau - export transformed/cleaned data

http request successful
range(0, 100)
                  0             1             2             3             4   \
close         170.18        170.89        169.43        170.41        170.94   
date    Feb 13, 2019  Feb 12, 2019  Feb 11, 2019  Feb 08, 2019  Feb 08, 2019   
open          171.39        170.10        171.05        168.99        172.40   
volume    22,479,799    22,229,900    20,993,400    23,820,000    31,741,700   

                  5             6             7             8             9   \
close         174.24        174.18        171.25        166.52        166.44   
date    Feb 07, 2019  Feb 06, 2019  Feb 05, 2019  Feb 04, 2019  Feb 01, 2019   
open          174.65        172.86        167.41        166.96        166.11   
volume    28,239,600    36,101,600    31,495,500    32,668,100    40,739,600   

            ...                 88            89            90            91  \
close       ...             224.29        227.99        232.07        229.28   


In [270]:
#CRISP-DM Step 2b & 2c - transform data to place in Pandas dataset (Step 2a), 
#for further understanding and model prep/usage (Step 2b)
#Pandas CSV Example
data_xls = file_read_excel('C:\\Python\\Data\\indicator_stock_hist.xls','^DJI') #older version of excel - can use new .xlsx
print(data_xls.head())

#Step 2c - Evaluate excel data for furter text (or other) analysis
print ("Row axis labels and column axis labels are:")
print(data_xls.axes)
print ("The data types of each column are:")
print(data_xls.dtypes)
print ("The shape of the object is: r x c")
print(data_xls.shape)
print(data_xls.describe())  #numerically describe columns in dataset

#Visually Graph & discover in another tool such as tableau - export transformed/cleaned data

Pandas File I/O Example - Excel Read
        Date     Open     High      Low    Close      Volume  Adj Close  \
0 2009-04-23  7886.81  8015.36  7762.80  7957.06  6563100000    7957.06   
1 2009-04-22  7964.78  8111.02  7802.46  7886.57  7327860000    7886.57   
2 2009-04-21  7841.73  8027.54  7699.79  7969.56  7436489600    7969.56   
3 2009-04-20  8128.94  8128.94  7801.58  7841.73  6973960000    7841.73   
4 2009-04-17  8125.43  8251.20  8024.92  8131.33  7352009600    8131.33   

  Symbol  log2_open  d1_cls_chg  d5_cls_chg  d10_cls_chg  d30_cls_chg  \
0    DJI   3.896901       70.49     -168.37       119.95      1026.66   
1    DJI   3.901174      -82.99     -143.05        97.01       960.08   
2    DJI   3.894412      127.83       49.38        -6.29      1422.51   
3    DJI   3.910034     -289.60     -216.08      -175.86      1214.79   
4    DJI   3.909846        5.90       47.95       153.25      1536.89   

    d1_vol_chg  d5_cls_chg.1  d10_cls_chg.1  d30_cls_chg.1  d1_cls_pcnt_c

In [271]:
#CRISP-DM Step 2b & 2c - transform data to place in Pandas dataset (Step 2a), 
#for further understanding and model prep/usage (Step 2b)
#Pandas CSV Example - Step 2b
data_csv = file_read_csv('C:\\Python\\Data\\Text_Mining_Sample_CSV.csv') #pandas read_csv function automatically returns a dataframe!
print(data_csv.head())

#Step 2c - Evaluate csv data for furter text (or other) analysis
print ("Row axis labels and column axis labels are:")
print(data_csv.axes)
print ("The data types of each column are:")
print(data_csv.dtypes)
print ("The shape of the object is: r x c")
print(data_csv.shape)
print(data_csv.describe())  #numerically describe columns in dataset
print(data_csv['Comment'].isnull()) #Check for missing values'

#Visually Graph & discover in another tool such as tableau - export transformed/cleaned data

Pandas File I/O Example - CSV Read
        ID  Branch Interview Date Transaction Date Comment Type  \
0  4278642     355      2/24/2015        2/23/2015   Compliment   
1  3329834     311      4/10/2014         4/9/2014   Compliment   
2  4182303     353      1/14/2015        1/13/2015   Compliment   
3  4228554     318       2/4/2015         2/3/2015   Compliment   
4  3860433     351       9/4/2014         9/3/2014   Compliment   

                                             Comment Follow-up  \
0  MY BANK is always good to me. I have banked wi...       NaN   
1  MY BANK is the best for me. They help people w...       NaN   
2  MY BANK has been 100 percent on top on any ban...       NaN   
3  Absolutely no problems with them. Everything h...       NaN   
4  Absolutely. They are efficient, courteous and ...       NaN   

   Satisfaction Rating  
0                   10  
1                    8  
2                    8  
3                    9  
4                    9  
Row axis labels

In [272]:
#Send Data to uclassify API, for model build & evaluation
#request format:   
#curl -X POST -H "Authorization:Token YOUR_READ_API_KEY_HERE" -H "Content-Type: application/json" --data "{insert here}" 
def API_put_uclassify(data):
    api_key = 'uMxNaxJ9eYVL'
    headers = {'Content-type': 'application/json', 'Authorization': 'Token ' + api_key}
    view_url = 'https://api.uclassify.com/v1/uclassify/sentiment/classify'
    print('API Put - convert dict data structure to json') 
    json2 = json.dumps(data) 
    #print(json2)
    myResponse = requests.post(view_url, data=json2, headers=headers)
    print (myResponse.status_code)
    # For successful API call, response code will be 200 (OK)
    if(myResponse.status_code == 200):
        print("The API Put Request Was Successful")
        return myResponse
    else:
        #If response code is not ok (200), print the resulting http error code with description
        print("The API Put Request Was Not Successful")
        return myResponse

In [273]:
api_key = 'uMxNaxJ9eYVL'
headers = {'Content-type': 'application/json', 'Authorization': 'Token ' + api_key}
print(headers)

{'Content-type': 'application/json', 'Authorization': 'Token uMxNaxJ9eYVL'}


In [274]:
#step 3a) Build Model, then 3b) test & evaluate
#note - for UClassify, split into test/train dataset, then compare results
#in general - will build a model using data science libraries such as Scikit-Learn, then use corresponding test statistics
#first, get text-only data from set(s) obtained in CRISP-DM step 2 - Convert to Dict Structure
text_dict = data_csv['Comment'].to_dict()
#print(text_dict[0])
uclassify_text_all = ""
uclassify_text_test1 = ""
uclassify_text_test2 = ""
#step 3b) split pandas dataset into test/train, to evaluate model quality on dataset 
for key in text_dict:
    uclassify_text_all += text_dict[key]
    if random() < .5:
        uclassify_text_test1 += text_dict[key]
    else:
        uclassify_text_test2 += text_dict[key]

#preprocess text (if wanted/required), then place in dictioary for conversion to json in api call
uclassify_dict_all = {}
uclassify_dict_all["texts"]=[uclassify_text_all]

uclassify_dict_test1 = {}
uclassify_dict_test1["texts"]=[uclassify_text_test1]

uclassify_dict_test2 = {}
uclassify_dict_test2["texts"]=[uclassify_text_test2]

#print(json.dumps(uclassify_dict))  <-- how to convert diction to json data structure
#print(uclassify_dict_all)
#print(uclassify_dict_test1)
#print(uclassify_dict_test2)

#step 3a/b) u-classify models are pre-trained/built, so call rest api to run model on data 
json1 = API_put_uclassify(uclassify_dict_all)
json_all = json.loads(json1.text)[0] #returs dictionary
print(json_all) #returned json response

json2 = API_put_uclassify(uclassify_dict_test1)
json_test1= json.loads(json2.text)[0] #returs dictionary
print(json_test1) #returned json response

json3 = API_put_uclassify(uclassify_dict_test2) 
json_test2= json.loads(json3.text)[0] #returs dictionary
print(json_test2) #returned json responsey

API Put - convert dict data structure to json
200
The API Put Request Was Successful
{'textCoverage': 0.938696, 'classification': [{'className': 'negative', 'p': 0.787717}, {'className': 'positive', 'p': 0.212283}]}
API Put - convert dict data structure to json
200
The API Put Request Was Successful
{'textCoverage': 0.950031, 'classification': [{'className': 'negative', 'p': 0.784199}, {'className': 'positive', 'p': 0.215801}]}
API Put - convert dict data structure to json
200
The API Put Request Was Successful
{'textCoverage': 0.948675, 'classification': [{'className': 'negative', 'p': 0.790501}, {'className': 'positive', 'p': 0.209499}]}


In [275]:
#step 6a) Store results of model build, for evaluation purposes
#create list, using [classifier, trial_name, text_coverage, negative_p, positive_p] convention
trial_all = ['Sentiment', 'All', 1, json_all['textCoverage'], json_all['classification'][0]['p'], json_all['classification'][1]['p']]
print(trial_all) #returned json response
trial_test1 = ['Sentiment', 'Test1', 1, json_test1['textCoverage'], json_test1['classification'][0]['p'], json_test1['classification'][1]['p']]
print(trial_test1) #returned json response
trial_test2 = ['Sentiment', 'Test2', 1, json_test2['textCoverage'], json_test2['classification'][0]['p'], json_test2['classification'][1]['p']]
print(trial_test2) #returned json response

['Sentiment', 'All', 1, 0.938696, 0.787717, 0.212283]
['Sentiment', 'Test1', 1, 0.950031, 0.784199, 0.215801]
['Sentiment', 'Test2', 1, 0.948675, 0.790501, 0.209499]


In [276]:
#step 6b) export results of analysis to google fusion table, for use in data products
#Step 1 - Identify Resource: set url for json request
view_url = 'https://accounts.google.com/o/oauth2/token'
#Step 2 & 3 - Identify Endpoints, Methods and Set request parameters
client_id='615800458288-5fktbjo6kmu18bpgl18glnj2l12mvg8i.apps.googleusercontent.com'
client_secret='QowLGwV4wueqHpV_t1sotQVh'
refresh_token = '1/vz7AvnbcT05ZJI-SQcFACqhaif9hXCfeuTM8n1DI12E'
refresh_body = "refresh_token=" + refresh_token + '&client_id=' + client_id + '&client_secret=' + client_secret + '&grant_type=refresh_token'

#Step 1-3) Google Fusion Tables OAuth keys
def API_POST_OAuth():
    #set url for json request, then obtain json response/payload
    headers = {'Content-type': 'application/x-www-form-urlencoded', 'Accept': 'text/plain'}
    myResponse = requests.post(view_url, data=refresh_body, headers=headers)
    # For successful API call, response code will be 200 (OK)
    if(myResponse.ok):
        # Loads (Load String) takes a Json file and converts into python data structure 
        # (dict or list, depending on JSON structure and number of records returned
        jData = myResponse.content 
        # Loading the response data into a dict variable
        jData = json.loads(jData) 
        print("The API Get Request Was Successful")
        print("\n")
        return jData
        
    else:
        # If response code is not ok (200), print the resulting http error code with description
        print('API Error')
        return ''

#Step 1-3) Google Fusion Table - Replace
def API_POST_Data(access_token, table_id, data):
    data_view_url = "https://www.googleapis.com/upload/fusiontables/v2/tables/" + table_id + '/import?&access_token=' + access_token + '&isStrict=false'
    headers = {'Content-type': 'application/octet-stream'}
    #create post response body - comma seperated string of all obs
    data2 = data[0] + ', ' + data[1] + ', ' + str(data[2]) + ', ' + str(data[3]) + ', ' + str(data[4]) + ', ' + str(data[5]) +'\n'
    myResponse = requests.post(data_view_url, data=data2, headers=headers)
    print (myResponse.status_code)
    # For successful API call, response code will be 200 (OK)
    if(myResponse.status_code == 200):
        return "The API Post Request Was Successful"
    else:
        #If response code is not ok (200), print the resulting http error code with description
        return "The API Post Request Was Not Successful"

#step 4 - create request data structure from csv file, to replace data in the "OSDA Stock History" google fusion table
#https://fusiontables.google.com/data?docid=1a8EPfomscPkMYksFrlGoyU4utoT0QdLCpH9tySDP#rows:id=1 
#Obtain Google Fusion OAuth token 
data = API_POST_OAuth()
access_token = data['access_token']

#Put data in google fusion table
table_id = '1tTpKKRw9Fvo0fh2Pw7v2R8L9fgPOlK2bq7KMaS70'
#convert list results from model to dictionary - data = {trial_all, trial_test1, trial_test2}
status = API_POST_Data(access_token, table_id, trial_all)  #"put" data using API class
print(status)
status = API_POST_Data(access_token, table_id, trial_test1)  #"put" data using API class
print(status)
status = API_POST_Data(access_token, table_id, trial_test2)  #"put" data using API class
print(status)

The API Get Request Was Successful


200
The API Post Request Was Successful
200
The API Post Request Was Successful
200
The API Post Request Was Successful
