# JSON and CSV

Last week we talked about the the dictionary structure in Python.  How it can be used to collect data in key/value pairs.  These pairs can be quite valuable, particularly when you have many things to count.  So instead of having many separate accumulator structures, you can neatly store them all in a dictionary and let it automatically grow to hold more things as you are going over your content.

Now we're going to go through and try to discover looking at it in context along with how these can be used with CSVs.  

It is often the case that you are given a dataset in JSON format, but you need to get it into a CSV or rectangular format for analysis.  This is what you'll be doing for your homework, and we'll be walking through a few key points about each along the way.

In [1]:
import requests
import json
import time
from bs4 import BeautifulSoup
import statistics
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer

In [2]:


def create_search_url(term, offset):
    left = "https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch="
    right = "&srwhat=text&srlimit=500&format=json"
    sanitize = "%20".join(term.split())
    offsetchunk = "&sroffset=" + str(offset) 
    return left + sanitize + right + offsetchunk

def get_pages(term):
    results = []
    offset = 0
    cont = True
    while cont:
        url = create_search_url(term, offset)
        print(url)
        r = requests.get(url)
        datadump = json.loads(r.text)

        if 'continue' in datadump:
            results.append(datadump['query']['search'])
            offset = datadump['continue']['sroffset']
            time.sleep(3)
        else:
            results.append(datadump['query']['search'])
            time.sleep(3)
            cont = False 
    return results
            
def get_clean_snippets(chunks):
    snippets = []

    for chunk in chunks:
        subset = [BeautifulSoup(d['snippet'], "lxml").text for d in chunk]
        snippets += subset
    
    return snippets

def get_sentiment_stats(snippets):
    senti = []

    for s in snippets:
        blob = TextBlob(s, analyzer = NaiveBayesAnalyzer())
        senti.append(blob.sentiment.polarity)
    
    return statistics.mean(senti), statistics.stdev(senti)

def write_results(pageresults, filename):
    j = []

    for chunk in pageresults:
        for d in chunk:
            j.append(d)

    with open(filename, 'w') as fout:
        json.dump(j, fout, indent=2)

In [3]:
pages = get_pages("cocker spaniel AND dog")

https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=cocker%20spaniel%20AND%20dog&srwhat=text&srlimit=500&format=json&sroffset=0


In [4]:
print(len(get_clean_snippets(pages)))

337


In [5]:
len(pages)

1

In [6]:
write_results(pages, 'cockerspanielresults.json')

In [7]:
with open('alljsonresults.json', 'w') as fout:
    json.dump(j, fout, indent=2)

NameError: name 'j' is not defined

In [None]:
from textblob import TextBlob


senti = []

for s in snippets:
    blob = TextBlob(s)
    senti.append(blob.sentiment.polarity)
    

In [None]:
import statistics

In [None]:
print(statistics.stdev(senti))

In [None]:
print(statistics.mean(senti))

In [None]:
pitbull = get_clean_snippets(get_pages("husky"))

In [None]:
pbmean, pbsd = get_sentiment_stats(pitbull)

In [None]:
pbmean, pbsd

In [None]:
corgi = get_clean_snippets(get_pages("cocker spaniel AND dog"))

In [None]:
shibamean, shibasd = get_sentiment_stats(get_clean_snippets(pages))

In [None]:
shibamean, shibasd

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
plt.scatter([1,2,3], [4,5,6])

In [None]:
senti = []

for s in get_clean_snippets(pages):
    blob = TextBlob(s, analyzer = NaiveBayesAnalyzer())
    pos = blob.sentiment.p_pos
    print(pos, blob)
    

In [None]:
senti[0]

In [None]:
pos = [s[0] for s in senti]
neg = [s[1] for s in senti]

plt.scatter(pos, neg)

In [None]:
senti