## 1 Wikipedia Page View API

In [12]:
import requests
import json

#### 1. Identify a famous person who has been famous for at least a few years and that you have some personal interest in. Use the Wikimedia API to collect page view data from the English Wikipedia article on that person. Now use that data to generate a time-series visualization and include a link to it in your notebook.

In [10]:
def get_wikipedia_pageviews (page_title):

    url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/all-access/user/{page_title}/daily/20191101/20230428"
    
    headers = {
        'User-Agent': 'python data collection bot by jvera@uw.edu'
    }
    
    response = requests.get(url, headers=headers)

    if not response.status_code == 200:
        print("ERROR. Request not OK.")
    
    data = response.json()
    return data

pages = ["Pedro_Pascal"]

with open("pedro_pascal_views.jsonl", "w") as file:
    for page_title in pages:
        data = get_wikipedia_pageviews(page_title)
        json_string = json.dumps(data)
        print(json_string, file=file)
        #print(data)
        
#I need to turn this into a file that can be read by GDocs

day_dicts = []
with open("pedro_pascal_views.jsonl", "r") as input_file:
    for line in input_file.readlines():
        new_data = json.loads(line)
        day_dicts = day_dicts + new_data['items']

def clean_up_timestamp(day): #timestamp cleanup
    new_time_stamp = day[0:4] + "-" + day[4:6] + "-" + day[6:8]
    return new_time_stamp

views_by_day = {}
for day_dict in new_data['items']:
    day = clean_up_timestamp(day_dict['timestamp'])
    views_by_day[day] = day_dict['views']

#print(day_dicts)
    
with open('pedro_pascal_views.tsv', 'w') as output_file:
    print("day\tviews", file = output_file)
    for day in views_by_day:
        print(day, "\t", views_by_day[day], file=output_file)

#GOOGLEDOC LINK
#https://docs.google.com/spreadsheets/d/1WoI5rpWkowKyNCiLtGBPfo9YoNkRV-nlp7veDHEfSZA/edit?usp=sharing

#### 2. Identify 2 other languages editions of Wikipedia that have articles on that person. Collect page view data on the article in other languages and create a single visualization that shows how the dynamics and similar and/or different. (Note: My approach involved creating a TSV file with multiple columns.)

In [16]:
def get_wikipedia_pageviews_intl(project, page_title):

    url_intl = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/{project}/all-access/user/{page_title}/daily/2011101/20230428"
    #print(url_intl)
    headers = {
        'User-Agent': 'python data collection bot by jvera@uw.edu'
    }
    
    response = requests.get(url_intl, headers=headers)

    if not response.status_code == 200:
        print("ERROR. Request not OK.")
        
    data_intl = response.json()
    return data_intl


page_title = "Pedro_Pascal"
locations_intl = ["en.wikipedia.org"] 
#"de.wikipedia.org", "en.wikipedia.org" was going to have it loop thru the list

with open("pedro_pascal_views_intl_en.jsonl", "w") as file_intl:
    #print("entering loop")
    for project_intl in locations_intl:
        #print(f"getting data for {project_intl}")
        data_intl = get_wikipedia_pageviews_intl(project_intl, page_title)
        json_string_intl = json.dumps(data_intl)
        #print(f"creating file for {project_intl} {page_title}")         
        print(json_string_intl, file=file_intl)

print(json_string_intl)
#begin processing

day_dicts_intl = []
with open("pedro_pascal_views_intl_en.jsonl", 'r') as input_file_intl:
    for line_intl in input_file_intl.readlines():
        new_data_intl = json.loads(line_intl)
        day_dicts_intl = day_dicts_intl + new_data_intl['items']

#timestamp cleanup
def clean_up_timestamp_intl(day_intl):
    new_time_stamp_intl = day_intl[0:4] + "-" + day_intl[4:6] + "-" + day_intl[6:8]
    return new_time_stamp_intl

views_by_day_intl = {}
for day_dict_intl in new_data_intl['items']:
    day_intl = clean_up_timestamp_intl(day_dict_intl['timestamp'])
    views_by_day_intl[day_intl] = day_dict_intl['views']


#print(views_by_day_intl)

#write file
with open('pedro_pascal_views_intl_en.tsv', 'w') as output_file:
    print("day\tviews", file=output_file)
    for day_intl in views_by_day_intl:
        print(f"{day_intl}\t{views_by_day_intl[day_intl]}", file=output_file)

#How would i add multiple values to a single key? not possible with dict? 
#OK, so I'll do this three times and make 3 separate files, not sure how to do this more efficiently

#GoogleDoc: https://docs.google.com/spreadsheets/d/1JRMEccVzvaiqvi_Uc_BeJnbo2LuCOzPBYDtXfxBb0zI/edit?usp=sharing

#EN has a lot more pageviews than FR and DE! Spikes in EN pretty much coincide with the spikes in FR and DE.


#### 3. Collect page view data on the articles about Marvel Comics and DC Comics in English Wikipedia. (If you'd rather replace these examples with some other comparison of popular rivals, that's just as good!)
###### Which has more total page views in 2022?
###### Can you draw a visualization in a spreadsheet that shows this? (Again, provide a link.)
###### Were there any years when 2022's more popular page was instead the less popular of the two? How many and which ones?
###### Were there any months was this reversal of relative popularity occurred? How many and which ones?
###### How about any days? How many?


In [None]:
def get_wikipedia_pageviews_comics (page_title):

    url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/all-access/user/{page_title}/daily/20220101/20221231"
    
    headers = {
        'User-Agent': 'python data collection bot by jvera@uw.edu'
    }
    
    response = requests.get(url, headers=headers)

    if not response.status_code == 200:
        print("ERROR. Request not OK.")
    
    data = response.json()
    return data

pages = ["Marvel_Comics"] # also"Marvel_Comics" 

with open("marvel_comics.jsonl", "w") as file:
    for page_title in pages:
        data = get_wikipedia_pageviews_comics(page_title)
        json_string = json.dumps(data)
        print(json_string, file=file)
        #print(data)
        
#I need to turn this into a file that can be read by GDocs

day_dicts = []
with open("marvel_comics.jsonl", "r") as input_file:
    for line in input_file.readlines():
        new_data = json.loads(line)
        day_dicts = day_dicts + new_data['items']

def clean_up_timestamp(day): #timestamp cleanup
    new_time_stamp = day[0:4] + "-" + day[4:6] + "-" + day[6:8]
    return new_time_stamp

views_by_day = {}
for day_dict in new_data['items']:
    day = clean_up_timestamp(day_dict['timestamp'])
    views_by_day[day] = day_dict['views']

#print(day_dicts)
    
with open('marvel_comics.tsv', 'w') as output_file:
    print("day\tviews", file = output_file)
    for day in views_by_day:
        print(day, "\t", views_by_day[day], file=output_file)

#GOOGLEDOC LINK
#https://docs.google.com/spreadsheets/d/1eQozhTqMhOfWDFr6PsssKzaodILCcJJ786fv3bp7YXQ/edit?usp=sharing

# Q1. Most pageviews? Marvel. (Total = 3,597,777, DC = 1,793,582, Marvel = 1,804,195)
# Q2/Q3. I pulled 2022 only, so lets go with months on Q2. Seems like a half-and-half split. DC was 
# consistently gettng more views than Marvel in Q4 of last year.

#### 4. Alt-Rock bands from WA state
##### Download the file (click "raw" and then save the file onto your drive). Now read it in, and request monthly page view data from all of them. If you need some help with loading it in, I've included some sample code at the bottom of this page.
##### Once you've done this, sum up all of the page views from all of the pages and print out a TSV file with these total numbers.
##### You know the routine by now! Now, make a time series graph of these numbers and include a link in your notebook.

In [184]:
### create a list of bands
band_dict = []
with open("list_of_washington_alternative_rocks_bands_wikipedia-2023-04-25.jsonl", "r") as input_file:
    for line in input_file.readlines():
        new_data = json.loads(line)
        band_dict.append(new_data['page_title'])
#print (band_dict)

#go get the pageview data
def get_band_pageviews (page_title):

    #we're just going to do this for the last 3 months
    url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/all-access/user/{page_title}/daily/20230201/20230501"
    
    headers = {
        'User-Agent': 'python data collection bot by jvera@uw.edu'
    }
    
    response = requests.get(url, headers=headers)

    if not response.status_code == 200:
        print("ERROR. Request not OK.")
    
    data = response.json()
    return data


# FOR DEVELOPMENT PURPOSES, LIMIT TO 3 BANDS
pages = band_dict[:3]
#type(pages)


output_dict = {}
with open("band_list_pageviews.json", "w") as my_file:
    for page_title in pages:
        # get the data for each band and build a dictionary
        data = get_band_pageviews(page_title)
        output_dict[page_title] = data
    # write the huge dict into a file
    json_string = json.dumps(output_dict)
    print(json_string, file=my_file)

### Processss

band_list = []
with open("band_list_pageviews.json", "r") as input_file:
    for line in input_file.readlines():
        new_data = json.loads(line)
        band_list = band_list + new_data['page_title'] 
        ####i got lost here because my keys are page titles

views_by_band = {}
for band_list in new_data['page_title']:
    views_by_day = day_dict['views']

with open('band_list_pageviews.tsv', 'w') as output_file:
    print("day\tviews", file = output_file)
    for day in views_by_day:
        print(day, "\t", views_by_day[day], file=output_file)

KeyError: 'items'

### 2 - Starting on my project

##### q1 - I am working with Reddit data and will use the Pushshift API per your suggestions
##### q2 - Documentation: https://pushshift.io/api-parameters/
##### q3 - There are three main endpoints:/reddit/search/comment/, /reddit/search/submission/. I'm not 100% sure of the scope of my project, but at minimum, I plan on using: q, mod_removed, parent_id, link_id, nest_level, subreddit_subscribers, description.
##### q4 - Yes, looks like 'praw'
##### q5 - nope!
##### q6 - yes, 60 requests per minute
##### q7 - see below!


In [181]:
import praw
import requests

query="science" #Define query
url = f"https://api.pushshift.io/reddit/search/comment/?q={query}"
request = requests.get(url)
json_response = request.json()
json_response

{'data': [{'subreddit_id': 't5_2ud49',
   'author_is_blocked': False,
   'comment_type': None,
   'edited': False,
   'author_flair_type': 'text',
   'total_awards_received': 0,
   'subreddit': 'PlantBasedDiet',
   'author_flair_template_id': None,
   'id': 'jifn4d5',
   'gilded': 0,
   'archived': False,
   'collapsed_reason_code': None,
   'no_follow': True,
   'author': 'Charlieinhisownworld',
   'send_replies': True,
   'parent_id': 42471634649,
   'score': 1,
   'author_fullname': 't2_cq6bbe52',
   'all_awardings': [],
   'body': "Carbs are not the enemy nor do they cause diabetes.  The fats cause diabetes and calories take you up and down the scale. We do not need to eat all the time and if you feel that you are hungry all the time you may want to start evaluating the micronutrients that you are or are not consuming.  Lots of greens and large salads were key for me. Monomeals also played a huge part in me defeating my food addictions. Tofu is a great source of protein but we get 