Wikimedia API

In [17]:
import requests
import json

In [18]:
#getting data on Alien Film Franchise from Wiki

def get_wikipedia_pages():
    search_term = "Alien film franchise"
    url = f"https://en.wikipedia.org/w/api.php?action=query&format=json&list=search&srsearch={search_term}"
    
    response = requests.get(url)
    data = response.json()
    
    pages = []
    
    if "query" in data and "search" in data["query"]:
        search_results = data["query"]["search"]
        
        for result in search_results:
            title = result["title"]
            snippet = result["snippet"]
            page_id = result["pageid"]
            
            # Create a dictionary for each page and add it to the list
            page_info = {
                "title": title,
                "snippet": snippet,
                "page_id": page_id
            }
            pages.append(page_info)
    
    return pages

# Call the function to get the list of Wikipedia pages
alien_pages = get_wikipedia_pages()

# Save the data to a JSONL file
with open("alien_pages.jsonl", "w") as jsonl_file:
    for page in alien_pages:
        jsonl_file.write(json.dumps(page) + "\n")

print("Data saved as alien_pages.jsonl successfully.")



Data saved as alien_pages.jsonl successfully.


In [23]:
# writing data to a file

with open('alien_pages.jsonl', 'r') as input_file,\
    open("GetAlienPages.jsonl", 'w') as output_file:
    
    for line in input_file.readlines():
        line_dict = json.loads(line)
        page_title = line_dict["title"]
        
        print(f"now working on: {page_title}")
        api_answers = get_article_revision_json(page_title)
        for api_answer in api_answers:
            print(json.dumps(api_answer), file=output_file)


now working on: Alien (franchise)
now working on: Untitled Alien film
now working on: List of Alien (franchise) novels
now working on: Alien vs. Predator
now working on: Predator (franchise)
now working on: Prometheus (2012 film)
now working on: Alien: Covenant
now working on: Alien vs. Predator (film)
now working on: Aliens (film)
now working on: Alien (film)


In [63]:
# re-coding so we can read the data 
revisions = []

with open("GetAlienPages.jsonl", 'r') as input_file:
    for line in input_file.readlines():
        api_answer = json.loads(line)
        
        pages = api_answer["query"]["pages"]
        
        for page_id in pages.keys():
            query_revisions = pages[page_id]["revisions"]
            title = pages[page_id]['title']
            
            for rev in query_revisions:
                if "userhidden" in rev:
                    continue
                    
                rev["title"] = title # add a key called title 
                
                if "anon" in rev:
                    rev["anon"] = True
                else:
                    rev["anon"] = False
                    
                if "minor" in rev:
                    rev["minor"] = True
                else:
                    rev["minor"] = False
                    
                rev["timestamp"] = rev["timestamp"].replace("T", "")
                rev["timestamp"] = rev["timestamp"].replace("Z", "")
                
                revisions.append(rev)

In [64]:
num_edits = len(revisions)
num_edits

33854

In [65]:
# anonymous edits 

num_anon = 0

for rev in revisions:
    if rev["anon"]:
        num_anon = num_anon + 1
        
prop_anon = num_anon / num_edits

print(f"{prop_anon}% of the edits were made by users without accounts")


0.35682637206829326% of the edits were made by users without accounts


In [66]:
# minor edits 

num_minor = 0

for rev in revisions:
    if rev["minor"]:
        num_minor = num_minor + 1
        
prop_minor = num_minor / num_edits

print(f"{prop_minor}% of the edits were minor")


0.19170555916582974% of the edits were minor


In [67]:
# visualization 

edits_by_day = {}
for rev in revisions:
    day_string = rev['timestamp'][0:10]

    if day_string in edits_by_day.keys():
        edits_by_day[day_string] = edits_by_day[day_string] + 1
    else:
        edits_by_day[day_string] = 1

In [68]:
with open("EditOverTime.tsv", "w", encoding='utf-8') as output_file:
    print("date\tedits", file=output_file)

    for day_string in edits_by_day.keys():
        print("\t".join([day_string, str(edits_by_day[day_string])]), file=output_file)

https://docs.google.com/spreadsheets/d/1YJYz4E7tth8NZoIApH6S3sfR7CGt1OZyU2NWgmHWes4/edit?usp=sharing

FANDOM

In [69]:
# collecting data on Alien Franchise from Fandom API

def get_article_revision_json(title):
    api2_answers = []
    
    fandom_api_url = "https://alienanthology.fandom.com/api.php" 
    
    parameters = {'action' : 'query',
                  'titles' : title,
                  'prop' : 'revisions',
                  'rvprop' : 'flags|timestamp|user|size|ids',
                  'rvlimit' : 500,
                  'format' : 'json'
                   }
    
    while True: 
        
        call = requests.get(fandom_api_url, params=parameters)
        api2_answer = call.json()
        
        api2_answers.append(api2_answer)
        
        if 'continue' in api2_answer.keys():
            parameters.update(api2_answer['continue'])
        else:
            break

    return(api2_answers)


In [70]:
# reading the file and adding the new data to it
with open('alien_pages.jsonl', 'r') as input_file,\
    open("alienPages_revisions.jsonl", 'w') as output_file:
    
    for line in input_file.readlines():
        line_dict = json.loads(line)
        page_title = line_dict["title"]
        
        print(f"now working on: {page_title}")
        api2_answers = get_article_revision_json(page_title)
        for api2_answer in api2_answers:
            print(json.dumps(api2_answer), file=output_file)



now working on: Alien (franchise)
now working on: Untitled Alien film
now working on: List of Alien (franchise) novels
now working on: Alien vs. Predator
now working on: Predator (franchise)
now working on: Prometheus (2012 film)
now working on: Alien: Covenant
now working on: Alien vs. Predator (film)
now working on: Aliens (film)
now working on: Alien (film)


In [73]:
# revisions that failed this time (idk why)
revisionsFantom = []

with open("alienPages_revisions.jsonl", 'r') as input_file:
    for line in input_file.readlines():
        api_answer = json.loads(line)
        
        # get the list of pages from the json object
        pages = api_answer["query"]["pages"]

        # for every page, (there should always be only one) get its revisions:
        for page_id in pages.keys():
            query_revisions = pages[page_id]["revisionsFantom"]
            title = pages[page_id]['title']

            # for every revision, first we do some cleaning up
            for rev in query_revisions:
                #print(rev)
                # let's continue/skip this revision if the user is hidden
                if "userhidden" in rev.keys():
                    continue
                
                # 1: add a title field for the article because we're going to mix them together
                rev["title"] = title

                # 2: let's "recode" anon so it's true or false instead of present/missing
                if "anon" in rev.keys():
                    rev["anon"] = True
                else:
                    rev["anon"] = False

                # 3: let's recode "minor" in the same way
                if "minor" in rev.keys():
                    rev["minor"] = True
                else:
                    rev["minor"] = False

                # we're going to change the timestamp to make it work a little better in excel/spreadsheets
                rev["timestamp"] = rev["timestamp"].replace("T", " ")
                rev["timestamp"] = rev["timestamp"].replace("Z", "")

                # finally, save the revisions we've seen to a varaible
                revisions.append(rev)

KeyError: 'revisionsFantom'

Hmmm isn't this the same code that mako used and I used for the other one before? What's wrong?

3_ Progress on Final Project

1- my project's end result will not be a data visulization table, I will use the data I collect to generate speculative data.

2- I will collect written data on Islamic Artifacts and Visual data on them.

3_ a) I think only one call from my API's will provide me with more that enough data to start with
   b) I'm not using this but I think something similar to what Mako showed with the continue in the code will be used for this (probably depending on the API documentation)
   


In [43]:
import requests
import json

def get_islamic_artifacts():
    search_term = "Islamic artifacts"
    url = f"https://en.wikipedia.org/w/api.php?action=query&format=json&list=search&srsearch={search_term}"
    
    response = requests.get(url)
    data = response.json()
    
    artifacts = []
    
    if "query" in data and "search" in data["query"]:
        search_results = data["query"]["search"]
        
        for result in search_results:
            title = result["title"]
            snippet = result["snippet"]
            page_id = result["pageid"]
            
            # Create a dictionary for each artifact and add it to the list
            artifact_info = {
                "title": title,
                "snippet": snippet,
                "page_id": page_id
            }
            artifacts.append(artifact_info)
    
    return artifacts

# Call the function to get the list of Islamic artifacts
islamic_artifacts = get_islamic_artifacts()

# Save the data to a JSONL file
with open("islamic_artifacts.jsonl", "w") as jsonl_file:
    for artifact in islamic_artifacts:
        jsonl_file.write(json.dumps(artifact) + "\n")

print("Data saved as islamic_artifacts.jsonl successfully.")

Data saved as islamic_artifacts.jsonl successfully.


5_ 3.1 kB
This is only for a couple of pages that this call collected for me, the file I'll be using will probably be around 4mB (based on the data texts that i have collected in my previous projects?) or more. I don't think this is going to be a problem since this is not really a very large file for python. (I assume)