### Scraping the Escape from Tarkov wiki for tasks, hideout requirements, and trader information

General list of all the tasks

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import time

# Download the wiki's quest overview page and collect every table row on it.
task_URL = "https://escapefromtarkov.fandom.com/wiki/Quests"
# A timeout keeps a stalled connection from hanging the notebook indefinitely.
task_page = requests.get(task_URL, timeout=30)
# Stop here on an HTTP error instead of silently parsing an error page.
task_page.raise_for_status()
tasks = BeautifulSoup(task_page.content, "html.parser")
# One entry per <tr> found anywhere in the parsed page.
task_table = tasks.find_all("tr")

In [26]:
import pandas as pd
# Display every row of a DataFrame (None lifts the row limit); pass a number
# such as 50 instead to cap how many rows are rendered.
pd.set_option("display.max_rows", None)

In [3]:
# Rows at or beyond this position on the quests page are not task rows and are
# discarded. NOTE: this is tied to the current page layout; when the wiki adds
# more tasks this value has to be raised.
LAST_TASK_ROW = 360


def _cell_text(tag):
    """Return the text of a BeautifulSoup tag with newlines removed."""
    return tag.get_text().replace("\n", "")


name_arr = []
type_arr = []
link_arr = []
list_req_arr = []
kappa_req_arr = []

# Walk every table row of the main EFT wiki quests page and pull one value per
# output column. A row that lacks a piece of information gets the placeholder
# "none" (or an empty list for the requirements).
for row in task_table:
    headers = row.find_all("th")

    # Task name: text of the first header cell. A row without header cells
    # gets "none" rather than inheriting the name of the previous row.
    name_arr.append(_cell_text(headers[0]) if headers else "none")

    # Task type: text of the second header cell.
    type_arr.append(_cell_text(headers[1]) if len(headers) > 1 else "none")

    # Link: href of the first anchor in the row that carries one.
    anchor = row.find("a", href=True)
    link_arr.append(anchor["href"] if anchor is not None else "none")

    # Requirements: text of every <li> inside the row's first <td>.
    first_td = row.find("td")
    if first_td is None:
        list_req_arr.append([])
    else:
        list_req_arr.append([_cell_text(item) for item in first_td.find_all("li")])

    # Required for Kappa: text of the last header cell of the row.
    kappa_req_arr.append(_cell_text(headers[-1]) if headers else "none")


# Assemble the collected columns into the task table.
task_df = pd.DataFrame(
    {
        "Task Name": name_arr,
        "Link to task": link_arr,
        "Type": type_arr,
        "Requirements": list_req_arr,
        "Required for Kappa": kappa_req_arr,
    }
)

# The scrape currently also picks up unnecessary rows after the task tables;
# cut them off. More tasks added to the wiki in the future will break this.
task_df = task_df.drop(index=range(LAST_TASK_ROW, len(task_df)))

In [4]:
# Seconds to wait for a single task page before giving up on the request.
REQUEST_TIMEOUT_S = 30

main_wiki_link = "https://escapefromtarkov.fandom.com"
next_task_arr = []
level_req_arr = []

# Visit each task's own wiki page and extract
#   * the follow-up tasks listed in the infobox cell containing "Leads to:"
#   * the first number in the element following the "Requirements" heading.
# One session is reused so the many requests can share a connection.
with requests.Session() as session:
    for link in task_df["Link to task"]:
        task = []
        level_req = None

        # Rows without a link have no page to visit; requesting
        # main_wiki_link + "none" would build an invalid URL, so skip them.
        if link != "none":
            temp_link = main_wiki_link + link
            individual_page = session.get(temp_link, timeout=REQUEST_TIMEOUT_S)
            individual = BeautifulSoup(individual_page.content, "html.parser")

            # First infobox cell whose text contains "Leads to:", if any.
            info_boxes = individual.find_all("td", {"class": "va-infobox-content"})
            leads_cell = next(
                (td for td in info_boxes if "Leads to:" in td.get_text()), None
            )
            if leads_cell is not None:
                try:
                    # The first child node is skipped; of the remaining nodes
                    # keep only non-empty texts that are not the "-"
                    # placeholder and do not contain a parenthesis.
                    for node in leads_cell.contents[1:]:
                        text = node.get_text()
                        if text not in ("", "-") and "(" not in text:
                            task.append(text)
                except AttributeError:
                    # Best effort: a child node without get_text() discards
                    # the whole list for this task.
                    task = []

            # The level is only read when the third <h2> on the page holds the
            # element with id "Requirements"; pages with fewer headings or no
            # following sibling simply get no level.
            all_h2 = individual.find_all("h2")
            if len(all_h2) > 2 and all_h2[2].find(id="Requirements") is not None:
                details = all_h2[2].find_next_sibling()
                if details is not None:
                    numbers = re.findall(r'\d+', details.get_text())
                    if numbers:
                        level_req = numbers[0]

        next_task_arr.append(task)
        level_req_arr.append(level_req)

task_df["Leads to:"] = next_task_arr
task_df["Level Requirement"] = level_req_arr

In [5]:
trader_arr = [
    "Prapor",
    "Therapist",
    "Skier",
    "Peacekeeper",
    "Mechanic",
    "Ragman",
    "Jaeger",
    "Fence",
    "Lightkeeper",
]
trader_row_arr = []

# For every trader, record the index label of the first row in task_df whose
# link contains the trader's name.
for trader_name in trader_arr:
    link_mentions_trader = task_df["Link to task"].str.contains(trader_name)
    trader_row_arr.append(task_df[link_mentions_trader].index[0])

In [6]:
# Look up the link stored on each trader's row and pair it with the trader's
# name in a small two-column table.
trader_link_arr = [
    task_df.at[row_label, "Link to task"] for row_label in trader_row_arr
]

trader_df = pd.DataFrame(
    {
        "Trader Name": trader_arr,
        "Link to trader": trader_link_arr,
    }
)
trader_df

Unnamed: 0,Trader Name,Link to trader
0,Prapor,/wiki/Prapor
1,Therapist,/wiki/Therapist
2,Skier,/wiki/Skier
3,Peacekeeper,/wiki/Peacekeeper
4,Mechanic,/wiki/Mechanic
5,Ragman,/wiki/Ragman
6,Jaeger,/wiki/Jaeger
7,Fence,/wiki/Fence
8,Lightkeeper,/wiki/Lightkeeper


In [7]:
# Label every row of task_df with a trader by walking the rows in order and
# advancing to the next trader each time that trader's row label is reached.
trader_labeling_arr = []
find = 1  # position in trader_row_arr of the next trader's row

for row_label in task_df.index:
    to_check = int(row_label)
    # Rows before the next trader's row keep the current trader. Once the last
    # trader is reached there is no further boundary to compare against, so
    # all remaining rows stay with that trader.
    if find < len(trader_row_arr) and to_check >= trader_row_arr[find]:
        find += 1
    trader_labeling_arr.append(trader_arr[find - 1])

task_df["Trader"] = trader_labeling_arr

In [8]:
# Remove the non-task rows from the table: each trader's own row and the row
# directly after it (presumably the column-title row of that trader's table;
# confirm against the page layout).
remove_arr = [row_label + 1 for row_label in trader_row_arr]

# `index=` already selects rows, so no `axis` argument is needed.
task_df = task_df.drop(index=remove_arr)
task_df = task_df.drop(index=trader_row_arr)
# Renumber so that row labels are consecutive positions again.
task_df = task_df.reset_index(drop=True)

In [9]:
# Invert the "Leads to:" relation: for every task, collect the row positions
# of the tasks that lead to it.
task_name_arr = task_df["Task Name"]

# Position of the first row carrying each task name, so every follow-up name
# is resolved with one dictionary lookup instead of a scan over all rows.
first_row_by_name = {}
for position, task_name in enumerate(task_name_arr):
    first_row_by_name.setdefault(task_name, position)

comes_from_arr = [[] for _ in range(len(task_df))]

for u, next_tasks in enumerate(task_df["Leads to:"]):
    for next_name in next_tasks:
        v = first_row_by_name.get(next_name)
        # Names that match no task in the table are ignored.
        if v is not None:
            comes_from_arr[v].append(u)

task_df["Previous task"] = comes_from_arr

In [12]:
# Export both tables as JSON arrays of row values. With orient="values" only
# the cell values are written, so no `index` argument is passed (the previous
# value was the string "true", not a boolean).
task_df.to_json("task.json", orient="values")
trader_df.to_json("trader.json", orient="values")

In [27]:
# Display the finished task table with all columns built by the cells above.
task_df

Unnamed: 0,Task Name,Link to task,Type,Requirements,Required for Kappa,Leads to:,Level Requirement,Trader,Previous task
0,Shooting Cans,/wiki/Shooting_Cans,Completion,"[Locate the Utyos machine gun on Ground Zero, ...",No,[Debut],,Prapor,[]
1,Debut,/wiki/Debut,Elimination,[Eliminate 5 Scavs all over the Tarkov territo...,Yes,"[Search Mission, Luxurious Life]",,Prapor,[0]
2,Luxurious Life,/wiki/Luxurious_Life,PickUp,"[Locate the liquor store on Ground Zero, Locat...",No,[Background Check],,Prapor,[1]
3,Properties All Around,/wiki/Properties_All_Around,PickUp,[Locate the real estate fund on Streets of Tar...,No,[],,Prapor,[43]
4,Search Mission,/wiki/Search_Mission,Exploration,"[Find Prapor's missing convoy on Woods, Locate...",Yes,[],5.0,Prapor,[1]
5,Background Check,/wiki/Background_Check,PickUp,[Obtain the Bronze pocket watch on Customs(Opt...,Yes,"[Shootout Picnic, Delivery From the Past]",2.0,Prapor,[2]
6,Shootout Picnic,/wiki/Shootout_Picnic,Elimination,[Eliminate 15 Scavs on Woods],Yes,[],3.0,Prapor,[5]
7,Delivery From the Past,/wiki/Delivery_From_the_Past,PickUp,[Obtain the secure folder in the Tarcone Direc...,Yes,[BP Depot],5.0,Prapor,[5]
8,BP Depot,/wiki/BP_Depot,Discovery,[Mark the first fuel tank with an MS2000 Marke...,Yes,"[Bad Rep Evidence, The Bunker - Part 1]",5.0,Prapor,[7]
9,The Bunker - Part 1,/wiki/The_Bunker_-_Part_1,Completion,"[Find the underground bunker, Locate the contr...",Yes,[The Bunker - Part 2],10.0,Prapor,[8]
