In [19]:
import bs4 as bs
import pandas as pd
import requests
import re
import numpy as np

In [2]:
def get_teamnames(year):
    """
    Read the team names from the iGEM team-wiki overview page of one year.

    :param year: year as integer
    :return: list with the name of every team whose wiki is linked on the
             overview page for "year"
    """
    overview_url = f"https://igem.org/Team_Wikis?year={year}"
    response = requests.get(overview_url)
    overview = bs.BeautifulSoup(response.content,"html.parser")
    wiki_link_pattern = re.compile(f"{year}.igem.org/Team:")
    # The team name is whatever follows the last ":" of the last path segment.
    return [anchor["href"].split("/")[-1].split(":")[-1]
            for anchor in overview.find_all(href=wiki_link_pattern)]

In [3]:
def return_linklist(year,team):
    """
    Build the wiki page URLs that are evaluated for one team.

    :param year: year as integer
    :param team: team name as string
    :return: list of the eleven judged wiki page links for this year/team
             combination, always in the same order
    """
    wiki_root = f"https://{year}.igem.org/Team:{team}"
    page_names = (
        "Attributions",
        "Description",
        "Contribution",
        "Engineering",
        "Collaborations",
        "Human_Practices",
        "Implementation",
        "Model",
        "Proof_Of_Concept",
        "Partnership",
        "Education",
    )
    return [f"{wiki_root}/{page_name}" for page_name in page_names]

In [4]:
def return_soup(link):
    """
    Download one web page and parse its HTML.

    :param link: link for website that has to be parsed
    :return: the page content parsed with BeautifulSoup's "html.parser"
    """
    return bs.BeautifulSoup(requests.get(link).content,"html.parser")

In [8]:
def get_number_titles_subtitles(soup):
    """
    Count the heading tags of the first three levels.

    :param soup: parsed website
    :return: tuple (number of h1 tags, number of h2 tags, number of h3 tags)
    """
    return tuple(len(soup.find_all(heading)) for heading in ("h1", "h2", "h3"))

In [10]:
def get_number_of_pictures(soup):
    """
    Count the <img> tags of a page.

    :param soup: parsed website
    :return: number of pictures in website
    """
    return len(soup.find_all("img"))

In [12]:
def get_number_of_pdfs(soup):
    """
    Count the <a> tags that mention ".pdf".

    A tag is counted when ".pdf" occurs anywhere in its string form, i.e. in
    its attributes or in its text.

    :param soup: parsed website
    :return: number of pdfs in website
    """
    return sum(".pdf" in str(anchor) for anchor in soup.find_all("a"))

In [14]:
def get_number_of_videos(soup):
    """
    Count the <video> tags of a page.

    :param soup: parsed website
    :return: number of videos in website
    """
    return sum(1 for _ in soup.find_all("video"))

In [16]:
def get_sentences(soup, min_paragraph_length=50):
    """
    Extract the sentences of the longer text paragraphs of a parsed website.

    The page text is split into paragraphs at blank lines ("\\n\\n"). Only
    paragraphs with more than ``min_paragraph_length`` characters are kept,
    and each of them is split into sentences at ".". Surrounding whitespace is
    removed from every sentence and empty fragments (e.g. the one after a
    paragraph's final period) are dropped, so they cannot distort
    per-sentence statistics.

    :param soup: parsed website
    :param min_paragraph_length: paragraphs with this many characters or
                                 fewer are ignored (default 50)
    :return: list of non-empty sentences in website
    """
    sentences = []
    for paragraph in soup.get_text().split("\n\n"):
        if len(paragraph) <= min_paragraph_length:
            continue
        for fragment in paragraph.split("."):
            sentence = fragment.strip()
            if sentence:
                sentences.append(sentence)
    return sentences

In [17]:
def get_number_links(soup):
    """
    Count the links of a page that point into iGEM team wikis and the links
    that point to other absolute web addresses.

    An <a> tag counts as internal when its href contains ".igem.org/Team:".
    Otherwise it counts as external when its href contains "http://" or
    "https://". Tags without href and all remaining hrefs (page anchors,
    "mailto:" addresses, relative paths, ...) are not counted.

    :param soup: parsed website
    :return: number of links that link inside the wiki and outside the wiki as tuple
    """
    number_internal = 0
    number_external = 0
    for link in soup.find_all("a"):
        target = link.get("href")
        if target is None:
            continue
        if ".igem.org/Team:" in target:
            number_internal += 1
        # Both substrings need their own membership test: a bare non-empty
        # string such as "http://" is always truthy and would classify every
        # remaining href as external.
        elif "http://" in target or "https://" in target:
            number_external += 1
        # Anything else is skipped silently; such links are common on every
        # page, so reporting them would flood the output of long runs.
    return(number_internal, number_external)

In [18]:
# Empty result table with one column per value collected for a team.
_master_columns = [
    "Year",
    "Team",
    "Winner",
    "Number Titles",
    "Number Subtitles",
    "Number SubSubtitles",
    "Number Pictures",
    "Number PDFs",
    "Number Videos",
    "Mean Characters per Sentence",
    "Mean Words per Sentence",
    "Number internal links",
    "Number extarnal links",
]
master_dataframe = pd.DataFrame(columns=_master_columns)

In [22]:
# Competition years to process: 2017 up to and including 2019.
years = list(range(2017, 2020))

In [21]:
# This takes a while...
# Column order of one result row; must match the order in which the values
# are put into each record below.
result_columns = ["Year","Team","Winner",
                  "Number Titles",
                  "Number Subtitles",
                  "Number SubSubtitles",
                  "Number Pictures",
                  "Number PDFs",
                  "Number Videos",
                  "Mean Characters per Sentence",
                  "Mean Words per Sentence",
                  "Number internal links",
                  "Number extarnal links"]

print("Calculating effort...")
# Download each year's team list once; it is needed both for the total used
# by the progress output and for the processing loop itself.
teams_per_year = [(year, get_teamnames(year)) for year in years]
total_number = sum(len(teams) for _, teams in teams_per_year)
print("Calculation finished.")
print(f"{total_number} entries to process.")

counter = 0
# One list per team. The rows are turned into a DataFrame once after the loop:
# DataFrame.append no longer exists in pandas 2.0 and copied the whole table
# for every added row. If the run is interrupted, the rows collected so far
# are still available in this list.
records = []
print("0%")
for year, teams in teams_per_year:
    for team in teams:
        winner = False
        h1 = 0
        h2 = 0
        h3 = 0
        pictures = 0
        pdfs = 0
        videos = 0
        all_sentences = []
        number_internal_links = 0
        number_external_links = 0
        # Sum the counts over all wiki pages of the team.
        for link in return_linklist(year,team):
            soup = return_soup(link)
            h1_temp, h2_temp, h3_temp = get_number_titles_subtitles(soup)
            h1 += h1_temp
            h2 += h2_temp
            h3 += h3_temp
            pictures += get_number_of_pictures(soup)
            pdfs += get_number_of_pdfs(soup)
            videos += get_number_of_videos(soup)
            number_internal_links_temp, number_external_links_temp = get_number_links(soup)
            number_internal_links += number_internal_links_temp
            number_external_links += number_external_links_temp
            all_sentences.extend(get_sentences(soup))
        if all_sentences:
            mean_characters_per_sentence = round(
                np.mean([len(sentence) for sentence in all_sentences]))
            mean_words_per_sentence = round(
                np.mean([len(str(sentence).split(" ")) for sentence in all_sentences]))
        else:
            # The mean of an empty list is NaN and cannot be rounded to an
            # integer; store NaN so that one team without text does not abort
            # the whole run.
            mean_characters_per_sentence = np.nan
            mean_words_per_sentence = np.nan
        records.append([year,team,winner,h1,h2,h3,
                        pictures,pdfs,videos,
                        mean_characters_per_sentence,
                        mean_words_per_sentence,
                        number_internal_links,
                        number_external_links])
        counter += 1
        if counter%10 == 0:
            print(f"{round((counter/total_number)*100)}%")

scraped_dataframe = pd.DataFrame(records, columns=result_columns)
if master_dataframe.empty:
    master_dataframe = scraped_dataframe
else:
    # Keep rows that are already in the table and add the new ones below them.
    master_dataframe = pd.concat([master_dataframe, scraped_dataframe],
                                 ignore_index=True)
print("100%")
print("Done.")

Calculating effort...
Calculation finished.
951 entries to process.
0%
1%
2%
3%
4%
5%
6%
7%
8%
9%
11%
12%
13%
14%
15%
16%
17%
18%
19%
20%
21%
22%
23%
24%
25%
26%
27%
28%
29%
30%
32%
33%
34%
35%
36%
37%
38%
39%
40%
41%
42%
43%
44%
45%
46%
47%
48%
49%
50%
52%
53%
54%
55%
56%
57%
58%
59%
60%
61%
62%
63%
64%
65%
66%
67%
68%
69%
70%
72%
73%
74%
75%
76%
77%
78%
79%
80%
81%
82%
83%
84%
85%
86%
87%
88%
89%
90%
91%
93%
94%
95%
96%
97%
98%
99%
100%
100%
Done.


In [37]:
# Store the scraped table as a semicolon-separated CSV file without the index column.
master_dataframe.to_csv(
    "master_dataframe.csv",
    sep=";",
    index=False,
)

NameError: name 'master_dataframe' is not defined

In [79]:
# Load the scraped table back from its semicolon-separated CSV file.
master_dataframe = pd.read_csv(
    "master_dataframe.csv",
    sep=";",
)

In [80]:
# Address of the downloadable iGEM team list, written as endpoint + query string.
additional_link = (
    "https://igem.org/Team_List.cgi"
    "?year=all&team_list_download=1&show_all=0"
)

In [81]:
# Download the team list and store it as "test.csv".
r = requests.get(additional_link, allow_redirects = True)
# Stop here on an HTTP error instead of saving an error page as if it were
# the CSV file.
r.raise_for_status()
# The context manager closes the file handle, so everything is written to
# disk before the file is read again.
with open("test.csv","wb") as csv_file:
    csv_file.write(r.content)

237999

In [82]:
# Load the downloaded team list and normalise its text:
#   1. remove spaces from the column names,
#   2. remove every space from the cell values,
#   3. turn underscores into spaces,
#   4. write "High school" as "Highschool".
additional_dataframe = (
    pd.read_csv("test.csv")
    .rename(columns=lambda column_name: column_name.replace(' ',''))
    .replace(" ","",regex=True)
    .replace("_"," ",regex=True)
    .replace("High school","Highschool",regex=True)
)
additional_dataframe.columns

Index(['TeamID', 'Team', 'Region', 'Country', 'Track', 'Section', 'Size',
       'Status', 'Year'],
      dtype='object')

In [86]:
# Add the team-metadata columns with placeholder values: every text column is
# filled with its own name, "Size" with NaN.
for placeholder_column in ("Region", "Country", "Track", "Section"):
    master_dataframe[placeholder_column] = placeholder_column
master_dataframe["Size"] = np.nan

In [88]:
# Attach region, country, track, section and size of every team by joining the
# scraped table with the team list on (Year, Team).
team_keys = ["Year", "Team"]
metadata_columns = ["Region", "Country", "Track", "Section", "Size"]

# If the team list contains a year/team combination more than once, use its
# first entry so that every scraped row gets at most one match.
team_metadata = (
    additional_dataframe[team_keys + metadata_columns]
    .drop_duplicates(subset=team_keys, keep="first")
)

# Drop metadata columns that already exist (e.g. placeholders) so that the
# merge fills them instead of creating suffixed duplicates. This also makes
# the cell safe to run more than once.
merged_dataframe = (
    master_dataframe
    .drop(columns=metadata_columns, errors="ignore")
    .merge(team_metadata, on=team_keys, how="left", indicator=True)
)

# A left join keeps teams without a match (their metadata is NaN) instead of
# aborting; report how many there are so that the gap does not go unnoticed.
number_unmatched = int((merged_dataframe["_merge"] == "left_only").sum())
if number_unmatched:
    print(f"{number_unmatched} teams have no entry in the team list; "
          "their metadata is left empty.")

master_dataframe = merged_dataframe.drop(columns="_merge")

In [90]:
# Store the table including the team metadata as a semicolon-separated CSV
# file without the index column.
master_dataframe.to_csv(
    "master_dataframe_extended.csv",
    sep=";",
    index=False,
)

In [None]:
# Load the extended table from its semicolon-separated CSV file.
master_dataframe_extended = pd.read_csv(
    "master_dataframe_extended.csv",
    sep=";",
)


In [None]:
def make_winner(year,team):
    """
    Changes specific cell in column "Winner" from False to True.

    Works in place on the module-level ``master_dataframe_extended``.

    :param year: year the team participated (converted with int())
    :param team: teamname (converted with str())
    :return: none
    :raises ValueError: if the table has no row for this year/team combination
    """
    is_team = ((master_dataframe_extended["Year"] == int(year)) &
               (master_dataframe_extended["Team"] == str(team)))
    if not is_team.any():
        # Fail loudly: a misspelled team name must not be skipped silently,
        # otherwise a winner would be missing from the data.
        raise ValueError(f"No entry found for team {team!r} in year {year}")
    # .loc accepts a boolean mask; .at is limited to one scalar row label and
    # cannot be used with the Index object that a mask lookup yields.
    master_dataframe_extended.loc[is_team,"Winner"] = True