#### Install relevant libraries

In [None]:
!pip install beautifulsoup4
!pip install requests

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd  # import pandas to store data in tables later on

#### Send an HTTP GET request to the URL to download the content of the webpage

In [2]:
url = "https://israelpalestinetimeline.org/children/"
response = requests.get(url)

#### We create a BeautifulSoup object using the BeautifulSoup constructor to parse webpage content

In [3]:
soup = BeautifulSoup(response.text, 'html.parser')

In [4]:
soup

<!DOCTYPE html>

<html lang="en-US" xmlns:addthis="https://www.addthis.com/help/api-spec" xmlns:fb="https://www.facebook.com/2008/fbml">
<head>
<!-- Global site tag (gtag.js) - Google Analytics -->
<!-- <script async src="https://www.googletagmanager.com/gtag/js?id=UA-131640601-1"></script>
	<script>
	  window.dataLayer = window.dataLayer || [];
	  function gtag(){dataLayer.push(arguments);}
	  gtag('js', new Date());

	  gtag('config', 'UA-131640601-1');
	</script> -->
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link href="https://gmpg.org/xfn/11" rel="profile"/>
<link href="https://israelpalestinetimeline.org/xmlrpc.php" rel="pingback"/>
<link href="https://israelpalestinenews.org" rel="canonical"/>
<meta content="index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1" name="robots">
<!-- This site is optimized with the Yoast SEO plugin v22.7 - https://yoast.com/wordpress/plugins/seo/ -->
<title>PHOTOS: Children

In [5]:
# Filter using find_all() and the class attribute
# The data we're interested in is in class="lcp_excerpt"
entries = soup.find_all(class_='lcp_excerpt')

In [6]:
entries

[<span class="lcp_excerpt">September 19, 2023: Rafat Omar Khamayseh, 15, was killed by Israeli soldiers who also killed two other Palestinians ...</span>,
 <span class="lcp_excerpt">September 09, 2023: Milad Monther Al-Ra’ey, 16, was killed by Israeli soldiers at the entrance to the Al-Arroub ...</span>,
 <span class="lcp_excerpt">September 05, 2023: Mohammad Yousef Ismael Zubeidat, 17, was killed by Israeli soldiers north of Jericho in the ...</span>,
 <span class="lcp_excerpt">August 30, 2023: Khaled Samer Al-Za’anin, 14, was shot and killed by Israeli forces at a light rail ...</span>,
 <span class="lcp_excerpt">August 22, 2023: Othman Atef Abu Kharj, 17, was shot and killed by Israeli troops, who shot him ...</span>,
 <span class="lcp_excerpt">August 15, 2023: Qusai Omar al-Walaji, 16, was shot and killed, before dawn during a military incursion into ...</span>,
 <span class="lcp_excerpt">August 6, 2023: Baraa Ahmad Al-Qerem, 16, was killed in an extrajudicial execution (assassinat

In [7]:
type(entries)

bs4.element.ResultSet

#### We can identify the Palestinians and Israelis by using the index. From entries and the website, we see that the Israelis come after Nachman Shmuel Mordoff (included). 

In [8]:
for i,row in enumerate(entries):
    print("row",i,"is",row)

row 0 is <span class="lcp_excerpt">September 19, 2023: Rafat Omar Khamayseh, 15, was killed by Israeli soldiers who also killed two other Palestinians ...</span>
row 1 is <span class="lcp_excerpt">September 09, 2023: Milad Monther Al-Ra’ey, 16, was killed by Israeli soldiers at the entrance to the Al-Arroub ...</span>
row 2 is <span class="lcp_excerpt">September 05, 2023: Mohammad Yousef Ismael Zubeidat, 17, was killed by Israeli soldiers north of Jericho in the ...</span>
row 3 is <span class="lcp_excerpt">August 30, 2023: Khaled Samer Al-Za’anin, 14, was shot and killed by Israeli forces at a light rail ...</span>
row 4 is <span class="lcp_excerpt">August 22, 2023: Othman Atef Abu Kharj, 17, was shot and killed by Israeli troops, who shot him ...</span>
row 5 is <span class="lcp_excerpt">August 15, 2023: Qusai Omar al-Walaji, 16, was shot and killed, before dawn during a military incursion into ...</span>
row 6 is <span class="lcp_excerpt">August 6, 2023: Baraa Ahmad Al-Qerem, 16, wa

#### The Israelis are from row 583 downwards as seen in the website (Nachman Shmuel Mordoff). 
Also, there is a 'Python way' of getting the index but I chose the lazy way today (the data isn't that large).

### Creating two dataframes
Let's separate the entries into Palestinians and Israelis based on the position of "Nachman Shmuel Mordoff" (with index 583) as the boundary, and put this info in a dataframe

In [9]:
import re # We'll use the regex module to search for specific patterns within the extracted text.

In [11]:
# Initialize empty lists to store extracted data
death_dates = []
names = []
ages = []
nationalities = []
boundary_index = 583

# Extract data for Palestinians
for entry in entries[:boundary_index]: # Iterate through the all the rows up till row 582
    text = entry.get_text()
    matches = re.findall(r'(\w+ \d{1,2}, \d{4}): (.+), (\d+)', text)
    # (\w+ \d{1,2}, \d{4}): Matches a date pattern like "Month Day, Year"
    # (.+): Matches one or more of any character (To get the name)
    # (\d+): Matches one or more digits (To get the age)

    if matches:
        death_date, name, age = matches[0]
        nationality = "Palestinian"

        death_dates.append(death_date)
        names.append(name)
        ages.append(age)
        nationalities.append(nationality)

# Extract data for Israelis
for entry in entries[boundary_index:]: # Iterate through the all the rows from row 583 onwards
    text = entry.get_text()
    matches = re.findall(r'(\w+ \d{1,2}, \d{4}): (.+), (\d+)', text)

    if matches:
        death_date, name, age = matches[0]
        nationality = "Israeli"

        death_dates.append(death_date)
        names.append(name)
        ages.append(age)
        nationalities.append(nationality)

    # Create a DataFrame
    data = {
        'death_date': death_dates,
        'name': names,
        'age': ages,
        'Nationality': nationalities
    }

    israeli_palestinian_df = pd.DataFrame(data)

# Print the DataFrame
israeli_palestinian_df.head(10)


Unnamed: 0,death_date,name,age,Nationality
0,"September 19, 2023",Rafat Omar Khamayseh,15,Palestinian
1,"September 09, 2023",Milad Monther Al-Ra’ey,16,Palestinian
2,"September 05, 2023",Mohammad Yousef Ismael Zubeidat,17,Palestinian
3,"August 30, 2023",Khaled Samer Al-Za’anin,14,Palestinian
4,"August 22, 2023",Othman Atef Abu Kharj,17,Palestinian
5,"August 15, 2023",Qusai Omar al-Walaji,16,Palestinian
6,"August 6, 2023",Baraa Ahmad Al-Qerem,16,Palestinian
7,"July 26, 2023",Fares Sharhabeel Abu Samra,14,Palestinian
8,"July 21, 2023",Mohammad Fouad Atta Al-Bayed,17,Palestinian
9,"July 03, 2023",Mustafa Nidal Al-Qassem,17,Palestinian


In [12]:
israeli_palestinian_df.tail(14)

Unnamed: 0,death_date,name,age,Nationality
485,"June 20, 2023",Nachman Shmuel Mordoff,17,Israeli
486,"April 07, 2023",Rina Dee,15,Israeli
487,"February 11, 2023",Asher Menachem Paley,8,Israeli
488,"February 10, 2023",Yaakov Israel Paley,6,Israeli
489,"January 27, 2023",Asher Natan,14,Israeli
490,"November 23, 2022",Aryeh Shechopek,16,Israeli
491,"May 12, 2021","Nadeen Awad, 16, was killed along with her fat...",52,Israeli
492,"May 12, 2021",Ido Avigal,5,Israeli
493,"August 23, 2019","Rina Shnerb, 17, from Lod, was killed, while h...",19,Israeli
494,"June 30, 2016",Hallel Yaffe Ariel,13,Israeli


#### We can also identify the Palestinians and Israelis by listing out the names of all Israelis affected since they are quite few 

In [13]:
# List of Israeli names
israeli_names = [
    "Nachman Shmuel Mordoff",
    "Rina Dee",
    "Asher Menachem Paley",
    "Yaakov Israel Paley",
    "Asher Natan",
    "Aryeh Shechopek",
    "Nadeen Awad",
    "Ido Avigal",
    "Rina Shnerb",
    "Hallel Yaffe Ariel",
    "Chaya Zissel Braun",
    "Daniel Tregerman",
    "Naftali Frankel",
    "Gilad Shaar",
]

# Initialize lists to store extracted data
death_dates = []
names = []
ages = []
nationalities = []


for entry in entries:
    text = entry.get_text()
    matches = re.findall(r'(\w+ \d{1,2}, \d{4}): (.+), (\d+)', text)
    
    if matches:
        death_date, name, age = matches[0]
        
        if any(israeli_name in name for israeli_name in israeli_names):
            nationality = "Israeli"
        else:
            nationality = "Palestinian"
        
        death_dates.append(death_date)
        names.append(name)
        ages.append(age)
        nationalities.append(nationality)

    # Create a DataFrame
    data = {
        'death_date': death_dates,
        'name': names,
        'age': ages,
        'Nationality': nationalities
    }

    israeli_palestinian_df2 = pd.DataFrame(data)
    
    
israeli_palestinian_df2.head()

Unnamed: 0,death_date,name,age,Nationality
0,"September 19, 2023",Rafat Omar Khamayseh,15,Palestinian
1,"September 09, 2023",Milad Monther Al-Ra’ey,16,Palestinian
2,"September 05, 2023",Mohammad Yousef Ismael Zubeidat,17,Palestinian
3,"August 30, 2023",Khaled Samer Al-Za’anin,14,Palestinian
4,"August 22, 2023",Othman Atef Abu Kharj,17,Palestinian


In [14]:
israeli_palestinian_df2.tail(14)

Unnamed: 0,death_date,name,age,Nationality
485,"June 20, 2023",Nachman Shmuel Mordoff,17,Palestinian
486,"April 07, 2023",Rina Dee,15,Israeli
487,"February 11, 2023",Asher Menachem Paley,8,Israeli
488,"February 10, 2023",Yaakov Israel Paley,6,Palestinian
489,"January 27, 2023",Asher Natan,14,Israeli
490,"November 23, 2022",Aryeh Shechopek,16,Israeli
491,"May 12, 2021","Nadeen Awad, 16, was killed along with her fat...",52,Israeli
492,"May 12, 2021",Ido Avigal,5,Israeli
493,"August 23, 2019","Rina Shnerb, 17, from Lod, was killed, while h...",19,Israeli
494,"June 30, 2016",Hallel Yaffe Ariel,13,Israeli


- 2 mistakes in the nationality column among the Israeli rows which can be manually changed.
- I also don't know why we are getting just 495 rows instead of 596 which was what was gotten when I iterated through entries.Something to be looked into.
- death_date column can be further split if needed.