In [1]:
import pandas as pd
import requests
import pickle
import re
import os

from bs4 import BeautifulSoup
from tqdm import * 

In [2]:
# Directory to save the data in
path_data = r"C:\Users\Nasser Benab\Documents\git\data"

# 1. Sunrise and sunset 

In this section, we extract the sunrise ("fajr") and sunset ("maghrib") times for the countries available in this [website](https://www.islamicfinder.org/world/). 

In [3]:
page = requests.get("https://www.islamicfinder.org/world/")

In [4]:
# Create an instance of the BeautifulSoup class to parse our document
soup = BeautifulSoup(page.content, "html.parser")

In [5]:
# Print the HTML content of the page 
print(soup.prettify())

<!DOCTYPE html>
<html class="no-js" lang="en">
 <head>
  <script>
   if (self != top || window != top) {
		window.location = "https://islamicfinder.org/access-denied/"
	}
  </script>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="ie=edge" http-equiv="x-ua-compatible">
   <meta content="width=device-width, initial-scale=1.0, minimum-scale=1.0, maximum-scale=1.0" name="viewport">
    <meta content="uu8dgWFUr--b0P1DZ-2JDOb5_vVn03Ek3V-r2s6PFIw" name="google-site-verification"/>
    <meta content="max-age=0" http-equiv="cache-control"/>
    <meta content="no-cache" http-equiv="cache-control"/>
    <meta content="0" http-equiv="expires"/>
    <meta content="Tue, 01 Jan 1980 1:00:00 GMT" http-equiv="expires"/>
    <meta content="no-cache" http-equiv="pragma"/>
    <title>
     Accurate Prayer Times across the world | IslamicFinder
    </title>
    <meta content="Prayer Times, Salah times, Athan (Azan, Adhan, Salah, Salat) &amp; Qiblah. Prayer times of 

In [6]:
# The countries are links in the page (Ex: <a href="/world/south-africa/" title="South Africa">South Africa</a>)
a_tags = soup.find_all("a", href = re.compile("/world/"))

countries_islamicfinder = [instance.get_text() for instance in a_tags 
                           if "title" in instance.attrs.keys()
                           and instance["title"] == instance.get_text() 
                           and instance["title"] != "World"]

# Countries as in the links 
countries_links_islamicfinder = [instance["href"].split("/")[2] for instance in a_tags 
                                       if "title" in instance.attrs.keys()
                                       and instance["title"] == instance.get_text() 
                                       and instance["title"] != "World"]

# Number of countries in islamicfinder
len(countries_islamicfinder)

249

In [7]:
# Subset of the countries
print(countries_islamicfinder[:5])
print(countries_links_islamicfinder[:5])

['Algeria', 'Angola', 'Benin', 'Botswana', 'Burkina Faso']
['algeria', 'angola', 'benin', 'botswana', 'burkina-faso']


In [8]:
# Dictionaries to map each country to its link name and vice versa
country_to_countrylink = dict((zip(countries_islamicfinder, countries_links_islamicfinder)))
countrylink_to_country = dict((zip(countries_links_islamicfinder, countries_islamicfinder)))

Now, let's scrap the "fajr" and "maghrib" times for each country. The information is updated every day.

Each link (Ex: https://www.islamicfinder.org/world/south-africa/) shows the prayer times for the top cities in the country.
We will extract all of them and try to keep only the capitals.

In [9]:
# Dictionary to store the "fajr" and "maghrib" times for each country 
d = {}
for countrylink in tqdm(countries_links_islamicfinder):
    d[countrylink] = {}
    # Country specific link
    page_country = requests.get("https://www.islamicfinder.org/world/{}/".format(countrylink))
    soup_country = BeautifulSoup(page_country.content, "html.parser")
    # Each city (link) data is a row of a table 
    city_tags = [city_a_tag.parent.parent for city_a_tag in soup_country.select("tr td a")]
    for city_tag in city_tags:        
        city_td_tags = city_tag.find_all("td")
        # The city name is in the first cell of the row
        city_name = city_td_tags[0].get_text()
        # The "fajr" time is in the second cell of the row
        fajr_time = city_td_tags[1].get_text()
        # The "maghrib" time is in the sixth cell of the row
        maghrib_time = city_td_tags[5].get_text()
        # Add the times to the dictionary
        d[countrylink][city_name] = (fajr_time, maghrib_time)

100%|██████████| 249/249 [07:56<00:00,  1.84s/it]


In [10]:
d

{'afghanistan': {'Ghazni': ('03:05 AM', '07:03 PM'),
  'Herat': ('03:26 AM', '07:30 PM'),
  'Jalalabad': ('02:53 AM', '06:58 PM'),
  'Kabul': ('02:57 AM', '07:03 PM'),
  'Kandahar': ('03:24 AM', '07:10 PM'),
  'Mazar-i-Sharif': ('02:53 AM', '07:17 PM'),
  'Qandahar': ('03:24 AM', '07:10 PM')},
 'aland': {},
 'albania': {'Durres': ('03:04 AM', '08:11 PM'),
  'Durrës': ('03:05 AM', '08:11 PM'),
  'Shkoder': ('02:59 AM', '08:13 PM'),
  'Tirana': ('03:04 AM', '08:10 PM'),
  'Tirane': ('03:04 AM', '08:10 PM'),
  'Vlore': ('03:12 AM', '08:08 PM'),
  'Vlorë': ('03:11 AM', '08:08 PM')},
 'algeria': {'Algiers': ('03:40 AM', '08:04 PM'),
  'Annaba': ('03:20 AM', '07:45 PM'),
  'Blida': ('03:42 AM', '08:04 PM'),
  'Béjaïa': ('03:32 AM', '07:55 PM'),
  'Constantine': ('03:28 AM', '07:48 PM'),
  'Oran': ('04:00 AM', '08:15 PM'),
  'Skikda': ('03:24 AM', '07:49 PM'),
  'Sétif': ('03:34 AM', '07:53 PM'),
  'Tlemcen': ('04:07 AM', '08:16 PM')},
 'american-samoa': {'American Samoa': ('05:24 AM', '05:58

In [11]:
# Save the data to a pickle file
pickle.dump(d, open(os.path.join(path_data, "sunrise_sunset.p"), "wb"))

In [12]:
# Read data saved on the 31th of May 2017 (~5th day of Ramadan)
d = pickle.load(open(os.path.join(path_data, "sunrise_sunset_31_05_2017.p"), "rb"))

# 2. Countries and their capitals

I have decided to keep one city per country. It had to be the capital. 

Thus, I have scrapped a wikipedia page showing a list of countries and their capitals:

In [13]:
page_capital = requests.get("https://en.wikipedia.org/wiki/List_of_national_capitals_in_alphabetical_order")

In [14]:
soup_capital = BeautifulSoup(page_capital.content, "html.parser")

In [15]:
# Table to scrap
table_tag = soup_capital.find(class_ = "wikitable sortable")

In [16]:
# Rows of the table except the first (the header)
table_tr_tags = table_tag.find_all("tr")[1:]

In [17]:
# Dictionary to store the capital for each country in the wikipedia's page
country_capital_dic = {} 
for i in range(len(table_tr_tags)):
    # The country is in bold in a cell
    country = table_tr_tags[i].select("td b a")[0].get_text()
    # By using ">" (<a> tags directly beneath <td> tags), we select the first cell 
    # of the row (with the capitals) and discard the second cell (with the country)
    capital = table_tr_tags[i].select("td > a")[0].get_text()    
    country_capital_dic[country] = capital

> It can happen that a country has more than one capital. We have selected the first one in the cell.

In [18]:
# Countries from Wikipedia
countries_wikipedia = list(country_capital_dic.keys())
len(countries_wikipedia)

250

# 3. Putting it all together 

For now, we have a number of countries from [*islamicfinder*](https://www.islamicfinder.org/world/) and others from [*wikipedia*](https://en.wikipedia.org/wiki/List_of_national_capitals_in_alphabetical_order). Do they match ?

The same goes for the capitals. But first, is there necessarly the capital in the ["Top cities" of *islamicfinder*](https://www.islamicfinder.org/world/south-africa/) ?

## 3.1. Countries 

In [19]:
# Countries in islamicfinder but not in wikipedia
l = [country for country in countries_islamicfinder if country not in countries_wikipedia]
l

['Congo',
 'Ivory Coast',
 'Antarctica',
 'French Southern Territories',
 'Asia',
 'British Indian Ocean Territory',
 'Cocos [Keeling] Islands',
 'Hashemite Kingdom of Jordan',
 'Macao',
 'Myanmar [Burma]',
 'Palestine',
 'Republic of Korea',
 'Åland',
 'Europe',
 'Macedonia',
 'Republic of Lithuania',
 'Republic of Moldova',
 'Slovak Republic',
 'Svalbard and Jan Mayen',
 'Bonaire, Sint Eustatius, and Saba',
 'Saint-Barthélemy',
 'U.S. Virgin Islands',
 'East Timor',
 'Pitcairn Islands',
 'Tokelau',
 'U.S. Minor Outlying Islands']

I have manually checked the above list to see if the country was just written differently in wikipedia. If not, the country is discarded:

In [20]:
# Countries to be discarded
countries_discarded = ["Antarctica", "French Southern Territories", "Asia", "British Indian Ocean Territory", "Macao",
                      "Åland", "Europe", "Svalbard and Jan Mayen", "Bonaire, Sint Eustatius, and Saba", "Tokelau",
                      "U.S. Minor Outlying Islands"]

In [21]:
# Delete information on these countries
for country in countries_discarded:
    if country in country_to_countrylink.keys():
        del d[country_to_countrylink[country]]

Here, the dictionary maps the country as written in islamicfinder to its equivalent in wikipedia:

In [22]:
islamicfinder_to_wikipedia_dic = {"Congo": "Democratic Republic of the Congo", "Ivory Coast": "Côte d'Ivoire", 
                   "Cocos [Keeling] Islands": "Cocos (Keeling) Islands", "Hashemite Kingdom of Jordan": "Jordan",
                   "Myanmar [Burma]": "Myanmar", "Palestine": "State of Palestine", "Republic of Korea": "South Korea",
                   "Macedonia": "Republic of Macedonia", "Republic of Lithuania": "Lithuania", "Republic of Moldova": "Moldova",
                   "Slovak Republic": "Slovakia", "Saint-Barthélemy": "Saint Barthélemy", 
                   "U.S. Virgin Islands": "United States Virgin Islands", "East Timor": "East Timor (Timor-Leste)", 
                   "Pitcairn Islands": "Pitcairn"}
# Reverse dictionary
wikipedia_to_islamicfinder_dic = {value: key for (key, value) in islamicfinder_to_wikipedia_dic.items()}

The country-capital dictionary built from wikipedia will now be consistent with the information on islamicfinder:

In [23]:
# Replace countries in the country-capital dictionary by their names in islamicfinder
for k, v in wikipedia_to_islamicfinder_dic.items():
    country_capital_dic[v] = country_capital_dic.pop(k)

## 3.2. Capitals 

Let's check if every capital in wikipedia is present in the data scrapped from islamicfinder. If not, either it has been written differently in islamicfinder or it just does not exist. Thus, we will replace the capital in the country-capital dictionary from wikipedia, either by its islamicfinder spelling, or by another city:

In [24]:
# Countries that don't have information on their capitals in islamicfinder
d_capital = d.copy()
i = 1
for countrylink in d.keys():
    country = countrylink_to_country[countrylink]
    # If the wikipedia capital is not in islamicfinder data
    if country_capital_dic[country] not in d[countrylink].keys():
        print("{}. Country: ".format(i), country)
        print("Capital in wikipedia: ", country_capital_dic[country])
        print("Available cities in islamicfinder: ", set(d[countrylink].keys()))
        print("\n")
        i = i + 1

1. Country:  Republic of Korea
Capital in wikipedia:  Seoul
Available cities in islamicfinder:  {'Suwon', 'Pusan', 'Daegu', 'Masan'}


2. Country:  Cocos [Keeling] Islands
Capital in wikipedia:  West Island
Available cities in islamicfinder:  {'Home Island'}


3. Country:  Colombia
Capital in wikipedia:  Bogotá
Available cities in islamicfinder:  {'Medellín', 'Ibague', 'Cúcuta', 'Cali', 'Pereira', 'Barranquilla', 'Medellin', 'Cucuta', 'Bogota D.E.'}


4. Country:  Mongolia
Capital in wikipedia:  Ulaanbaatar
Available cities in islamicfinder:  {'Darhan', 'Ulan Bator'}


5. Country:  Federated States of Micronesia
Capital in wikipedia:  Palikir
Available cities in islamicfinder:  set()


6. Country:  Chad
Capital in wikipedia:  N'Djamena
Available cities in islamicfinder:  {'Moundou', 'Abéché', 'Lai', 'Abeche', 'Koumra', 'Doba', 'Kelo'}


7. Country:  El Salvador
Capital in wikipedia:  San Salvador
Available cities in islamicfinder:  {'San Miguel', 'Santa Ana'}


8. Country:  Singapore
C

This is not the funniest part... We go over the above inconsistencies one by one: 

In [25]:
# Case 1
# Capitals with a different spelling in islamicfinder 
# {country: capital with the islamicfinder spelling}
country_capital_islamicfinder_dic = {"Georgia": "Tblisi", "Hong Kong": "Kowloon", "Colombia": "Bogota D.E.", 
                                 "Hungary": "Kecskemét", "Iceland": "Reykjavik",
                                 "Republic of Moldova": "Chişinău", "New Caledonia": "Noumea", 
                                 "Saint Pierre and Miquelon": "Saint-Pierre", 
                                 "Marshall Islands": "Delap", "Mongolia": "Ulan Bator", "Yemen": "Sanaa", 
                                 "Monaco": "Monaco-Ville", "Mexico": "Mexico",
                                 "Tonga": "Nuku'alofa", "Jersey": "Saint Helier", "Paraguay": "Asuncion", 
                                 "Guernsey": "Saint Peter Port"} 

In [26]:
# Case 2
# Choose another city for islamicfinder countries with no data on their capitals
country_city_islamicfinder_dic = {"Hong Kong": "Kowloon", "Hungary": "Kecskemét", "Nigeria": "Zaria", "Benin": "Cotonou", 
                              "South Georgia and the South Sandwich Islands": "Grytviken", "Grenada": "Grenville", 
                              "Saint Helena": "Georgetown", "Costa Rica": "Limón", "Chad": "Doba", 
                              "Cocos [Keeling] Islands": "Home Island", "Madagascar": "Toamasina",
                              "Antigua and Barbuda": "Piggotts", "Ivory Coast": "Abidjan", "Haiti": "Cap-Haïtien", 
                              "Republic of Lithuania": "Kaunas", "Myanmar [Burma]": "Taunggyi", "Republic of Korea": "Suwon", 
                              "El Salvador": "Santa Ana", "Singapore": "Sentosa", "Serbia": "Čačak", "Palau": "Koror", 
                              "Nauru": "Nauru", "Honduras": "Danli", "Sri Lanka": "Colombo", "Cape Verde": "Mindelo",
                              "Puerto Rico": "Bayam├â┬│n"}
 

In [27]:
country_newcapital_dic = {**country_capital_islamicfinder_dic, **country_city_islamicfinder_dic}

In [28]:
# Case 3
# Countries with no information on islamicfinder
countries_noinfo = ["Curaçao", "Saint-Barthélemy", "Montenegro", "Federated States of Micronesia", "Sint Maarten"]

In [29]:
# Dictionary with the final data
d_capital = d.copy()

for countrylink in d.keys():
    country = countrylink_to_country[countrylink]
    # If the wikipedia capital is not in islamicfinder data
    if country_capital_dic[country] not in d[countrylink].keys():
        # Delete countries with no information on islamicfinder
        if country in countries_noinfo:
            del d_capital[countrylink]
        # Replace the wikipedia capital either by its islamicfinder spelling 
        # or by another city as choosen in country_city_islamicfinder_dic
        elif country in country_newcapital_dic.keys():
            country_capital_dic[country] = country_newcapital_dic[country]       
            

# 4. Fasting data 

Now, we can access the capital (or another city) for every country in islamicfinder

Note that I have manually added the data for the US because Washington D.C was not among the "Top cities" in islamicfinder:

In [30]:
# Dictionary with "fajr" and "maghrib" prayer times for each country 
d_country_prayertimes = {**{countrylink_to_country[country_link]: 
                            d_capital[country_link][country_capital_dic[countrylink_to_country[country_link]]] 
                            for country_link in d_capital.keys() 
                            if country_link != "united-states"}, 
                         **{"United States": ("04:13 AM", "08:28 PM")}}

In [31]:
# Dataframe (country as index)
data = pd.DataFrame.from_dict(d_country_prayertimes).transpose()
data.columns = ["fajr_time", "maghrib_time"]

In [32]:
data.head()

Unnamed: 0,fajr_time,maghrib_time
Afghanistan,02:59 AM,07:01 PM
Albania,03:06 AM,08:07 PM
Algeria,03:42 AM,08:02 PM
American Samoa,05:26 AM,06:01 PM
Andorra,04:11 AM,09:25 PM


In [33]:
# Number of countries in the final data
data.shape[0]

233

In [34]:
# Hours of fasting
data["fasting_time"] = pd.to_datetime(data.maghrib_time) - pd.to_datetime(data.fajr_time)

In [35]:
# Top 10 countries where people fast the longest
data.sort_values(by = "fasting_time", ascending = False)[:10]

Unnamed: 0,fajr_time,maghrib_time,fasting_time
Greenland,02:10 AM,11:32 PM,21:22:00
Iceland,02:11 AM,11:31 PM,21:20:00
Faroe Islands,02:22 AM,10:58 PM,20:36:00
Finland,02:24 AM,10:31 PM,20:07:00
Norway,02:22 AM,10:23 PM,20:01:00
Estonia,02:27 AM,10:24 PM,19:57:00
Sweden,01:55 AM,09:48 PM,19:53:00
Latvia,02:37 AM,10:06 PM,19:29:00
Russia,01:47 AM,09:03 PM,19:16:00
Denmark,02:27 AM,09:41 PM,19:14:00


In [36]:
# Top 10 countries where people fast the shortest
data.sort_values(by = "fasting_time", ascending = True)[:10]

Unnamed: 0,fajr_time,maghrib_time,fasting_time
South Georgia and the South Sandwich Islands,06:24 AM,04:15 PM,09:51:00
Falkland Islands,06:45 AM,04:52 PM,10:07:00
New Zealand,06:00 AM,05:00 PM,11:00:00
Australia,05:34 AM,05:00 PM,11:26:00
Uruguay,06:15 AM,05:42 PM,11:27:00
Argentina,06:23 AM,05:51 PM,11:28:00
Chile,06:11 AM,05:43 PM,11:32:00
Lesotho,05:33 AM,05:20 PM,11:47:00
Norfolk Island,05:10 AM,04:59 PM,11:49:00
Swaziland,05:14 AM,05:12 PM,11:58:00


In [37]:
data.head()

Unnamed: 0,fajr_time,maghrib_time,fasting_time
Afghanistan,02:59 AM,07:01 PM,16:02:00
Albania,03:06 AM,08:07 PM,17:01:00
Algeria,03:42 AM,08:02 PM,16:20:00
American Samoa,05:26 AM,06:01 PM,12:35:00
Andorra,04:11 AM,09:25 PM,17:14:00


In [38]:
# Fasting time in hours
data["fasting_time_h"] = data["fasting_time"].astype("timedelta64[s]") / 3600

In [39]:
data.head()

Unnamed: 0,fajr_time,maghrib_time,fasting_time,fasting_time_h
Afghanistan,02:59 AM,07:01 PM,16:02:00,16.033333
Albania,03:06 AM,08:07 PM,17:01:00,17.016667
Algeria,03:42 AM,08:02 PM,16:20:00,16.333333
American Samoa,05:26 AM,06:01 PM,12:35:00,12.583333
Andorra,04:11 AM,09:25 PM,17:14:00,17.233333


In [40]:
# Save the data
data.to_excel(os.path.join(path_data, "fasting.xlsx"))

> And finally, [here](https://public.tableau.com/profile/nasser.benab#!/vizhome/Wheredopeoplefastthelongestin2017/FastingMap) is a **map** created with **Tableau** showing the fasting times for this data scrapped around the 5th day of Ramadan !