In this notebook, we scraped data on historical daily records between Jan. 1, 2019 and July 31, 2021 from [Meteorological Service Singapore’s website](http://www.weather.gov.sg/climate-historical-daily/) for five randomly selected locations in different parts of Singapore, namely Ang Mo Kio, Changi, Clementi, Jurong West, and Newton. This notebook involves the use of Dataiku DSS and hence the import of the dataiku library and saving of dataframes to dataiku datasets. You can ignore the commands involving dataiku library and work using pandas dataframes using python as well. You can read more about the analysis of the scraped data [here](https://blog.dataiku.com/understand-weather-patterns-singapore). 

In [0]:
%pylab inline

In [1]:
import dataiku
from dataiku import pandasutils as pdu
import pandas as pd 

In [2]:
import calendar

In [3]:
import os

In [4]:
def curl_func(city,mth,year):
    city_str = city.replace(" ",'+')
    city_under = city.replace(" ", "")
    city_under = city_under.replace("(", "")
    city_under = city_under.replace(")", "")
    curl = f"""curl 'http://www.weather.gov.sg/wp-content/themes/wiptheme/page-functions/functions-climate-historical-daily-records.php' \
      -H 'Connection: keep-alive' \
      -H 'Accept: text/plain, */*; q=0.01' \
      -H 'X-Requested-With: XMLHttpRequest' \
      -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36' \
      -H 'Content-Type: application/x-www-form-urlencoded; charset=UTF-8' \
      -H 'Origin: http://www.weather.gov.sg' \
      -H 'Referer: http://www.weather.gov.sg/climate-historical-daily/' \
      -H 'Accept-Language: en-GB,en-US;q=0.9,en;q=0.8' \
      -H 'Cookie: TS01d95db5=01f663d54160de22b62b4df265e50aabb5b03374e389de591130026e39da7d1702ba442771df83a1818b41bf2ebce0d8e83bc3887a; _sp_ses.d8ee=*; AMCVS_DF38E5285913269B0A495E5A%40AdobeOrg=1; AMCV_DF38E5285913269B0A495E5A%40AdobeOrg=1075005958%7CMCIDTS%7C18865%7CMCMID%7C70327772364648750790801943301219885273%7CMCAAMLH-1630483004%7C3%7CMCAAMB-1630483004%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1629885404s%7CNONE%7CvVersion%7C4.4.1; _ga=GA1.3.1444993411.1629878205; _gid=GA1.3.436367427.1629878205; _gat=1; _sp_id.d8ee=9f7853bd-4cb0-4f23-b727-f5ef91bad838.1629878204.1.1629878205.1629878204.ef6d82e4-94ca-4724-884c-898503a40a92' \
      --data-raw 'cityname={city_str}&month={mth}&year={year}&redirectUrl=http%3A%2F%2Fwww.weather.gov.sg%2Fweather-world-forecast' \
      --compressed \
      --insecure
      """
    result = os.popen(curl).read()
    df = pd.read_html(result)
    df_final = df[0]
    df_final['year']=year
    df_final['Station']=city
    output_name = city_under+str(mth)+str(year)
    # Get a handle to the current project
    client = dataiku.api_client()
    project = client.get_project(dataiku.default_project_key())
    builder = project.new_managed_dataset_creation_helper(output_name)
    builder.with_store_into("filesystem_managed")
    builder.create() 
    # Write recipe outputs
    dataiku.Dataset(output_name).write_with_schema(df_final)

In [12]:
city="Jurong (West)"

In [13]:
for year in range(2019, 2020):
    if year <= 2020:
        for i in range(1,13):
            mth = calendar.month_name[i]
            curl_func(city,mth,year)
    else:
        for i in range(1,8):
            mth = calendar.month_name[i]
            curl_func(city,mth,year)

31 rows successfully written (O0qGimYDis)
28 rows successfully written (8lvtPiafU3)
31 rows successfully written (KkZribNrf5)
30 rows successfully written (yd9SoJSIgB)
31 rows successfully written (pe56J3NfIK)
30 rows successfully written (E6vduGP7Ia)
31 rows successfully written (UZQMpU7bKF)
31 rows successfully written (9uotTcShyc)
30 rows successfully written (t1fafW7GWS)
31 rows successfully written (82DIoInUef)
30 rows successfully written (rjlVywwFDJ)
31 rows successfully written (kJdmIwtMKr)


### Workings

In [14]:
calendar.month_name[1]

'January'

In [25]:
mth = calendar.month_name[1]
city = "Changi"
year = 2021

In [44]:
month_num =1

In [45]:
output_name = city+str(mth)+str(year)
#output_name = city+str(month_num)+str(year)

In [51]:
output_name

'ChangiJanuary2021'

In [30]:
curl = f"""curl 'http://www.weather.gov.sg/wp-content/themes/wiptheme/page-functions/functions-climate-historical-daily-records.php' \
  -H 'Connection: keep-alive' \
  -H 'Accept: text/plain, */*; q=0.01' \
  -H 'X-Requested-With: XMLHttpRequest' \
  -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36' \
  -H 'Content-Type: application/x-www-form-urlencoded; charset=UTF-8' \
  -H 'Origin: http://www.weather.gov.sg' \
  -H 'Referer: http://www.weather.gov.sg/climate-historical-daily/' \
  -H 'Accept-Language: en-GB,en-US;q=0.9,en;q=0.8' \
  -H 'Cookie: TS01d95db5=01f663d54160de22b62b4df265e50aabb5b03374e389de591130026e39da7d1702ba442771df83a1818b41bf2ebce0d8e83bc3887a; _sp_ses.d8ee=*; AMCVS_DF38E5285913269B0A495E5A%40AdobeOrg=1; AMCV_DF38E5285913269B0A495E5A%40AdobeOrg=1075005958%7CMCIDTS%7C18865%7CMCMID%7C70327772364648750790801943301219885273%7CMCAAMLH-1630483004%7C3%7CMCAAMB-1630483004%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1629885404s%7CNONE%7CvVersion%7C4.4.1; _ga=GA1.3.1444993411.1629878205; _gid=GA1.3.436367427.1629878205; _gat=1; _sp_id.d8ee=9f7853bd-4cb0-4f23-b727-f5ef91bad838.1629878204.1.1629878205.1629878204.ef6d82e4-94ca-4724-884c-898503a40a92' \
  --data-raw 'cityname={city}&month={mth}&year={year}&redirectUrl=http%3A%2F%2Fwww.weather.gov.sg%2Fweather-world-forecast' \
  --compressed \
  --insecure
  """
result = os.popen(curl).read()
pd.read_html(result)

[      Date  Daily Rainfall Total (mm)  Highest 30-min Rainfall (mm)  Highest 60-min Rainfall (mm)  Highest 120-min Rainfall (mm)  Mean Temperature (°C)  Maximum Temperature (°C)  Minimum Temperature (°C)  Mean Wind Speed (km/h) Max Wind Speed (km/h)
 0    1 Jan                      134.4                          21.2                          29.6                           45.4                   24.5                      26.0                      21.9                     9.6                  44.4
 1    2 Jan                      210.6                          23.6                          40.2                           61.6                   23.1                      24.1                      21.7                     7.9                  37.0
 2    3 Jan                        5.6                           1.0                           2.0                            2.6                   24.0                      25.0                      21.7                     4.5                  1

In [31]:
df = pd.read_html(result)

In [40]:
df[0]

Unnamed: 0,Date,Daily Rainfall Total (mm),Highest 30-min Rainfall (mm),Highest 60-min Rainfall (mm),Highest 120-min Rainfall (mm),Mean Temperature (°C),Maximum Temperature (°C),Minimum Temperature (°C),Mean Wind Speed (km/h),Max Wind Speed (km/h),year
0,1 Jan,134.4,21.2,29.6,45.4,24.5,26.0,21.9,9.6,44.4,2021
1,2 Jan,210.6,23.6,40.2,61.6,23.1,24.1,21.7,7.9,37.0,2021
2,3 Jan,5.6,1.0,2.0,2.6,24.0,25.0,21.7,4.5,14.8,2021
3,4 Jan,0.0,0.0,0.0,0.0,25.2,28.0,22.9,5.1,18.5,2021
4,5 Jan,4.2,4.2,4.2,4.2,26.3,31.6,23.8,8.8,29.6,2021
5,6 Jan,0.8,0.6,0.6,0.6,26.6,29.9,24.8,7.3,29.6,2021
6,7 Jan,7.2,2.6,3.6,4.4,24.3,25.3,23.7,5.8,18.5,2021
7,8 Jan,40.6,19.6,21.6,22.6,25.3,28.5,23.4,8.6,37.0,2021
8,9 Jan,23.8,9.8,11.8,13.2,25.6,28.7,23.9,7.9,37.0,2021
9,10 Jan,204.0,22.0,36.8,54.6,23.9,24.6,23.3,8.5,33.3,2021


In [0]:
df_final = df[0]
df_final['year']=year

In [58]:
csv_file = output_name+'.csv'
csv_file

'ChangiJanuary2021.csv'

In [82]:
df_final.head()

Unnamed: 0,Date,Daily Rainfall Total (mm),Highest 30-min Rainfall (mm),Highest 60-min Rainfall (mm),Highest 120-min Rainfall (mm),Mean Temperature (°C),Maximum Temperature (°C),Minimum Temperature (°C),Mean Wind Speed (km/h),Max Wind Speed (km/h),year
0,1 Jan,134.4,21.2,29.6,45.4,24.5,26.0,21.9,9.6,44.4,2021
1,2 Jan,210.6,23.6,40.2,61.6,23.1,24.1,21.7,7.9,37.0,2021
2,3 Jan,5.6,1.0,2.0,2.6,24.0,25.0,21.7,4.5,14.8,2021
3,4 Jan,0.0,0.0,0.0,0.0,25.2,28.0,22.9,5.1,18.5,2021
4,5 Jan,4.2,4.2,4.2,4.2,26.3,31.6,23.8,8.8,29.6,2021


In [59]:
#df_final.to_csv(csv_file,index=False)

In [52]:
# Get a handle to the current project
client = dataiku.api_client()
project = client.get_project(dataiku.default_project_key())

In [97]:
builder = project.new_managed_dataset_creation_helper(output_name)
builder.with_store_into("filesystem_managed")
builder.create() 

<dataikuapi.dss.dataset.DSSDataset at 0x117eee128>

In [98]:
# Write recipe outputs
dataiku.Dataset(output_name).write_with_schema(df_final)

31 rows successfully written (8TnKbHBldK)


In [90]:
handle = dataiku.Folder("Monthly_Data")

In [91]:
paths = handle.list_paths_in_partition()

In [92]:
paths

['/DAILYDATA_S24_202105.csv',
 '/DAILYDATA_S24_202106.csv',
 '/DAILYDATA_S24_202107.csv']

In [68]:
import shutil

In [84]:
path_to_folder = dataiku.Folder('Monthly_Data').get_path()

In [89]:
list_dir = os.listdir(path_to_folder)
list_dir

['DAILYDATA_S24_202105.csv',
 'DAILYDATA_S24_202106.csv',
 'DAILYDATA_S24_202107.csv']

In [86]:
list_dir[-1]

'Users'

In [80]:
file_path = os.path.join(path_to_folder, list_dir[0])
file_path

'/Users/huixiangchua/Library/DataScienceStudio/dss_home/managed_folders/WEATHERSG/21kNjNKL/ChangiJanuary2021.csv'

In [87]:
file_path = os.path.join(path_to_folder, list_dir[-1])
file_path

'/Users/huixiangchua/Library/DataScienceStudio/dss_home/managed_folders/WEATHERSG/21kNjNKL/Users'

In [83]:
#for deleting file
os.unlink(file_path)

In [88]:
#for deleting folder
shutil.rmtree(file_path)