# DIGI405 - Web Scraping Lab


### Extracting text files from webscraper.io CSV results

In [78]:

import re
import pandas as pd
# Load the webscraper.io CSV export into a pandas dataframe named `df`.
# The file is opened explicitly so the utf-8 encoding is applied when reading.
with open('parl.csv', encoding='utf-8') as f:  # change the file name for your file
    df = pd.read_csv(f)  # read csv into a pandas dataframe
def add_leading_zero(string):
  """
  Left-pads a string with "0" so that it is at least two characters long.

  Strings that already have two or more characters are returned unchanged.

  Args:
      string (str): The text to pad, e.g. a day of the month such as "7".

  Returns:
      str: The input padded to a width of two (e.g. "7" -> "07"), or the
      input itself when no padding is needed.
  """
  already_wide_enough = len(string) >= 2
  return string if already_wide_enough else string.rjust(2, "0")
def extract_substring(text):
  """Returns the piece of text between the last "_" and the final "/metadata".

  Everything from the last occurrence of "/metadata" onwards is discarded, and
  whatever follows the last underscore in the remainder is returned. If the
  remainder contains no underscore, the whole remainder is returned.

  Args:
    text (str): The input text string.

  Returns:
    str: The extracted piece, or an empty string if "/metadata" is not found.
  """
  head, marker, _ = text.rpartition("/metadata")
  if not marker:
    return ""  # "/metadata" does not occur anywhere in the text
  return head.rpartition("_")[2]  # keep only what follows the last "_"
def extract_number_from_url(url):
  """Finds the first run of eight digits that directly follows an underscore.

  Args:
    url (str): The URL to search.

  Returns:
    str: The eight digits (without the underscore) if such a run exists,
    otherwise an empty string.
  """
  found = re.search(r"_(\d{8})", url)
  if found is None:
    return ""
  return found.group(1)
def month_to_num(month):
  """
  Maps a three-letter English month abbreviation (e.g. "Feb") to its two-digit number (e.g. "02").

  The lookup ignores letter case: the input is capitalized before comparison,
  so "feb" and "FEB" are both accepted.

  Args:
      month (str): The month abbreviation to convert.

  Returns:
      str: The two-digit month number, or None if the abbreviation is not recognised.
  """
  abbreviations = (
      "Jan", "Feb", "Mar", "Apr", "May", "Jun",
      "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
  )
  wanted = month.capitalize()  # normalise case before comparing
  for position, abbreviation in enumerate(abbreviations, start=1):
    if abbreviation == wanted:
      return f"{position:02d}"
  return None

def convert_date_format(date_string):
  """
  Converts a date string in the format "Sitting date: DD MMM YYYY" to "YYYY-MM-DD".

  The pattern is matched at the start of the string only; any text after the
  four-digit year is ignored.

  Args:
      date_string (str): The date string to convert.

  Returns:
      str: The converted date string in YYYY-MM-DD format, or None if the
      string does not match the expected format or the month abbreviation is
      not a recognised month.
  """
  regex = r"Sitting date: (\d+)\s+([A-Z][a-z]{2})\s+(\d{4})"
  match = re.match(regex, date_string)
  if not match:
    return None  # Handle cases where the format doesn't match

  day, month, year = match.groups()
  month_number = month_to_num(month)
  if month_number is None:
    # The regex accepts any capitalised three-letter token (e.g. "Foo"), so the
    # month lookup can fail; report that as an invalid format instead of
    # raising a TypeError when concatenating None below.
    return None

  return year + "-" + month_number + "-" + add_leading_zero(day)  # Pad day with leading zero if necessary
def isNaN(num):
    """Reports whether a value compares unequal to itself.

    A float NaN is the usual value for which this holds, so the check picks
    out NaN while returning False for ordinary numbers and strings.

    Args:
        num: The value to test.

    Returns:
        The result of comparing the value with itself using ``!=``.
    """
    differs_from_itself = num != num
    return differs_from_itself
df.head(247) # display up to the first 247 rows of the dataframe (the notebook may truncate long output)

Unnamed: 0,web-scraper-order,web-scraper-start-url,next,links,texts,data,title,metadata,metadata-href
0,1711181832-1,https://www.parliament.nz/en/pb/hansard-debate...,,,Get notifications\n\t\t\t\n\n\t\t\t\n\n\t\n\t\...,,March 2024,,
1,1711181834-2,https://www.parliament.nz/en/pb/hansard-debate...,https://www.parliament.nz/en/pb/hansard-debate...,,Get notifications\n\t\t\t\n\n\t\t\t\n\n\t\n\t\...,,March 2024,,
2,1711181836-3,https://www.parliament.nz/en/pb/hansard-debate...,https://www.parliament.nz/en/pb/hansard-debate...,,Get notifications\n\t\t\t\n\n\t\t\t\n\n\t\n\t\...,,March 2024,,
3,1711181838-4,https://www.parliament.nz/en/pb/hansard-debate...,https://www.parliament.nz/en/pb/hansard-debate...,,Get notifications\n\t\t\t\n\n\t\t\t\n\n\t\n\t\...,,March 2024,,
4,1711181840-5,https://www.parliament.nz/en/pb/hansard-debate...,https://www.parliament.nz/en/pb/hansard-debate...,,Get notifications\n\t\t\t\n\n\t\t\t\n\n\t\n\t\...,,March 2024,,
...,...,...,...,...,...,...,...,...,...
97,1711182132-98,https://www.parliament.nz/en/pb/hansard-debate...,,https://www.parliament.nz/en/pb/hansard-debate...,Criminal Activity Intervention Legislation Bil...,Sitting date: 28 Mar 2023,Criminal Activity Intervention Legislation Bil...,Metadata,https://www.parliament.nz/en/document/HansS_20...
98,1711182135-99,https://www.parliament.nz/en/pb/hansard-debate...,,https://www.parliament.nz/en/pb/hansard-debate...,Oral Questions — Questions to Ministers\n\t\t\...,Sitting date: 25 Jul 2023,Oral Questions — Questions to Ministers,Metadata,https://www.parliament.nz/en/document/HansS_20...
99,1711182142-100,https://www.parliament.nz/en/pb/hansard-debate...,,https://www.parliament.nz/en/pb/hansard-debate...,Oral Questions — Questions to Ministers\n\t\t\...,Sitting date: 27 Feb 2024,Oral Questions — Questions to Ministers,Metadata,https://www.parliament.nz/en/document/HansS_20...
100,1711182145-101,https://www.parliament.nz/en/pb/hansard-debate...,,https://www.parliament.nz/en/pb/hansard-debate...,Gangs Legislation Amendment Bill — First Readi...,Sitting date: 7 Mar 2024,Gangs Legislation Amendment Bill — First Reading,Metadata,https://www.parliament.nz/en/document/HansS_20...


### Export script

Use the script below to export your scraped content to a directory of text files *if your text column contains plain text*. 

Please note the following:

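- this will only work with data in the CSV format exported from webscraper.io, and the script reads the `data`, `metadata-href` and `texts` columns, so your export must contain them (or you must change the column names in the script)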
- you should inspect the webscraper output in CSV format first, to save repeating this process if changes are needed
- you must load the CSV into this notebook as a pandas dataframe using the cell above FIRST
- you must create a directory called 'textfiles' in the same directory as this notebook (or wherever you run the code from)

In [79]:
# once your data is loaded in the cell above and you've created a 'textfiles' directory, run this cell

text_column_name = 'texts' #modify this if your column is named something else

# Write one text file per usable row. The output filename is built from the
# converted date in the 'data' column and the identifier taken from the
# 'metadata-href' column: textfiles/<YYYY-MM-DD>_<identifier>.txt
for idx, row in df.iterrows():
    # Rows without a string in 'data' or in the text column are skipped
    # (cells that are not strings, e.g. missing values, cannot be exported).
    if not (isinstance(row['data'], str) and isinstance(row[text_column_name], str)):
        print('No string data - ignoring row',idx)
        continue

    # Both filename parts must be available; otherwise skip the row with a
    # message instead of aborting the whole export part-way through.
    href = row['metadata-href']
    date = convert_date_format(row['data'][:35])
    if not isinstance(href, str) or date is None:
        print('Missing metadata link or unparseable date - ignoring row', idx)
        continue

    numb = extract_substring(href)
    print(numb)

    filename = 'textfiles/{}.txt'.format(date + "_" + numb)
    print(filename)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(row[text_column_name])
    print('Writing file ' + str(idx), filename)
            

No string data - ignoring row 0
No string data - ignoring row 1
No string data - ignoring row 2
No string data - ignoring row 3
No string data - ignoring row 4
00000842
textfiles/2003-04-01_00000842.txt
Writing file 5 textfiles/2003-04-01_00000842.txt
00000856
textfiles/2003-05-13_00000856.txt
Writing file 6 textfiles/2003-05-13_00000856.txt
00001178
textfiles/2003-05-13_00001178.txt
Writing file 7 textfiles/2003-05-13_00001178.txt
00000682
textfiles/2003-06-19_00000682.txt
Writing file 8 textfiles/2003-06-19_00000682.txt
00001489
textfiles/2003-08-06_00001489.txt
Writing file 9 textfiles/2003-08-06_00001489.txt
00001200
textfiles/2003-10-14_00001200.txt
Writing file 10 textfiles/2003-10-14_00001200.txt
00001431
textfiles/2003-12-03_00001431.txt
Writing file 11 textfiles/2003-12-03_00001431.txt
00001166
textfiles/2004-04-07_00001166.txt
Writing file 12 textfiles/2004-04-07_00001166.txt
00000896
textfiles/2004-05-18_00000896.txt
Writing file 13 textfiles/2004-05-18_00000896.txt
00001183