# Step 1. Import data from the webscraping

In [1]:
import pandas as pd
import glob
import os

# Get the parent directory path
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)

# Find all files in the parent directory matching the webscrape pattern
csv_pattern = os.path.join(parent_dir, "webscrape_*.csv")
csv_files = glob.glob(csv_pattern)

# Store column headers for each file
all_headers = {}
consistent = True
first_file = None

print(f"Found {len(csv_files)} webscrape CSV files")

# Loop through each file and get headers
for file in csv_files:
    filename = os.path.basename(file)
    
    # Just read the headers without loading entire file
    headers = pd.read_csv(file, nrows=0).columns.tolist()
    all_headers[filename] = headers
    
    # Store first file's headers as reference
    if first_file is None:
        first_file = filename
        reference_headers = headers
        print(f"Reference headers from {first_file}: {reference_headers}")
    # Compare current file with reference
    elif headers != reference_headers:
        consistent = False
        print(f"\nMISMATCH in {filename}:")
        
        # Find and show differences
        missing = set(reference_headers) - set(headers)
        extra = set(headers) - set(reference_headers)
        
        if missing:
            print(f"  Missing columns: {missing}")
        if extra:
            print(f"  Extra columns: {extra}")

# Print final result
if consistent:
    print("\nAll files have identical column headers")
else:
    print("\nWarning: Column headers differ between files")

Found 73 webscrape CSV files
Reference headers from webscrape_2023-08-15_09-20-02.csv: ['title', 'company', 'location', 'salary', 'description']

All files have identical column headers


# Step 2. Add date and date-times so we can identify where each job came from

In [2]:
# Create DataFrames with dates extracted from filenames
import pandas as pd
import re
from datetime import datetime

# Dictionary to store all DataFrames
dataframes = {}

# Regular expression to extract date and time from filename
date_pattern = r'webscrape_(\d{4}-\d{2}-\d{2})_(\d{2}-\d{2}-\d{2})\.csv'

# Process each CSV file
for file in csv_files:
    filename = os.path.basename(file)
    
    # Extract date using regex
    match = re.search(date_pattern, filename)
    if match:
        date_str = match.group(1)  # YYYY-MM-DD
        time_str = match.group(2).replace('-', ':')  # Convert to HH:MM:SS
        datetime_str = f"{date_str} {time_str}"
        
        # Parse to datetime object
        file_datetime = datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')
        
        # Read the CSV
        df = pd.read_csv(file)
        
        # Add date columns
        df['scrape_date'] = file_datetime.date()
        df['scrape_datetime'] = file_datetime
        
        # Store in dictionary
        dataframes[filename] = df
        
        print(f"Loaded {filename} with {len(df)} rows, scraped on {file_datetime}")
    else:
        print(f"Warning: Couldn't extract date from {filename}")

print(f"\nProcessed {len(dataframes)} files")

# Example to verify date columns were added correctly
if dataframes:
    first_key = list(dataframes.keys())[0]
    print(f"\nExample from {first_key}:")
    print(dataframes[first_key][['scrape_date', 'scrape_datetime']].head(2))

Loaded webscrape_2023-08-15_09-20-02.csv with 3923 rows, scraped on 2023-08-15 09:20:02
Loaded webscrape_2023-08-23_09-55-48.csv with 4151 rows, scraped on 2023-08-23 09:55:48
Loaded webscrape_2023-08-30_10-13-36.csv with 3897 rows, scraped on 2023-08-30 10:13:36
Loaded webscrape_2023-09-07_09-32-27.csv with 4266 rows, scraped on 2023-09-07 09:32:27
Loaded webscrape_2023-09-27_11-14-05.csv with 3458 rows, scraped on 2023-09-27 11:14:05
Loaded webscrape_2023-10-04_09-34-49.csv with 3712 rows, scraped on 2023-10-04 09:34:49
Loaded webscrape_2023-10-11_07-59-43.csv with 3362 rows, scraped on 2023-10-11 07:59:43
Loaded webscrape_2023-10-18_09-22-00.csv with 3899 rows, scraped on 2023-10-18 09:22:00
Loaded webscrape_2023-11-01_08-31-18.csv with 4629 rows, scraped on 2023-11-01 08:31:18
Loaded webscrape_2023-11-08_10-36-53.csv with 1859 rows, scraped on 2023-11-08 10:36:53
Loaded webscrape_2023-11-22_10-11-56.csv with 2762 rows, scraped on 2023-11-22 10:11:56
Loaded webscrape_2023-11-29_10-0

# Step 3. Merge the webscrape CSVs

In [3]:
# Merge all DataFrames into one combined dataset
import pandas as pd

# Check if we have DataFrames to merge
if not dataframes:
    print("No DataFrames to merge!")
else:
    # Combine all DataFrames from the dictionary
    combined_df = pd.concat(dataframes.values(), ignore_index=True)
    
    # Basic info about the combined dataset
    print(f"Combined dataset created successfully!")
    print(f"Total rows: {len(combined_df):,}")
    print(f"Total columns: {len(combined_df.columns)}")
    
    # Show unique scrape dates to verify we have data from different days
    unique_dates = combined_df['scrape_date'].unique()
    print(f"\nData includes {len(unique_dates)} unique dates:")
    for date in sorted(unique_dates):
        count = len(combined_df[combined_df['scrape_date'] == date])
        print(f"  {date}: {count:,} rows")
    
    # Preview of the combined data
    print("\nPreview of combined dataset:")
    # Show date columns first, then a few others
    preview_cols = ['scrape_date', 'scrape_datetime'] 
    # Add a few more columns for preview (adjust as needed)
    other_cols = [col for col in combined_df.columns 
                 if col not in ['scrape_date', 'scrape_datetime']][:3]
    preview_cols.extend(other_cols)
    
    print(combined_df[preview_cols].head())

Combined dataset created successfully!
Total rows: 253,897
Total columns: 7

Data includes 72 unique dates:
  2023-08-15: 3,923 rows
  2023-08-23: 4,151 rows
  2023-08-30: 3,897 rows
  2023-09-07: 4,266 rows
  2023-09-27: 3,458 rows
  2023-10-04: 3,712 rows
  2023-10-11: 3,362 rows
  2023-10-18: 3,899 rows
  2023-11-01: 4,629 rows
  2023-11-08: 1,859 rows
  2023-11-22: 2,762 rows
  2023-11-29: 2,931 rows
  2023-12-06: 3,958 rows
  2023-12-13: 4,141 rows
  2023-12-20: 3,775 rows
  2024-01-03: 3,193 rows
  2024-01-10: 4,081 rows
  2024-01-17: 3,630 rows
  2024-01-31: 3,239 rows
  2024-02-07: 3,268 rows
  2024-02-14: 3,427 rows
  2024-02-21: 3,644 rows
  2024-03-13: 3,050 rows
  2024-03-20: 3,491 rows
  2024-03-27: 2,945 rows
  2024-04-10: 3,504 rows
  2024-04-17: 4,330 rows
  2024-04-24: 3,500 rows
  2024-05-01: 2,978 rows
  2024-05-08: 3,199 rows
  2024-05-15: 2,871 rows
  2024-05-22: 2,951 rows
  2024-05-29: 2,248 rows
  2024-06-05: 2,983 rows
  2024-06-12: 2,814 rows
  2024-06-19: 2,5

# Step 4. Tidying data.

## 4.1. Removing rows that aren't jobs.

### 4.1.1. Where 'illustration of bank notes' in job title

In [4]:
# Remove rows where title contains "Illustration of banknotes" with flexible matching

# First check if title column exists
if "title" not in combined_df.columns:
    print("Warning: 'title' column not found in the dataset.")
else:
    # Display some titles to understand what we're working with
    print("Sample of titles in the dataset:")
    sample_titles = combined_df["title"].dropna().sample(min(5, len(combined_df))).tolist()
    for title in sample_titles:
        print(f"  - {title}")
    
    # Count original rows
    original_count = len(combined_df)
    
    # Look at potential matches more flexibly
    potential_matches = combined_df[combined_df["title"].str.contains("Illustration of banknote", case=False, na=False)]
    print(f"\nFound {len(potential_matches)} potential matches with 'Illustration of banknote'")
    
    if len(potential_matches) > 0:
        print("\nSample of matching titles:")
        for title in potential_matches["title"].unique()[:5]:
            print(f"  - {title}")
    
    # Remove rows with flexible matching
    filtered_df = combined_df[~combined_df["title"].str.contains("Illustration of banknote", case=False, na=False)]
    
    # Print results
    print(f"\nOriginal dataset: {original_count:,} rows")
    print(f"Removed: {original_count - len(filtered_df):,} rows")
    print(f"Filtered dataset: {len(filtered_df):,} rows")
    
    # Replace original with filtered data
    combined_df = filtered_df.copy()

Sample of titles in the dataset:
  -                     First Contact Practitioner (Podiatrist)
  -                     Registered Manager
  -                     Deputy Nursing Home Manager
  -                     Paramedic
  -                     Registered Nurse

Found 4659 potential matches with 'Illustration of banknote'

Sample of matching titles:
  -                 Illustration of banknotes
  - 
                Illustration of banknotes

Original dataset: 253,897 rows
Removed: 4,659 rows
Filtered dataset: 249,238 rows


### 4.1.2. Where company includes 'boost your CV'

In [5]:
# Remove rows where company contains "Boost your CV"

# First check if company column exists
if "company" not in combined_df.columns:
    print("Warning: 'company' column not found in the dataset.")
else:
    # Count original rows
    original_count = len(combined_df)
    
    # Look at potential matches more flexibly
    potential_matches = combined_df[combined_df["company"].str.contains("Boost your CV", case=False, na=False)]
    print(f"\nFound {len(potential_matches)} potential matches with 'Boost your CV'")
    
    if len(potential_matches) > 0:
        print("\nSample of matching company entries:")
        for company in potential_matches["company"].unique()[:3]:
            print(f"  - {company}")
    
    # Remove rows with flexible matching
    filtered_df = combined_df[~combined_df["company"].str.contains("Boost your CV", case=False, na=False)]
    
    # Print results
    print(f"\nOriginal dataset: {original_count:,} rows")
    print(f"Removed: {original_count - len(filtered_df):,} rows")
    print(f"Filtered dataset: {len(filtered_df):,} rows")
    
    # Replace original with filtered data
    combined_df = filtered_df.copy()


Found 3500 potential matches with 'Boost your CV'

Sample of matching company entries:
  -         Boost your CV
        It takes 2 minutes and it's free.Try ValueMyCV now ❯

Original dataset: 249,238 rows
Removed: 3,500 rows
Filtered dataset: 245,738 rows


### 4.1.3. Where company contains 'recent the newest jobs for this search'

In [6]:
# Remove rows where company contains "Receive the newest jobs"

# First check if company column exists
if "company" not in combined_df.columns:
    print("Warning: 'company' column not found in the dataset.")
else:
    # Count original rows
    original_count = len(combined_df)
    
    # Look at potential matches with flexible matching
    search_phrase = "Receive the newest jobs"
    potential_matches = combined_df[combined_df["company"].str.contains(search_phrase, case=False, na=False)]
    print(f"\nFound {len(potential_matches)} potential matches with '{search_phrase}'")
    
    if len(potential_matches) > 0:
        print("\nSample of matching company entries:")
        for company in potential_matches["company"].unique()[:3]:
            print(f"  - {company}")
    
    # Remove rows with flexible matching
    filtered_df = combined_df[~combined_df["company"].str.contains(search_phrase, case=False, na=False)]
    
    # Print results
    print(f"\nOriginal dataset: {original_count:,} rows")
    print(f"Removed: {original_count - len(filtered_df):,} rows")
    print(f"Filtered dataset: {len(filtered_df):,} rows")
    
    # Replace original with filtered data
    combined_df = filtered_df.copy()


Found 5608 potential matches with 'Receive the newest jobs'

Sample of matching company entries:
  -             Receive the newest jobs for this search by email:

Original dataset: 245,738 rows
Removed: 5,608 rows
Filtered dataset: 240,130 rows


### Step 4.1.4. Remove missing salaries

##### *Preliminary check*

We first check to see if any of these rows actually contain the salary in the job description. Our analysis below shows that:
1. All of the rows that have a £ sign in the job title already have a salary in the salary, so
2. Missing salaries are not found in the job title, and
3. Where a salary is cited in the job title *and* the salary field, these are usually consistent. They differ usually by unit (i.e. £11p/h becomes £11 in the salary field)

In [7]:
# Updated salary extraction with improved regex patterns

# First check if combined_df exists
if "title" not in combined_df.columns or "salary" not in combined_df.columns:
    print("Warning: Missing required columns")
else:
    # Create DF with rows that have '£' in title AND non-null salary field
    pound_in_title = combined_df["title"].str.contains("£", na=False)
    has_salary_value = ~combined_df["salary"].isna()
    comparison_df = combined_df[pound_in_title & has_salary_value].copy()
    
    print(f"Found {len(comparison_df):,} rows with '£' in title AND salary field populated")
    
    # Improved function to extract salary from title
    def extract_salary_from_title(title):
        # Updated patterns with optional space after £
        patterns = [
            r'£\s*(\d{1,3}[,.]?\d{0,3}[kK]?)', # Basic amounts like £50k, £ 50,000
            r'£\s*(\d{1,3}[,.]?\d{0,3})-£?\s*(\d{1,3}[,.]?\d{0,3}[kK]?)', # Ranges like £30-£40k or £ 30-£ 40k
            r'£\s*(\d{1,3}[,.]?\d{0,3}[kK]?)[/\s]*(day|hour|month|year|pa|p\.a\.)', # With time period
            r'£\s*(\d{1,3}[,.]?\d{0,3})[kK]\s*-\s*£?\s*(\d{1,3}[,.]?\d{0,3})[kK]', # Format like £24K-£32K
        ]
        
        for pattern in patterns:
            match = re.search(pattern, title)
            if match:
                return match.group(0)
        return None
    
    # Add extracted salary column
    comparison_df['title_salary'] = comparison_df['title'].apply(extract_salary_from_title)
    
    # Count how many titles we could extract salaries from
    extracted_count = comparison_df['title_salary'].notna().sum()
    print(f"Successfully extracted salary from {extracted_count} titles ({extracted_count/len(comparison_df)*100:.1f}%)")
    
    # Check the previously failed cases
    try:
        failed_before = failed_extraction_df.copy()
        failed_before['updated_title_salary'] = failed_before['title'].apply(extract_salary_from_title)
        
        newly_extracted = failed_before['updated_title_salary'].notna().sum()
        print(f"\nOf the {len(failed_before)} previously failed extractions:")
        print(f"  - Now successfully extracted: {newly_extracted} ({newly_extracted/len(failed_before)*100:.1f}%)")
        
        if newly_extracted > 0:
            print("\nNewly extracted examples:")
            for _, row in failed_before[failed_before['updated_title_salary'].notna()].iterrows():
                print(f"Title: {row['title']}")
                print(f"  - Extracted salary: {row['updated_title_salary']}")
                print(f"  - Salary field: {row['salary']}")
                print()
    except NameError:
        print("\nPreviously failed cases dataset not available.")
    
    # Get current failed extractions
    failed_now = comparison_df[comparison_df['title_salary'].isna() & comparison_df['title'].str.contains('£', na=False)]
    print(f"\nStill failed to extract from {len(failed_now)} titles")
    
    if len(failed_now) > 0 and len(failed_now) <= 20:
        print("\nRemaining failure cases:")
        for _, row in failed_now.iterrows():
            print(f"Title: {row['title']}")
            print(f"Salary field: {row['salary']}")
            print()

Found 4,575 rows with '£' in title AND salary field populated
Successfully extracted salary from 4564 titles (99.8%)

Previously failed cases dataset not available.

Still failed to extract from 11 titles

Remaining failure cases:
Title:                     Vet Surgeon Vacancy - First Opinion Small Animal - Manchester Central Area (to £
Salary field: £65

Title:                     Vet Surgeon Vacancy - First Opinion Small Animal - Manchester Central Area (to £
Salary field: £65

Title:                     Post Market Surveillance Manager - Oxford - £Competitive
Salary field: £65,000

Title:                     Post Market Surveillance Manager - Oxford - £Competitive
Salary field: £60,000

Title:                     Vet Surgeon Vacancy - First Opinion Small Animal - Manchester Central Area (to £
Salary field: £65

Title:                     Post Market Surveillance Manager - Oxford - £Competitive
Salary field: £60,000

Title:                     Post Market Surveillance Manager - Oxfor

In [8]:
# Compare extracted salary from title with salary field

# First ensure we have the updated extracted salary data
if 'comparison_df' not in locals() or 'title_salary' not in comparison_df.columns:
    print("Error: Please run the updated salary extraction code first")
else:
    # Focus only on rows where we successfully extracted a salary
    valid_comparisons = comparison_df[comparison_df['title_salary'].notna()].copy()
    
    print(f"Analyzing {len(valid_comparisons)} rows with successfully extracted title salaries")
    
    # Function to normalize salary for comparison
    def normalize_salary(salary_text):
        # Convert to string if not already
        salary_str = str(salary_text).lower()
        # Remove spaces and common separators
        salary_str = salary_str.replace(' ', '').replace(',', '')
        return salary_str
    
    # Add normalized versions of both salary fields
    valid_comparisons['norm_title_salary'] = valid_comparisons['title_salary'].apply(normalize_salary)
    valid_comparisons['norm_field_salary'] = valid_comparisons['salary'].apply(normalize_salary)
    
    # Check for matches (using contains rather than exact match)
    valid_comparisons['exact_match'] = valid_comparisons['norm_title_salary'] == valid_comparisons['norm_field_salary']
    valid_comparisons['partial_match'] = valid_comparisons.apply(
        lambda row: (row['norm_title_salary'] in row['norm_field_salary'] or 
                     row['norm_field_salary'] in row['norm_title_salary']), 
        axis=1
    )
    
    # Count match types
    exact_matches = valid_comparisons['exact_match'].sum()
    partial_matches = valid_comparisons['partial_match'].sum() - exact_matches  # Remove overlap with exact
    no_matches = len(valid_comparisons) - exact_matches - partial_matches
    
    # Calculate percentages
    total = len(valid_comparisons)
    exact_pct = (exact_matches / total) * 100
    partial_pct = (partial_matches / total) * 100
    no_match_pct = (no_matches / total) * 100
    
    # Print match statistics
    print("\nSalary match analysis:")
    print(f"  - Exact matches: {exact_matches:,} ({exact_pct:.1f}%)")
    print(f"  - Partial matches: {partial_matches:,} ({partial_pct:.1f}%)")
    print(f"  - No match: {no_matches:,} ({no_match_pct:.1f}%)")
    
    # Show examples of each type
    print("\nEXAMPLES OF EXACT MATCHES:")
    for _, row in valid_comparisons[valid_comparisons['exact_match']].head(3).iterrows():
        print(f"Title: {row['title']}")
        print(f"  - Title salary: {row['title_salary']}")
        print(f"  - Field salary: {row['salary']}")
        print()
    
    print("\nEXAMPLES OF PARTIAL MATCHES:")
    for _, row in valid_comparisons[valid_comparisons['partial_match'] & ~valid_comparisons['exact_match']].head(3).iterrows():
        print(f"Title: {row['title']}")
        print(f"  - Title salary: {row['title_salary']}")
        print(f"  - Field salary: {row['salary']}")
        print()
    
    print("\nEXAMPLES OF NO MATCHES:")
    for _, row in valid_comparisons[~valid_comparisons['partial_match'] & ~valid_comparisons['exact_match']].head(3).iterrows():
        print(f"Title: {row['title']}")
        print(f"  - Title salary: {row['title_salary']}")
        print(f"  - Field salary: {row['salary']}")
        print()

Analyzing 4564 rows with successfully extracted title salaries

Salary match analysis:
  - Exact matches: 1,424 (31.2%)
  - Partial matches: 3,140 (68.8%)
  - No match: 0 (0.0%)

EXAMPLES OF EXACT MATCHES:
Title:                     Associate Dentist - £14-15 per UDA
  - Title salary: £14
  - Field salary: £14

Title:                     Associate Dentist - Up to £14 per UDA
  - Title salary: £14
  - Field salary: £14

Title:                     Dental Nurse - £12 per hour
  - Title salary: £12
  - Field salary: £12


EXAMPLES OF PARTIAL MATCHES:
Title:                     Newly Qualified Registered Nurse (RGN/RMN/RNLD) – Preston – Nursing Home Setting – £19.00 per hour – Full Training and Support Provided.
  - Title salary: £19.00
  - Field salary: £19

Title:                     Vet Surgeon Vacancy - small animal & exotics - Wirral (to £70K)
  - Title salary: £70K
  - Field salary: £70

Title:                     Newly Qualified Registered Nurse (RGN/RMN/RNLD) – Burnley – Nursing Hom

##### *Removal*

In [9]:
# Remove rows where 'salary' is missing (now that we've verified salary data quality)

# First check if salary column exists
if "salary" not in combined_df.columns:
    print("Warning: 'salary' column not found in the dataset.")
else:
    # Count original rows
    original_count = len(combined_df)
    
    # Check how many rows have missing salary values
    missing_salary_count = combined_df["salary"].isna().sum()
    missing_percent = (missing_salary_count / original_count) * 100
    
    print(f"Found {missing_salary_count:,} rows ({missing_percent:.1f}%) with missing salary values")
    
    # Drop rows where salary is missing
    filtered_df = combined_df.dropna(subset=["salary"])
    
    # Print results
    print(f"\nOriginal dataset: {original_count:,} rows")
    print(f"Removed: {missing_salary_count:,} rows ({missing_percent:.1f}%)")
    print(f"Filtered dataset: {len(filtered_df):,} rows")
    
    # Replace original with filtered data
    combined_df = filtered_df.copy()
    
    print("\nSalary column statistics after cleaning:")
    # Check null values (should be zero now)
    null_count = combined_df["salary"].isna().sum()
    print(f"  - Null values: {null_count}")
    
    # Check unique value count
    unique_count = combined_df["salary"].nunique()
    print(f"  - Unique values: {unique_count:,}")
    
    # Sample values
    print("\nSample salary values:")
    for value in combined_df["salary"].sample(5).tolist():
        print(f"  - {value}")

Found 11,301 rows (4.7%) with missing salary values

Original dataset: 240,130 rows
Removed: 11,301 rows (4.7%)
Filtered dataset: 228,829 rows

Salary column statistics after cleaning:
  - Null values: 0
  - Unique values: 32,539

Sample salary values:
  - £11
  - £40,000
  - £12
  - £35,000
  - £45,000


In [10]:
# Analyze 'TOP MATCH' salary rows and check for '£' in description

# First check if necessary columns exist
if "salary" not in combined_df.columns:
    print("Warning: 'salary' column not found in the dataset.")
elif "description" not in combined_df.columns:
    print("Warning: 'description' column not found in the dataset.")
else:
    # Find all rows where salary is 'TOP MATCH'
    top_match_mask = combined_df["salary"] == "TOP MATCH"
    top_match_df = combined_df[top_match_mask]
    
    top_match_count = len(top_match_df)
    top_match_percent = (top_match_count / len(combined_df)) * 100
    
    print(f"Found {top_match_count:,} rows ({top_match_percent:.1f}%) with 'TOP MATCH' as salary")
    
    # Check how many have '£' in description
    has_pound_in_desc = top_match_df["description"].str.contains("£", na=False)
    pound_count = has_pound_in_desc.sum()
    pound_percent = (pound_count / top_match_count) * 100 if top_match_count > 0 else 0
    
    print(f"\nOf these 'TOP MATCH' rows:")
    print(f"  - {pound_count:,} rows ({pound_percent:.1f}%) have '£' in description")
    print(f"  - {top_match_count - pound_count:,} rows ({100 - pound_percent:.1f}%) don't have '£' in description")
    
    # Show examples of descriptions with pound signs
    if pound_count > 0:
        print("\nExample descriptions with '£' (first 100 characters):")
        for desc in top_match_df[has_pound_in_desc]["description"].sample(min(3, pound_count)).tolist():
            # Find the first occurrence of £ and show context
            pound_pos = desc.find('£')
            start_pos = max(0, pound_pos - 50)
            end_pos = min(len(desc), pound_pos + 50)
            context = desc[start_pos:end_pos]
            print(f"  - ...{context}...")
            print()
    
    # Show examples without pound signs
    if top_match_count - pound_count > 0:
        print("\nExample descriptions without '£' (first 100 characters):")
        for desc in top_match_df[~has_pound_in_desc]["description"].sample(min(3, top_match_count - pound_count)).tolist():
            print(f"  - {desc[:100]}...")
            print()

Found 350 rows (0.2%) with 'TOP MATCH' as salary

Of these 'TOP MATCH' rows:
  - 102 rows (29.1%) have '£' in description
  - 248 rows (70.9%) don't have '£' in description

Example descriptions with '£' (first 100 characters):
  - ...ouse Care Home, 81 Dickens Ln, Stockport SK12 1NT £21 - £22.50 Per Hour DOE Clumber House Care Home ...

  - ...to Join our quality care company You will be paid £16.00 per hour after training completed (provided...

  - ...nths acute clinical experience Min grade value is £40,701. Max grade value is £48,054. provide diete...


Example descriptions without '£' (first 100 characters):
  -              Please Note - Our organisation can not provide work sponsorship, please do not apply fo...

  -             We have an exciting opportunity for a MSK Physiotherapist to join our department name te...

  -             Job Title Care Assistant - Days Salary PS12.48ph Hours Part-time [07:00am - 19:00pm] Loc...



Where are we? We found that some of the 'salary' contains 'top match'. I also reflected that in my R script, I've got rid of these. IN some cases, albeit these are still small numbers, there is a salary that can be found and extracted from the job description. So this needs to be sorted.

WE also need to do PT/FT/hours, and create annualised equivalients. 
Town/city/county/region etc.
Some standardised employer names (Bupa, NHS, Nuffield etc to makej sure we capture these)