In [None]:
import pandas as pd
import sys
import re
import pickle
import concurrent.futures
import matplotlib.pyplot as plt
import time

In [None]:
# adding module to the system path
sys.path.append('..')
from craiglist_gig_scraper import CraiglistGigScraper

## Problem
The task is to scrape the gigs section of Craigslist Boston and figure out how much money someone could make per day if they did all the gigs.

## Methodology

1. Click on the 'paid' filter and 'bundle duplicates' checkbox to reduce noise in data
2. Create a list of all the post URLs
3. Open each url and for each post extract date, title, compensation, and type
4. Store the scraped data in a DataFrame with columns (title, timestamp (date), compensation, type)
5. Perform a groupby-sum aggregation on date and compensation to determine compensation per day

In [None]:
# Search URL for the gigs section of Craigslist Boston (the data source for this analysis)
URL = 'https://boston.craigslist.org/search/gbs/ggg'
# Scraper instance from the project-local craiglist_gig_scraper module
scraper_one = CraiglistGigScraper()
# NOTE(review): presumably opens the search page inside the scraper's session — confirm in the scraper module
scraper_one.load_craigslist_url(URL)

In [None]:
# Get all gig links
# NOTE(review): presumably a list of post URLs (it is de-duplicated and sliced further down) — confirm in the scraper module
links = scraper_one.extract_gig_links()

In [None]:
def pickle_scrape(links, filename, action=''):
    '''
    Implements binary protocols for serializing and de-serializing objects
    :param links: List
    :param filename: String
    :param action: String
    '''
    match action:
        case 'dump':
            with open(f'{filename}.pkl', 'wb') as file:
                return pickle.dump(links, file)
        case 'load':
            with open(f'{filename}.pkl', 'rb') as file:
                return pickle.load(file)

In [None]:
# De-duplicate while keeping the scraped order, then keep the first 20 links.
# dict.fromkeys preserves first-seen order, whereas set() ordering of strings
# changes between kernel sessions and would sample a different 20 links each run.
cleaned_links = list(dict.fromkeys(links))[:20]
cleaned_links

### Step 1: Collect
Gather data from the Craigslist gigs section

#### Step 1A: Launching parallel tasks
Use ThreadPoolExecutor to execute the scraper's extraction method asynchronously.

* CPU times: user 20.5 s, sys: 15.7 s, total: 36.2 s
* Wall time: 28min 40s

In [None]:
%%time
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = []
    for link in cleaned_links:
        # submit fn schedules the callable and returns a future object representing the execution of the callable
        futures.append(executor.submit(scraper_one.extract_gig_information, url=link))
        time.sleep(3)

In [None]:
# Collect each scrape result as its future finishes (completion order, not submission order)
gigs = list(
    finished.result()
    for finished in concurrent.futures.as_completed(futures)
)

In [None]:
# Tabulate the scraped gigs, one row per gig
gig_df = pd.DataFrame(
    data=gigs,
    columns=['title', 'timestamp', 'compensation', 'type'],
)
gig_df

### Step 2: Clean
Use pandas and Python string manipulation to format the text and numbers in a DataFrame

In [None]:
# Load gig data from a CSV file on disk.
# NOTE: this rebinds gig_df, replacing the frame built from the live scrape in the cell above.
gig_df = pd.read_csv('./data/gig_data_all.csv')

In [None]:
def has_numbers(string):
    '''Return True if ``string`` contains at least one digit character.

    The value is converted with ``str()`` first, so missing cells (``None`` or
    the float NaN that pandas uses for empty CSV fields) yield False instead
    of raising ``TypeError``.
    '''
    return any(char.isdigit() for char in str(string))

In [None]:
# Create date and time columns from timestamp.
# Assumes timestamps look like '<date>T<time>' (the original code split on 'T') — TODO confirm.
# str.partition always yields three columns (before, separator, after), so a value
# without a 'T' keeps the whole string as its date and gets an empty time.
timestamp_parts = gig_df['timestamp'].str.partition('T')
# 'date' is the column the per-day aggregation and bar chart further down rely on
gig_df['date'] = timestamp_parts[0]
gig_df['time'] = timestamp_parts[2]

In [None]:
# Parse the timestamp column into pandas datetimes and store it back on the frame
parsed_timestamps = pd.to_datetime(gig_df['timestamp'])
gig_df['timestamp'] = parsed_timestamps

# Derive the name of the weekday for every gig from its parsed timestamp
gig_df['day_of_week'] = gig_df['timestamp'].dt.day_name()

In [None]:
# Flag the rows whose compensation text contains at least one digit
gig_df['has_compensation'] = gig_df['compensation'].apply(has_numbers)

# Remove, in place, every row that was flagged as having no numeric value
rows_without_amount = gig_df.index[gig_df['has_compensation'].eq(False)]
gig_df.drop(index=rows_without_amount, inplace=True)

In [None]:
# Extract the first '$'-prefixed amount (digits, commas, dots) from each compensation text.
# Raw string: '\$' in a normal string literal is an invalid escape sequence and
# triggers a warning on recent Python versions.
gig_df['compensation'] = gig_df['compensation'].str.extract(r'(\$[0-9,.]+)', expand=False)

In [None]:
# Keep only the rows where a '$' amount was extracted. .copy() makes the result an
# independent frame, so the column assignment below does not raise SettingWithCopyWarning.
gig_df = gig_df[gig_df['compensation'].notna()].copy()
# Strip the '$' sign and thousands separators so the value can be parsed as a number
gig_df['compensation'] = gig_df['compensation'].replace({r'\$': '', ',': ''}, regex=True)

In [None]:
# Parse each cleaned amount as a float, then truncate it to whole dollars
gig_df['compensation'] = gig_df['compensation'].map(float).map(int)

In [None]:
# Gigs paying 1,000 or more, kept aside for manual inspection
pays_thousand_plus = gig_df['compensation'].ge(1000)
over_a_thousand = gig_df.loc[pays_thousand_plus]

In [None]:
# Keep gigs paying at most 5,000; anything above 5k is filtered out
within_cap = gig_df['compensation'].le(5000)
filtered_gigs = gig_df.loc[within_cap]
filtered_gigs

In [None]:
# Total compensation per date, as a two-column frame: date, sum
aggregated_df = (
    filtered_gigs
    .groupby('date')['compensation']
    .sum()
    .reset_index(name='sum')
)

# Mean of the daily totals
aggregated_df['sum'].mean()

### Step 3: Visualize
Use Matplotlib to visualize the data set

In [None]:
# Pie chart of total compensation by gig type.
# Only the compensation column is summed: summing every column also hits the
# datetime timestamp column, which pandas 2.x refuses to sum (and needlessly
# concatenates the text columns on older versions).
compensation_by_type = filtered_gigs.groupby('type')[['compensation']].sum()
compensation_by_type.plot(kind='pie', y='compensation', title='Total compensation by gig type');

In [None]:
# Bar chart of total compensation per day.
# Totals are aggregated first: drawing one bar per gig puts every bar for the same
# date on top of the others, so the chart would show the largest gig, not the daily sum.
daily_totals = filtered_gigs.groupby('date')['compensation'].sum()
plt.bar(daily_totals.index.astype(str), daily_totals.values)
plt.xlabel('Date')
plt.ylabel('Total compensation ($)')
plt.title('Total gig compensation per day')
plt.xticks(rotation=90)
plt.show()

### Step 4: Takeaways
Discuss valuable takeaways and potential next steps

1. Challenging data source 
    - What is the persona?
    - How might someone ingest this data?
2. Not representative sample because the paid filter returns gigs that don't mention any monetary value
3. Assumptions
    - How much time does this person have to work?
    - Their distance relative to the gig location
    - The type of gig they are most interested in
    - The amount of jobs someone can complete
    - The person's mode of transportation -- job requirements
    - Gig duration
4. Constraints 
    - CPU
    - Time
5. Decisions
    - Drop non-numeric compensation values
    - Filter out gigs paying more than 5k
    - Extract compensation values with a '$'
6. Issues with compensation values
    - Project-based
    - Salary
    - Per hour
7. Working with compensation values
    - Apply ML (NLP)
    - Pull in external APIs (Google Maps)

### Tests
Try out some things

In [None]:
# Select the rows in which any text cell contains one of the hourly-rate keywords.
# NOTE(review): these are substring matches, so 'per' also hits words such as 'experience'.
keyword_search_for = ['per', 'hour', 'hr']
keyword_pattern = '|'.join(keyword_search_for)
# stack() flattens the frame to one value per (row, column); na=False counts
# non-text cells as "no match" instead of leaving NaN in the mask.
cell_matches = gig_df.stack().str.contains(keyword_pattern, na=False)
# Series.any(level=0) was removed in pandas 2.0; groupby(level=0).any() replaces it.
# stack() drops empty cells, so reindex restores fully-empty rows as False and
# keeps the mask aligned with gig_df.
row_matches = cell_matches.groupby(level=0).any().reindex(gig_df.index, fill_value=False)
gig_df[row_matches]

In [None]:
# Frequency of each distinct compensation value, most common first
compensation_column = gig_df['compensation']
count = compensation_column.value_counts()
count

In [None]:
# Count how often each whitespace-separated token occurs in the compensation column.
# The column is converted to integers earlier in the notebook, and the .str accessor
# only works on text, so missing values are dropped and the rest cast to str first.
compensation_text = gig_df['compensation'].dropna().astype(str)
word_frequency = compensation_text.str.split(expand=True).stack().value_counts()
# to_frame names the column directly; pd.DataFrame(series, columns=[...]) selects by
# the Series name instead and produces an empty column when the names differ.
word_frequency.to_frame(name='frequency')

In [None]:
# Try splitting sample compensation strings on common separators.
test_strings_list = ['compensation: $18-$24', 'compensation: $75 to $150 Daily']
# Raw string: '\*' in a normal string literal is an invalid escape sequence.
# The regex engine itself still reads \n as a newline, so matching is unchanged.
separator_pattern = r'; |, |\*|\n|to|-|:| '
regex_split = [re.split(separator_pattern, _str) for _str in test_strings_list]
regex_split