In [3]:
import pandas as pd
from datetime import datetime

In [4]:
# Read 'combined-jobs.csv' (path is relative to the notebook's working directory) into a DataFrame.
df_all = pd.read_csv('combined-jobs.csv')
# Print the column names, non-null counts, and dtypes as a quick sanity check of the load.
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3166 entries, 0 to 3165
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   AY                3166 non-null   object 
 1   Institution       3166 non-null   object 
 2   Position          3166 non-null   object 
 3   Address           3166 non-null   object 
 4   Link              3023 non-null   object 
 5   TrackType         3166 non-null   object 
 6   DateAdded         3166 non-null   object 
 7   DateTimeObj       3166 non-null   object 
 8   Latitude          3166 non-null   float64
 9   Longitude         3166 non-null   float64
 10  FormattedAddress  3166 non-null   object 
 11  City              3097 non-null   object 
 12  County            2861 non-null   object 
 13  State             3143 non-null   object 
 14  StateCode         3055 non-null   object 
 15  Country           3165 non-null   object 
 16  CountryCode       3166 non-null   object 


## Create TrackType Data SubSet

Write new files with the per-date count of job posts for each track type. The flat file holds one record per date and type:

{date, posts: INT total of TT, type: "TT"},
{date, posts: INT total of NTT, type: "NTT"},
{date, posts: INT total of Unavailable, type: "Unavailable"},

In [5]:
def create_subset(df, subset_col_1, subset_col_dates,
                  track_types=('TT', 'NTT', 'Unavailable'),
                  placeholder_date='01/01/1900',
                  date_format='%m/%d/%Y',
                  ema_span=7,
                  verbose=True):
  '''
  # @create_subset: Count posts per posting date and per track type

  ## Params
  - df: pandas DataFrame. Data set with all parameters of interest.
  - subset_col_1: String. Name of the column holding the category to count (e.g. "TrackType").
  - subset_col_dates: String. Name of the column holding the posting date, formatted as `date_format` (e.g. "DateAdded").
  - track_types: Sequence of category values that get their own count column in the per-date table. "Total" is the sum of these columns.
  - placeholder_date: String or None. A date (formatted with `date_format`) that is excluded from both tables. Pass None to keep every date.
  - date_format: String. strptime/strftime format used to parse the date column and to render the "date" output column.
  - ema_span: Number. Span of the exponential moving average stored in "Overall_EMA".
  - verbose: Boolean. When True, print the unique category values and the sorted unique dates.

  ## Return
  - dict with two pandas DataFrames, both ordered by ascending date:
    - "df_new_flat": one row per (date, observed category value) with columns
      date, datetimeObj, type, posts, Overall_EMA.
    - "df_new_per_date": one row per date with columns
      date, datetimeObj, one count column per entry of `track_types`, Total, Overall_EMA.

  ## Notes
  - Rows are matched on the parsed date, so "1/5/2013" and "01/05/2013" are counted as the same day.
  - Rows whose date is missing cannot be assigned to a day and are left out of the counts.
  '''
  uniq_subset_col_values = df[subset_col_1].unique()

  # Parse the date strings once; all matching below happens on the parsed values rather
  # than on re-formatted strings, so zero-padding differences cannot drop rows.
  parsed_dates = pd.to_datetime(df[subset_col_dates], format=date_format)
  uniq_subset_col_dates = parsed_dates.dropna().drop_duplicates().sort_values().tolist()
  list_unique_dates_sorted = [day.strftime(date_format) for day in uniq_subset_col_dates]
  if verbose:
    print(uniq_subset_col_values)
    print(uniq_subset_col_dates)
    print(list_unique_dates_sorted)

  # Count every (date, category) pair in a single pass. `.to_numpy()` drops the index so
  # the two columns are paired by position regardless of df's index.
  keyed = pd.DataFrame({
    'date': parsed_dates.to_numpy(),
    'type': df[subset_col_1].to_numpy(),
  })
  counts = keyed.groupby(['date', 'type'], sort=False).size().to_dict()

  list_record_rows_per_date = []
  list_record_rows_flat = []
  for day, date_str in zip(uniq_subset_col_dates, list_unique_dates_sorted):
    if date_str == placeholder_date:
      continue
    datetime_obj = day.to_pydatetime()

    per_type = {tt: counts.get((day, tt), 0) for tt in track_types}
    list_record_rows_per_date.append({
      'date': date_str,
      'datetimeObj': datetime_obj,
      **per_type,
      'Total': sum(per_type.values()),
    })

    for tt in uniq_subset_col_values:
      list_record_rows_flat.append({
        'date': date_str,
        'datetimeObj': datetime_obj,
        'type': tt,
        'posts': counts.get((day, tt), 0),
      })

  # Passing the columns explicitly keeps the schema stable even when no date survives the filter.
  df_new_flat = pd.DataFrame(
    list_record_rows_flat,
    columns=['date', 'datetimeObj', 'type', 'posts'],
  )
  df_new_per_date = pd.DataFrame(
    list_record_rows_per_date,
    columns=['date', 'datetimeObj', *track_types, 'Total'],
  )

  # Add moving averages (exponential). The cast to float lets the empty case through.
  # NOTE(review): in the flat table the rows of all types are interleaved, so this average
  # runs across types, not within one type. Kept as-is for output compatibility.
  df_new_flat['Overall_EMA'] = df_new_flat['posts'].astype(float).ewm(span=ema_span).mean()
  df_new_per_date['Overall_EMA'] = df_new_per_date['Total'].astype(float).ewm(span=ema_span).mean()
  return {
    'df_new_flat': df_new_flat,
    'df_new_per_date': df_new_per_date,
  }


In [6]:
# create_subset returns a dict holding two frames ('df_new_flat' and 'df_new_per_date'),
# so despite its name this variable is a dict, not a single flat DataFrame.
df__tt_per_date_flat = create_subset(df_all, subset_col_1='TrackType', subset_col_dates='DateAdded')
# Print the column names, non-null counts, and dtypes of the per-date table.
df__tt_per_date_flat['df_new_per_date'].info()

['TT' 'NTT' 'Unavailable']
[Timestamp('1900-01-01 00:00:00'), Timestamp('2012-09-15 00:00:00'), Timestamp('2013-01-15 00:00:00'), Timestamp('2013-01-20 00:00:00'), Timestamp('2013-01-23 00:00:00'), Timestamp('2013-01-24 00:00:00'), Timestamp('2013-01-28 00:00:00'), Timestamp('2013-02-01 00:00:00'), Timestamp('2013-02-03 00:00:00'), Timestamp('2013-02-04 00:00:00'), Timestamp('2013-02-13 00:00:00'), Timestamp('2013-08-15 00:00:00'), Timestamp('2013-08-16 00:00:00'), Timestamp('2013-08-19 00:00:00'), Timestamp('2013-08-21 00:00:00'), Timestamp('2013-08-23 00:00:00'), Timestamp('2013-08-26 00:00:00'), Timestamp('2013-08-27 00:00:00'), Timestamp('2013-08-28 00:00:00'), Timestamp('2013-08-30 00:00:00'), Timestamp('2013-09-01 00:00:00'), Timestamp('2013-09-02 00:00:00'), Timestamp('2013-09-03 00:00:00'), Timestamp('2013-09-04 00:00:00'), Timestamp('2013-09-05 00:00:00'), Timestamp('2013-09-06 00:00:00'), Timestamp('2013-09-08 00:00:00'), Timestamp('2013-09-09 00:00:00'), Timestamp('2013-09-1

In [7]:
# Preview the first rows of the per-date table (one row per date with TT/NTT/Unavailable counts).
df__tt_per_date_flat['df_new_per_date'].head()

Unnamed: 0,date,datetimeObj,TT,NTT,Unavailable,Total,Overall_EMA
0,09/15/2012,2012-09-15,0,1,0,1,1.0
1,01/15/2013,2013-01-15,0,1,0,1,1.0
2,01/20/2013,2013-01-20,0,1,0,1,1.0
3,01/23/2013,2013-01-23,2,0,0,2,1.365714
4,01/24/2013,2013-01-24,1,0,0,1,1.245839


In [8]:
# Save the flat table (one row per date and track type); index=False omits the row index column.
df__tt_per_date_flat['df_new_flat'].to_csv('tt-per-date-flat.csv', index=False)

In [9]:
# Save the per-date table (one row per date); index=False omits the row index column.
df__tt_per_date_flat['df_new_per_date'].to_csv('tt-per-date.csv', index=False)