In [1]:
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
import glob
import os
pd.options.mode.chained_assignment = None

In [2]:
def load_all_paths_parquet_with_text(country, year):
    path = '../data_with_text/' + country + '/' + year + '/*.parquet'
    return sorted(glob.glob(path))

def load_all_paths_parquet_no_text(country, year):
    path = '../data_no_text/' + country + '/' + year + '/*.parquet'
    return sorted(glob.glob(path))

def load_file(path):
    table = pq.read_table(path)
    return table.to_pandas()

def load_files(country, year, number):
    paths = load_all_paths_parquet_no_text(country, year)[:number]
    myDataFrame = pd.DataFrame()
    for path in paths:
        table = pq.read_table(path).to_pandas()
        myDataFrame = myDataFrame.append(table)
    return myDataFrame

In [3]:
df = load_files('UK', '2016',2)

In [4]:
def expand_skill(df):
    df = df[['JobID', 'SICCode','JobDate','CanonCity','CanonCounty',
             'LocalAuthorityDistrict',
             'CanonSkillClusters', 'SOCCode', 'CanonSkills']]
    df['date'] = pd.to_datetime(df['JobDate'])
    s = df.apply(lambda x: pd.Series(x['CanonSkills']), axis=1).stack(
    ).reset_index(level=1, drop=True) # takes a lot of time 
    s.name = 'CanonSkills'
    s = df.drop('CanonSkills', axis=1).join(s)
   
    return s


In [5]:
df = expand_skill(df) ##this takes a lot of time

In [6]:
## if you want to pritn the columns
# for col in df.columns: 
#     print(col) 

In [33]:
def select_df(df, start_date, end_date, 
              city = False, sic = False, soc = False, county = False):
    mask = (df['date'] >= start_date) & (df['date'] <= end_date)
    df = df.loc[mask]
    if city != False:
        df = df[(df['CanonCity'] == city)]
    if sic != False:
        df = df[(df['SICCode'].str[:4] == sic)]
    if soc != False:
        df = df[(df['SOCCode'].str[:3] == soc)]
    if county != False:
        df = df[(df['CanonCounty'] == county)]
        
    return df

### 1) Calculate frequency counts of CanonSkills for a sub-sampe of job ads tagged as 
start_date = '2016-01-01'
end_date = '2016-01-04'
county = 'Oxfordshire'



In [36]:
start_date = '2016-01-01'
end_date = '2016-01-04'
county = 'Oxfordshire'

s1 = select_df(df, start_date = start_date
               , end_date = end_date, county = county)
skill_count_df = pd.DataFrame(s1['CanonSkills'].value_counts())
skill_count_df

Unnamed: 0,CanonSkills
Communication Skills,326
Sales,175
Planning,149
Organisational Skills,136
Teamwork / Collaboration,134
...,...
Online Communications,1
Red Hat Linux Administration,1
Growth Strategies,1
Architecture Governance,1


### 2) Calculate frequency counts of CanonSkills for a sub-sampe of job ads tagged as 
start_date = '2016-01-01'
end_date = '2016-01-04'
county = 'Oxfordshire'
**for each sic**


In [41]:
start_date = '2016-01-01'
end_date = '2016-01-04'
county = 'Oxfordshire'

s1 = select_df(df, start_date = start_date
               , end_date = end_date, county = county)
s2 = s1.groupby(["CanonSkills", "SICCode"]).size()
pd.DataFrame(s2)

Unnamed: 0_level_0,Unnamed: 1_level_0,0
CanonSkills,SICCode,Unnamed: 2_level_1
.NET,42,1
.NET,45.20,2
.NET,46.42,1
.NET,52.24/3,1
.NET,55.10,1
...,...,...
jQuery,86,1
jQuery,87.90,1
jQuery Mobile,87.90,1
uPerform,55.10,1


### 3) Calculate frequency counts of CanonSkills for a sub-sampe of job ads tagged as 
start_date = '2016-01-01'
end_date = '2016-01-04'
county = 'Oxfordshire'
**for each soc**


In [42]:
start_date = '2016-01-01'
end_date = '2016-01-04'
county = 'Oxfordshire'

s1 = select_df(df, start_date = start_date
               , end_date = end_date, county = county)
s2 = s1.groupby(["CanonSkills", "SOCCode"]).size()
pd.DataFrame(s2)

Unnamed: 0_level_0,Unnamed: 1_level_0,0
CanonSkills,SOCCode,Unnamed: 2_level_1
.NET,1121,1
.NET,1122,1
.NET,1132,2
.NET,1190,1
.NET,1259,2
...,...,...
jQuery,4134,1
jQuery,4159,2
jQuery Mobile,3544,1
uPerform,5434,1


### 4) Calculate frequency counts of CanonSkills for a sub-sampe of job ads tagged as 
start_date = '2016-01-01'
end_date = '2016-01-04'
county = 'Oxfordshire'
**for each LocalAuthorityDistrict**


In [46]:
start_date = '2016-01-01'
end_date = '2016-01-04'
county = 'Oxfordshire'

s1 = select_df(df, start_date = start_date
               , end_date = end_date, county = county)
s2 = s1.groupby(["CanonSkills", "LocalAuthorityDistrict"]).size()
pd.DataFrame(s2)

Unnamed: 0_level_0,Unnamed: 1_level_0,0
CanonSkills,LocalAuthorityDistrict,Unnamed: 2_level_1
.NET,Cherwell,3
.NET,Oxford,35
.NET,Vale of White Horse,2
.NET,West Oxfordshire,1
360 Feedback,Oxford,1
...,...,...
jQuery,South Oxfordshire,2
jQuery,Vale of White Horse,1
jQuery Mobile,Oxford,1
uPerform,Oxford,1


### 5) Calculate frequency counts of CanonSkills for a sub-sampe of job ads tagged as 
start_date = '2016-01-01'
end_date = '2016-01-04'
county = 'Oxfordshire'
**for each CanonCity**



In [48]:
start_date = '2016-01-01'
end_date = '2016-01-04'
county = 'Oxfordshire'

s1 = select_df(df, start_date = start_date
               , end_date = end_date, county = county)
s2 = s1.groupby(["CanonSkills", "CanonCity"]).size()
pd.DataFrame(s2)

Unnamed: 0_level_0,Unnamed: 1_level_0,0
CanonSkills,CanonCity,Unnamed: 2_level_1
.NET,ABINGDON,1
.NET,BANBURY,1
.NET,KIDLINGTON,2
.NET,OXFORD,35
.NET,WANTAGE,1
...,...,...
jQuery,WANTAGE,1
jQuery,WATLINGTON,1
jQuery Mobile,OXFORD,1
uPerform,OXFORD,1
