In [None]:
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
import glob
import os
# Turn off pandas' SettingWithCopyWarning for the whole session (None disables
# the check), so assignments to column subsets in later cells run silently.
pd.options.mode.chained_assignment = None

In [2]:
def load_all_paths_parquet_with_text(country, year):
    """Return the sorted parquet paths under ../data_with_text/<country>/<year>/.

    Both arguments are strings; they are spliced into a glob pattern that is
    resolved relative to the current working directory.
    """
    pattern = '/'.join(['../data_with_text', country, year, '*.parquet'])
    matches = glob.glob(pattern)
    matches.sort()
    return matches

def load_all_paths_parquet_no_text(country, year):
    """Return the sorted parquet paths under ../data_no_text/<country>/<year>/.

    Both arguments are strings; they are spliced into a glob pattern that is
    resolved relative to the current working directory.
    """
    pattern = '/'.join(['../data_no_text', country, year, '*.parquet'])
    matches = glob.glob(pattern)
    matches.sort()
    return matches

def load_file(path):
    """Read the parquet file at `path` and return it as a pandas DataFrame."""
    return pq.read_table(path).to_pandas()

def load_files(country, year, number):
    """Load the first `number` no-text parquet files for a country/year.

    Parameters
    ----------
    country, year : str
        Path components passed to `load_all_paths_parquet_no_text`.
    number : int
        How many of the (sorted) paths to read; applied as a slice, so asking
        for more files than exist simply reads all of them.

    Returns
    -------
    pandas.DataFrame
        The row-wise concatenation of the files read, or an empty DataFrame
        when no path matched. Each file keeps its own row labels, so labels
        may repeat across files.
    """
    paths = load_all_paths_parquet_no_text(country, year)[:number]
    frames = [pq.read_table(path).to_pandas() for path in paths]
    if not frames:
        # pd.concat raises on an empty list; keep the old "empty frame" result.
        return pd.DataFrame()
    # One concat instead of DataFrame.append in a loop: append was removed in
    # pandas 2.0 and re-copied the accumulated frame on every iteration.
    return pd.concat(frames)

In [3]:
# Read the first two no-text parquet files for UK / 2016 into one frame.
df = load_files(country='UK', year='2016', number=2)

In [9]:

def element_to_list(element):
    """Split a delimited cell value into a list of strings.

    The value is converted with `str`, double quotes are removed, and the
    text is split on both ';' and '&'. The result always has at least one
    element (an empty input gives ['']).
    """
    text = str(element)
    single_delimiter = text.replace(";", "&")
    unquoted = single_delimiter.replace('"', "")
    return unquoted.split("&")

def expand_skill(df):
    """Return one row per (job ad, skill cluster).

    Keeps the job-level columns used by the analysis, adds a parsed `date`
    column built from `JobDate`, splits `CanonSkillClusters` with
    `element_to_list`, and repeats each row once per resulting cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain JobID, SICCode, JobDate, CanonCity, CanonCounty,
        LocalAuthorityDistrict, CanonSkillClusters, SOCCode and CanonSkills.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        The selected columns plus `date`, with `CanonSkillClusters` as the
        last column holding a single cluster string per row. Row labels of
        the input are repeated for every cluster of that row.
    """
    columns = ['JobID', 'SICCode', 'JobDate', 'CanonCity', 'CanonCounty',
               'LocalAuthorityDistrict',
               'CanonSkillClusters', 'SOCCode', 'CanonSkills']
    # Explicit copy so the column assignments below never touch the caller's frame.
    expanded = df[columns].copy()
    expanded['date'] = pd.to_datetime(expanded['JobDate'])
    expanded['CanonSkillClusters'] = expanded['CanonSkillClusters'].map(element_to_list)

    # explode replaces the row-wise apply(pd.Series) + stack + join, which built
    # a Series per row (very slow). It also repeats each row's own label, so
    # non-unique row labels do not multiply rows the way the index join could.
    expanded = expanded.explode('CanonSkillClusters')

    # Keep the expanded column last, where the previous join placed it.
    ordered = [name for name in expanded.columns if name != 'CanonSkillClusters']
    ordered.append('CanonSkillClusters')
    return expanded[ordered]


In [10]:
# Rebind `df` to the expanded frame: one row per skill cluster from here on.
df = df.pipe(expand_skill)  # this takes a lot of time

In [11]:
# Show the skill-cluster value of the first row.
df['CanonSkillClusters'].iat[0]

"['Information Technology: Microsoft Development Tools', 'Specialised Skills|Information Technology: Web Development', 'Specialised Skills|Design: Graphic and Visual Design Software', 'Specialised Skills|Information Technology: JavaScript and jQuery', 'Specialised Skills|Information Technology: Web Development', 'Specialised Skills|Information Technology: Microsoft Development Tools', 'Specialised Skills|Specialised Skills|Information Technology: Web Development', 'Specialised Skills|Information Technology: Version Control', 'Specialised Skills|Information Technology: Web Development', 'Specialised Skills|Information Technology: JavaScript and jQuery', 'Specialised Skills|Information Technology: JavaScript and jQuery', 'Specialised Skills|Design: Graphic and Visual Design', 'Specialised Skills|Design: User Interface and User Experience (UI/UX) Design', 'Specialised Skills|Information Technology: Web Development', 'Specialised Skills|Information Technology: Web Design', 'Specialised Ski

In [None]:
## if you want to print the columns
# for col in df.columns: 
#     print(col) 

In [None]:
def select_df(df, start_date, end_date,
              city = False, sic = False, soc = False, county = False):
    """Return the rows of `df` inside a date window, optionally narrowed further.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a `date` column, plus `CanonCity`, `SICCode`, `SOCCode` or
        `CanonCounty` for whichever filters are used.
    start_date, end_date
        Inclusive bounds compared against `df['date']`.
    city, county : str, optional
        Exact match on `CanonCity` / `CanonCounty`.
    sic : str, optional
        Matched against the first four characters of `SICCode`.
    soc : str, optional
        Matched against the first three characters of `SOCCode`.

    A filter left at `False` (the default) or passed as `None` is skipped.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original row labels.
    """
    def _is_set(value):
        # `!= False` keeps the original sentinel semantics; None is accepted as
        # well so callers can use the conventional "not provided" value instead
        # of silently getting an empty frame.
        return value is not None and value != False

    mask = (df['date'] >= start_date) & (df['date'] <= end_date)
    if _is_set(city):
        mask = mask & (df['CanonCity'] == city)
    if _is_set(sic):
        mask = mask & (df['SICCode'].str[:4] == sic)
    if _is_set(soc):
        mask = mask & (df['SOCCode'].str[:3] == soc)
    if _is_set(county):
        mask = mask & (df['CanonCounty'] == county)

    return df.loc[mask]

### 1) Calculate frequency counts of CanonSkills for a sub-sample of job ads tagged as
start_date = '2016-01-01'
end_date = '2016-01-04'
county = 'Oxfordshire'



In [None]:
# Date window (inclusive) and county for this sub-sample.
start_date, end_date, county = '2016-01-01', '2016-01-04', 'Oxfordshire'

# NOTE(review): `df` holds one row per skill cluster after expand_skill, so an
# ad's CanonSkills string is counted once per cluster - confirm this is intended.
s1 = select_df(df, start_date=start_date, end_date=end_date, county=county)
skill_count_df = s1['CanonSkills'].value_counts().to_frame()
skill_count_df

### 2) Calculate frequency counts of CanonSkills for a sub-sample of job ads tagged as
start_date = '2016-01-01'
end_date = '2016-01-04'
county = 'Oxfordshire'
**for each sic**


In [None]:
# Date window (inclusive) and county for this sub-sample.
start_date, end_date, county = '2016-01-01', '2016-01-04', 'Oxfordshire'

s1 = select_df(df, start_date=start_date, end_date=end_date, county=county)
# Number of rows per (CanonSkills, SICCode) pair.
s2 = s1.groupby(["CanonSkills", "SICCode"]).size()
s2.to_frame()

### 3) Calculate frequency counts of CanonSkills for a sub-sample of job ads tagged as
start_date = '2016-01-01'
end_date = '2016-01-04'
county = 'Oxfordshire'
**for each soc**


In [None]:
# Date window (inclusive) and county for this sub-sample.
start_date, end_date, county = '2016-01-01', '2016-01-04', 'Oxfordshire'

s1 = select_df(df, start_date=start_date, end_date=end_date, county=county)
# Number of rows per (CanonSkills, SOCCode) pair.
s2 = s1.groupby(["CanonSkills", "SOCCode"]).size()
s2.to_frame()

### 4) Calculate frequency counts of CanonSkills for a sub-sample of job ads tagged as
start_date = '2016-01-01'
end_date = '2016-01-04'
county = 'Oxfordshire'
**for each LocalAuthorityDistrict**


In [None]:
# Date window (inclusive) and county for this sub-sample.
start_date, end_date, county = '2016-01-01', '2016-01-04', 'Oxfordshire'

s1 = select_df(df, start_date=start_date, end_date=end_date, county=county)
# Number of rows per (CanonSkills, LocalAuthorityDistrict) pair.
s2 = s1.groupby(["CanonSkills", "LocalAuthorityDistrict"]).size()
s2.to_frame()

### 5) Calculate frequency counts of CanonSkills for a sub-sample of job ads tagged as
start_date = '2016-01-01'
end_date = '2016-01-04'
county = 'Oxfordshire'
**for each CanonCity**



In [None]:
# Date window (inclusive) and county for this sub-sample.
start_date, end_date, county = '2016-01-01', '2016-01-04', 'Oxfordshire'

s1 = select_df(df, start_date=start_date, end_date=end_date, county=county)
# Number of rows per (CanonSkills, CanonCity) pair.
s2 = s1.groupby(["CanonSkills", "CanonCity"]).size()
s2.to_frame()