In [1]:
import numpy as np
import pandas as pd

In [2]:
# This notebook builds reference (mapping) files that translate the coded variables in the Labour Force Survey files into their English and French labels

# Loading the supplemental table

In [3]:
# Read the supplemental variable listing into a pandas dataframe.
# The file is read as ISO-8859-1 so the accented French headers and labels decode correctly.
table = pd.read_csv(
    'Datasets/LFS_PUMF_EPA_FGMD_variables.csv',
    encoding='ISO-8859-1',
)

# Preview the first rows of the table
table.head()

Unnamed: 0,Position / \nPosition,Length / \nLongueur,Start / \nDébut,End / \nFin,Variable / \nVariable,Variable Name - English / \nNom de Variable - Anglais,Variable Name - French / \nNom de Variable - Francais,Note 1 - English,Note 1 - French,Note 2 - English,Note 2 - French,Code / \nCode,Label - English /\nÉtiquette - Anglais,Label - French /\nÉtiquette - Francais
0,1.0,7.0,1976.0,,rec_num,Order of record in file,Ordre de l'observation dans le fichier,,,,,,,
1,,,,,,,,,,,,1-9999999,,
2,8.0,4.0,1976.0,,survyear,Survey year,Année d'enquête,,,,,,,
3,,,,,,,,,,,,1976-,,
4,12.0,2.0,1976.0,,survmnth,Survey month,Mois d'enquête,,,,,,,


In [4]:
# list the column names of the table; the bilingual headers contain embedded line breaks ('\n')
table.columns.values

array(['Position / \nPosition', 'Length / \nLongueur', 'Start / \nDébut',
       'End / \nFin', 'Variable / \nVariable',
       'Variable Name - English / \nNom de Variable - Anglais',
       'Variable Name - French / \nNom de Variable - Francais',
       'Note 1 - English', 'Note 1 - French', 'Note 2 - English',
       'Note 2 - French', 'Code / \nCode',
       'Label - English /\nÉtiquette - Anglais',
       'Label - French /\nÉtiquette - Francais'], dtype=object)

In [5]:
# Rows without a variable name sit between the variable header rows, so keep only the header rows.
# reset_index() stores each kept row's original position in an 'index' column, which is what
# lets us find where the next variable starts.
has_variable_name = table['Variable / \nVariable'].notnull()
tablenotnull = table[has_variable_name].reset_index()

In [6]:
# looking at the top rows of the new table; the 'index' column (each row's position in the
# original table) will be used to find the range between the variables
tablenotnull.head()

Unnamed: 0,index,Position / \nPosition,Length / \nLongueur,Start / \nDébut,End / \nFin,Variable / \nVariable,Variable Name - English / \nNom de Variable - Anglais,Variable Name - French / \nNom de Variable - Francais,Note 1 - English,Note 1 - French,Note 2 - English,Note 2 - French,Code / \nCode,Label - English /\nÉtiquette - Anglais,Label - French /\nÉtiquette - Francais
0,0,1.0,7.0,1976,,rec_num,Order of record in file,Ordre de l'observation dans le fichier,,,,,,,
1,2,8.0,4.0,1976,,survyear,Survey year,Année d'enquête,,,,,,,
2,4,12.0,2.0,1976,,survmnth,Survey month,Mois d'enquête,,,,,,,
3,17,14.0,1.0,2017,,lfsstat,Labour force status,Situation d'activité,,,,,,,
4,22,15.0,2.0,1976,,prov,Province,Province,,,,,,,


# Writing a function to find the positions of the variable in the given table

In [7]:
#function returns the start and end position of the variable

def vardf(var, df=None, df2=None):
    """Return the row range of ``df`` that holds the code values of one variable.

    A variable is introduced by a header row (a row whose variable-name column
    is not null); the rows after it, up to the next header row, list its codes.

    Parameters
    ----------
    var : str
        Name of the variable to look up. Surrounding whitespace is ignored,
        both in ``var`` and in the names stored in the table.
    df : DataFrame, optional
        The full variable table. Defaults to the notebook-level ``table``,
        looked up when the function is called (not when it is defined), so a
        reloaded table is picked up automatically.
    df2 : DataFrame, optional
        The header rows of ``df`` after ``reset_index()``, i.e. with an
        'index' column holding each header row's position in ``df``.
        Built from ``df`` when omitted.

    Returns
    -------
    (int, int)
        ``(start, end)`` for use as ``df.iloc[start:end]``. ``start`` is the row
        right after the variable's header row; ``end`` is exclusive and is the
        header row of the next variable, or ``len(df)`` for the last variable.

    Raises
    ------
    IndexError
        If no variable named ``var`` exists in ``df``.

    Notes
    -----
    Index labels of ``df`` are used as row positions, so ``df`` is expected to
    carry the default 0..n-1 index.
    """
    var_col = 'Variable / \nVariable'
    if df is None:
        df = table
    if df2 is None:
        df2 = df[df[var_col].notnull()].reset_index()

    # start is the index value of the variable's header row
    wanted = str(var).strip()
    hits = df[df[var_col].str.strip() == wanted].index.values
    if len(hits) == 0:
        raise IndexError('variable {!r} not found in column {!r}'.format(wanted, var_col))
    start = hits[0]

    # pos is the location of that header row among the header rows only
    pos = df2[df2['index'] == start].index.values[0]
    if pos + 1 < len(df2):
        # the next header row marks the (exclusive) end of this variable's values
        end = df2['index'].iloc[pos + 1]
    else:
        # last variable in the table: its values run to the end of the table
        end = len(df)

    # start+1 because the first value row follows the header row
    return int(start) + 1, int(end)

In [8]:
# let's look at the result for one of the variables
vardf('age_12')
# the result (start, end) is meant for table.iloc[start:end]: for variable 'age_12' the value rows
# start at row 45 and run through row 56 (the returned end, 57, is exclusive)

(45, 57)

In [9]:
# checking that slicing the table with the returned positions yields the rows holding the
# codes and labels of the variable
table.iloc[45:57]

Unnamed: 0,Position / \nPosition,Length / \nLongueur,Start / \nDébut,End / \nFin,Variable / \nVariable,Variable Name - English / \nNom de Variable - Anglais,Variable Name - French / \nNom de Variable - Francais,Note 1 - English,Note 1 - French,Note 2 - English,Note 2 - French,Code / \nCode,Label - English /\nÉtiquette - Anglais,Label - French /\nÉtiquette - Francais
45,,,,,,,,,,,,1,15 to 19 years,15 à 19 ans
46,,,,,,,,,,,,2,20 to 24 years,20 à 24 ans
47,,,,,,,,,,,,3,25 to 29 years,25 à 29 ans
48,,,,,,,,,,,,4,30 to 34 years,30 à 34 ans
49,,,,,,,,,,,,5,35 to 39 years,35 à 39 ans
50,,,,,,,,,,,,6,40 to 44 years,40 à 44 ans
51,,,,,,,,,,,,7,45 to 49 years,45 à 49 ans
52,,,,,,,,,,,,8,50 to 54 years,50 à 54 ans
53,,,,,,,,,,,,9,55 to 59 years,55 à 59 ans
54,,,,,,,,,,,,10,60 to 64 years,60 à 64 ans


# Cleaning of noc_40 variable

In [10]:
# the name of the variable noc_40 is stored with trailing blanks (whitespace); the blanks must be
# removed before the name can be passed into our function

In [11]:
# checking the position of the noc_40 variable: select the rows whose variable name contains
# 'noc_40' (rows without a variable name never match)
has_noc_40 = table['Variable / \nVariable'].str.contains('noc_40', na=False)
table[has_noc_40]

Unnamed: 0,Position / \nPosition,Length / \nLongueur,Start / \nDébut,End / \nFin,Variable / \nVariable,Variable Name - English / \nNom de Variable - Anglais,Variable Name - French / \nNom de Variable - Francais,Note 1 - English,Note 1 - French,Note 2 - English,Note 2 - French,Code / \nCode,Label - English /\nÉtiquette - Anglais,Label - French /\nÉtiquette - Francais
144,43.0,2.0,2017,,noc_40,Occupation at main job,Profession à l'emploi principal,Currently employed or worked within the past 1...,Présentement occupé ou travaillé au cours des ...,Codes based on the 2016 NOC,Codes selon le CNP de 2016,,,


In [12]:
# look at the current value of the variable name (row 144 is where noc_40 was found above)
table.iloc[144]['Variable / \nVariable']

'noc_40    '

In [13]:
# keep the current (padded) name as the value to be replaced
old = table['Variable / \nVariable'].iloc[144]

In [14]:
# the replacement is the same name with the surrounding white space stripped
new = table['Variable / \nVariable'].iloc[144].strip()

In [15]:
# swap the padded name for the stripped one throughout the variable column
variable_names = table['Variable / \nVariable']
table['Variable / \nVariable'] = variable_names.replace(to_replace=old, value=new)

In [16]:
# now checking that the cleaned variable name can be looked up with our function
vardf('noc_40')

(145, 186)

# Creating a function to store all required variable values as dataframes

In [18]:
def reftables(v):
    """Build the code/label reference dataframe for the variable named ``v``.

    The rows holding the variable's values are located with ``vardf`` and taken
    from the notebook-level ``table``. Only the code and the English and French
    label columns are kept, renamed to ``code``, ``en_label`` and ``fr_label``.

    The code 'blank' is recoded to 0 so that ``code`` can be stored as int64;
    both label columns are converted to str.

    NOTE(review): converting to str would turn a missing label into the text
    'nan' - confirm that the variables passed in have a label on every row.
    """
    first_row, stop_row = vardf(v)

    # original bilingual headers -> short column names of the reference files
    short_names = {
        'Code / \nCode': 'code',
        'Label - English /\nÉtiquette - Anglais': 'en_label',
        'Label - French /\nÉtiquette - Francais': 'fr_label',
    }
    ref = table.iloc[first_row:stop_row][list(short_names)].rename(columns=short_names)

    # 'blank' cannot be parsed as a number, so it becomes code 0 before the integer cast
    ref['code'] = ref['code'].replace('blank', 0)

    # give every column its required type in one step
    return ref.astype({'code': 'int64', 'en_label': str, 'fr_label': str})

# Creating all required reference dataframes

In [19]:
# build one reference dataframe per required variable, in the order listed

naics_21, prov, educ, noc_40, age_12 = [
    reftables(name)
    for name in ('naics_21', 'prov', 'educ', 'noc_40', 'age_12')
]

In [21]:
# write the first reference dataframe to a csv file (utf-8, without the dataframe index)

naics_21.to_csv('Datasets/ref_naics_21.csv', encoding='utf-8', index=False)

In [24]:
# write the remaining reference dataframes to csv files, in the same format as the first one

remaining_refs = {
    'Datasets/ref_prov.csv': prov,
    'Datasets/ref_educ.csv': educ,
    'Datasets/ref_noc_40.csv': noc_40,
    'Datasets/ref_age_12.csv': age_12,
}
for csv_path, ref_df in remaining_refs.items():
    ref_df.to_csv(csv_path, encoding='utf-8', index=False)

In [25]:
#end of this task

# Generating reference dataframes for Task 3

In [48]:
#stripping leading and trailing spaces from every value in the variable column

In [37]:
# names may carry padding (as noc_40 did); strip them all so every name matches exactly in vardf
table['Variable / \nVariable']=table['Variable / \nVariable'].str.strip()

In [39]:
# reference table of codes and labels for the labour force status variable
lfsstat=reftables('lfsstat')

In [40]:
# reference table of codes and labels for the 'sex' variable
sex=reftables('sex')

In [41]:
# reference table of codes and labels for the 'ftptmain' variable
ftptmain=reftables('ftptmain')

In [42]:
# reference table of codes and labels for the 'whypt' variable
whypt=reftables('whypt')

In [44]:
# write the additional reference dataframes to csv files (utf-8, without the dataframe index)

task3_refs = {
    'Datasets/ref_lfsstat.csv': lfsstat,
    'Datasets/ref_sex.csv': sex,
    'Datasets/ref_ftptmain.csv': ftptmain,
    'Datasets/ref_whypt.csv': whypt,
}
for csv_path, ref_df in task3_refs.items():
    ref_df.to_csv(csv_path, encoding='utf-8', index=False)

In [45]:
# building one more reference table, to be used in the steps that follow

noc_10=reftables('noc_10')

In [47]:
# writing the additional reference dataframe to a csv file

noc_10.to_csv('Datasets/ref_noc_10.csv', encoding='utf-8', index=False)