# Feature selection and preprocessing
## By Juan Manuel Franco Islas

In this notebook, I perform feature selection and preprocessing to create the dataset used to train the machine learning model.

## Import libraries

In [1]:
import pandas as pd

## Loading the dataset

In [2]:
# Read the raw job-offers CSV into a DataFrame.
df_main = pd.read_csv(filepath_or_buffer='../../reed_uk_2.csv')

In [3]:
# Preview the first five rows of the raw data.
df_main.head(n=5)

Unnamed: 0,category,city,company_name,job_description,job_requirements,job_title,job_type,post_date,state,salary
0,catering jobs,Chudleigh,Haulfryn Group,Apply now New opportunity not to be missed! H...,,Commis Chef,"Permanent, full-time",3/12/2018,Devon,18323.0
1,law jobs,Swindon,Reed,Apply now This role requires an experienced s...,,Corporate Legal Secretary,"Permanent, full-time",3/8/2018,Wiltshire,20000.0
2,factory jobs,Norfolk,Swanstaff Recruitment Ltd,Apply now Job Title: Warehouse personLocation...,,Warehouse Person,"Permanent, full-time",2/27/2018,East Anglia,
3,law jobs,Hurstpierpoint,Castles Solicitors,Apply now Castles Solicitors are looking for ...,Required skills Admin Case Management Corresp...,Legal Secretary / Admin Assistant,"Permanent, full-time or part-time",2/21/2018,West Sussex,16000.0
4,factory jobs,Hertfordshire,Marketplace Group,Apply now International Pharmaceutical Manufa...,,Category Planner - Pharmaceutical,"Permanent, full-time",3/1/2018,South East England,41000.0


### Dropping useless columns

In [3]:
def drop_useless_columns(df, drop_list):
    """Remove the columns named in ``drop_list`` from ``df`` in place.

    All labels are dropped in a single call, so the operation is
    all-or-nothing: if any label is missing, ``df`` is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated in place.
    drop_list : iterable
        Labels of the columns to remove.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If any label in ``drop_list`` is not a column of ``df``.
    """
    # list(...) ensures a tuple is treated as several labels rather than as
    # one tuple-valued label, matching the per-item behaviour of a loop.
    df.drop(columns=list(drop_list), inplace=True)

In [5]:
# Columns that are not used as model features.
drop_list = [
    'company_name',
    'job_description',
    'job_requirements',
    'city',
    'job_title',
    'post_date',
    'state',
]
drop_useless_columns(df=df_main, drop_list=drop_list)

In [19]:
# Preview the first five rows after the column drop.
df_main.head(n=5)

Unnamed: 0,category,job_type,salary
0,catering jobs,"Permanent, full-time",18323.0
1,law jobs,"Permanent, full-time",20000.0
2,factory jobs,"Permanent, full-time",
3,law jobs,"Permanent, full-time or part-time",16000.0
4,factory jobs,"Permanent, full-time",41000.0


In [20]:
# Report the frame dimensions, one per line with a spacer line in between.
print(
    f'The number of rows is: {df_main.shape[0]}',
    " ",
    f'The number of columns is: {df_main.shape[1]}',
    sep='\n',
)

The number of rows is: 50000
 
The number of columns is: 3


In [7]:
# Count missing values per column.
df_main.isna().sum()

category        0
job_type        0
salary      11017
dtype: int64

In [12]:
# Show column dtypes, non-null counts and memory usage.
df_main.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   category  50000 non-null  object 
 1   job_type  50000 non-null  object 
 2   salary    38983 non-null  float64
dtypes: float64(1), object(2)
memory usage: 1.1+ MB


In [13]:
# include='all' adds the non-numeric columns (count/unique/top/freq) to the summary.
df_main.describe(include='all')

Unnamed: 0,category,job_type,salary
count,50000,50000,38983.0
unique,37,9,
top,health jobs,"Permanent, full-time",
freq,1930,36864,
mean,,,31823.889429
std,,,22554.238833
min,,,3042.0
25%,,,20500.0
50%,,,27019.2
75%,,,37109.0


### Encoding "category" and "job_type" as dummy variables

In [17]:
# One-hot encode the text columns; numeric columns are passed through as-is.
# drop_first=True omits the first level of each encoded column, and
# dtype=float stores the indicators as 0.0 / 1.0.
df_categories = pd.get_dummies(
    data=df_main,
    drop_first=True,
    dtype=float,
)

In [18]:
# Preview the first five encoded rows.
df_categories.head(n=5)

Unnamed: 0,salary,category_accountancy qualified jobs,category_admin secretarial pa jobs,category_apprenticeships jobs,category_banking jobs,category_catering jobs,category_charity jobs,category_construction property jobs,category_customer service jobs,category_education jobs,...,category_strategy consultancy jobs,category_training jobs,"job_type_Contract, full-time or part-time","job_type_Contract, part-time","job_type_Permanent, full-time","job_type_Permanent, full-time or part-time","job_type_Permanent, part-time","job_type_Temporary, full-time","job_type_Temporary, full-time or part-time","job_type_Temporary, part-time"
0,18323.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,20000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,16000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,41000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Report the dimensions of the encoded frame.
print(
    f'The number of rows is: {df_categories.shape[0]}',
    " ",
    f'The number of columns is: {df_categories.shape[1]}',
    sep='\n',
)

The number of rows is: 50000
 
The number of columns is: 45


In [22]:
# Store the current index as an explicit 'job_offer_id' column; the CSV
# exports further down use index=False, so the index itself is not written.
df_categories['job_offer_id'] = df_categories.index

In [23]:
# Preview the first five rows, now including job_offer_id.
df_categories.head(n=5)

Unnamed: 0,salary,category_accountancy qualified jobs,category_admin secretarial pa jobs,category_apprenticeships jobs,category_banking jobs,category_catering jobs,category_charity jobs,category_construction property jobs,category_customer service jobs,category_education jobs,...,category_training jobs,"job_type_Contract, full-time or part-time","job_type_Contract, part-time","job_type_Permanent, full-time","job_type_Permanent, full-time or part-time","job_type_Permanent, part-time","job_type_Temporary, full-time","job_type_Temporary, full-time or part-time","job_type_Temporary, part-time",job_offer_id
0,18323.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
1,20000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1
2,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2
3,16000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3
4,41000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4


### Splitting the dataset in two: rows with a known salary are used to train the model, and rows with a missing salary are set aside to test it.

In [25]:
# Keep only the rows whose salary is present.
df_to_model = df_categories[df_categories['salary'].notna()]

In [26]:
# Preview the first five rows that have a salary.
df_to_model.head(n=5)

Unnamed: 0,salary,category_accountancy qualified jobs,category_admin secretarial pa jobs,category_apprenticeships jobs,category_banking jobs,category_catering jobs,category_charity jobs,category_construction property jobs,category_customer service jobs,category_education jobs,...,category_training jobs,"job_type_Contract, full-time or part-time","job_type_Contract, part-time","job_type_Permanent, full-time","job_type_Permanent, full-time or part-time","job_type_Permanent, part-time","job_type_Temporary, full-time","job_type_Temporary, full-time or part-time","job_type_Temporary, part-time",job_offer_id
0,18323.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
1,20000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1
3,16000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3
4,41000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4
5,9126.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,5


In [27]:
# Report the dimensions of the rows with a known salary.
print(
    f'The number of rows is: {df_to_model.shape[0]}',
    " ",
    f'The number of columns is: {df_to_model.shape[1]}',
    sep='\n',
)

The number of rows is: 38983
 
The number of columns is: 46


In [28]:
# Write the rows with a known salary to disk, without the index column.
df_to_model.to_csv('../../to_model.csv', index=False)

In [29]:
# Keep only the rows whose salary is missing.
df_to_test = df_categories[df_categories['salary'].isna()]

In [30]:
# Preview the first five rows that lack a salary.
df_to_test.head(n=5)

Unnamed: 0,salary,category_accountancy qualified jobs,category_admin secretarial pa jobs,category_apprenticeships jobs,category_banking jobs,category_catering jobs,category_charity jobs,category_construction property jobs,category_customer service jobs,category_education jobs,...,category_training jobs,"job_type_Contract, full-time or part-time","job_type_Contract, part-time","job_type_Permanent, full-time","job_type_Permanent, full-time or part-time","job_type_Permanent, part-time","job_type_Temporary, full-time","job_type_Temporary, full-time or part-time","job_type_Temporary, part-time",job_offer_id
2,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2
12,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,12
13,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,13
18,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,18
27,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,27


In [31]:
# Report the dimensions of the rows with a missing salary.
print(
    f'The number of rows is: {df_to_test.shape[0]}',
    " ",
    f'The number of columns is: {df_to_test.shape[1]}',
    sep='\n',
)

The number of rows is: 11017
 
The number of columns is: 46


In [32]:
# Write the rows with a missing salary to disk, without the index column.
df_to_test.to_csv('../../to_test.csv', index=False)

In [4]:
# Reload the raw CSV into a fresh frame (df_main was modified in place above).
df_main_2 = pd.read_csv(filepath_or_buffer='../../reed_uk_2.csv')

In [5]:
# Same columns as before plus 'salary', leaving only category and job_type.
drop_list_2 = [
    'company_name',
    'job_description',
    'job_requirements',
    'city',
    'job_title',
    'post_date',
    'state',
    'salary',
]
drop_useless_columns(df=df_main_2, drop_list=drop_list_2)

In [7]:
# Store the current index as an explicit 'job_offer_id' column; the CSV
# export below uses index=False, so the index itself is not written.
df_main_2['job_offer_id'] = df_main_2.index

In [8]:
# Preview the first five rows of the un-encoded frame.
df_main_2.head(n=5)

Unnamed: 0,category,job_type,job_offer_id
0,catering jobs,"Permanent, full-time",0
1,law jobs,"Permanent, full-time",1
2,factory jobs,"Permanent, full-time",2
3,law jobs,"Permanent, full-time or part-time",3
4,factory jobs,"Permanent, full-time",4


In [9]:
# Write the un-encoded category/job_type/job_offer_id frame to disk, without the index column.
df_main_2.to_csv('../../to_api.csv', index=False)