## Jobs

In [None]:
!pip install econml

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score # train/test splitting and cross-validated scoring
from sklearn.preprocessing import StandardScaler # StandardScaler is used to standardise the dataset
import numpy as np # library of mathematical operations
import pandas as pd  # for data analysis and manipulation
import matplotlib.pyplot as plt # to display charts
import seaborn as sns # data visualisation library
from econml.metalearners import XLearner
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

### Loading the dataset

In [None]:
# Read the comma-separated Jobs dataset directly from its GitHub-hosted CSV file,
# then show the resulting frame as the cell output.
JOBS_CSV_URL = "https://raw.githubusercontent.com/dmachlanski/CE888_2022/main/project/data/jobs.csv"
jobs = pd.read_csv(JOBS_CSV_URL, sep=",")
jobs

### Exploring the dataset

In [None]:
# Print pandas' built-in summary of the loaded frame (DataFrame.info).
jobs.info()

In [None]:
# Report the dimensions of the loaded table.
nrow = jobs.shape[0]
ncol = jobs.shape[1]
print(f'There are {nrow} rows and {ncol} columns')

In [None]:
# Select the covariate columns x1 ... x17 (in that order) as the feature matrix.
feature_cols = [f"x{i}" for i in range(1, 18)]
x = jobs[feature_cols]

In [None]:
# Turn the 1-D 't' and 'y' columns into (n, 1) column vectors, since some
# methods prefer a 2-D representation of the data. 'e' is kept as a pandas Series.
T = jobs['t'].to_numpy().reshape(-1, 1)
Y = jobs['y'].to_numpy().reshape(-1, 1)
e = jobs['e']

In [None]:
# Box plot of every feature, to check whether the features vary in scale.
# The figure is saved in the SAME cell that draws it: the inline backend closes
# figures when a cell finishes, so a `plt.savefig` call in a later cell would
# write an empty page instead of this plot.
fig, ax = plt.subplots(figsize=(30, 15))
sns.boxplot(data=x, ax=ax)
fig.savefig("jobs_boxplot.pdf")

In [None]:
# Histogram of every column, to look at the distribution of its values.
# Saved in the same cell as the plot: the inline backend closes figures at the
# end of each cell, so saving from a separate cell would produce a blank file.
axes = jobs.hist(bins=50, figsize=(20, 20))
plt.savefig("jobs_histogram.pdf")

In [None]:
# Relative frequency of each value of 'y' (normalised counts rather than raw counts).
jobs['y'].value_counts(normalize=True)

In [None]:
# Bar chart of how often each value of 'y' occurs.
# The save happens in the plotting cell itself; once the cell ends the inline
# backend closes the figure, and a later `plt.savefig` would write a blank file.
ax = sns.countplot(x="y", data=jobs)
ax.figure.savefig("jobs_y.pdf")

In [None]:
# Pairwise scatter plots across all the features.
# `pairplot` returns a grid object that owns the figure; saving through it, in
# the same cell, guarantees the rendered plot (not a fresh empty figure) is written.
# Note: the grid's `savefig` uses bbox_inches="tight" by default.
grid = sns.pairplot(x)
grid.savefig("jobs_scatterplot.pdf")

In [None]:
# Correlation between each pair of features.
corr = x.corr()

# Heat map of the pairwise correlations, annotated with the coefficient values.
# `sns.set` is kept as-is because it changes global plotting defaults that later
# cells may rely on.
sns.set(rc={'figure.figsize': (20, 15)})
ax = sns.heatmap(corr, annot=True)

# Save inside the plotting cell: the inline backend closes the figure when the
# cell finishes, so a `plt.savefig` in a following cell would save a blank page.
ax.figure.savefig("jobs_heatmap.pdf")

### Data Pre-Processing

In [None]:
# Hold out 20% of the rows for testing. Features, treatment and outcome are split
# in one call so their rows stay aligned. A fixed random_state makes the split,
# and every result derived from it, reproducible across kernel restarts.
x_train, x_test, t_train, t_test, y_train, y_test = train_test_split(
    x, T, Y, test_size=0.2, random_state=42
)

# Standardise the features (mean = 0, s.d. = 1). The scaler is fitted on the
# training split only and then applied unchanged to the test split, so no
# test-set statistics leak into the training data.
scaler_x = StandardScaler()
x_train = scaler_x.fit_transform(x_train)
x_test = scaler_x.transform(x_test)