**Importing Necessary Libraries**

In [None]:
!pip -q install pyspark #installing pyspark

In [None]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType, FloatType
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, MinMaxScaler, OneHotEncoder, FeatureHasher, Imputer, PolynomialExpansion
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

**Acquiring Data**

In [None]:
#procedure to download kaggle datasets to google drive
os.environ['KAGGLE_USERNAME'] = "lalit871" # username from the json file
os.environ['KAGGLE_KEY'] = "b44f57be0b3e13d58bcf61e93baab8a9" # key from the json file

%cd /content/drive/MyDrive/bigData

!kaggle datasets download -d arashnic/hr-analytics-job-change-of-data-scientists

/content/drive/MyDrive/bigData
Downloading hr-analytics-job-change-of-data-scientists.zip to /content/drive/MyDrive/bigData
  0% 0.00/295k [00:00<?, ?B/s]
100% 295k/295k [00:00<00:00, 20.1MB/s]


In [None]:
!unzip -q  hr-analytics-job-change-of-data-scientists.zip # unzipping zip file

**Importing Data**

In [None]:
# Path of the training CSV that is read into Spark below.
dataPath = "/content/drive/MyDrive/bigData/aug_train.csv"

In [None]:
# Start (or reuse) a Spark session on a local master, registered under the app name 'LogisticHR'.
sc = (
    SparkSession.builder
    .master('local')
    .appName('LogisticHR')
    .getOrCreate()
)

In [None]:
# Read the CSV: the first line is the header, and column types are inferred from the data.
df = sc.read.csv(dataPath, header=True, inferSchema=True)

# Preview the first five rows.
df.show(n=5)

+-----------+--------+----------------------+------+--------------------+-------------------+---------------+----------------+----------+------------+--------------+------------+--------------+------+
|enrollee_id|    city|city_development_index|gender| relevent_experience|enrolled_university|education_level|major_discipline|experience|company_size|  company_type|last_new_job|training_hours|target|
+-----------+--------+----------------------+------+--------------------+-------------------+---------------+----------------+----------+------------+--------------+------------+--------------+------+
|       8949|city_103|                  0.92|  Male|Has relevent expe...|      no_enrollment|       Graduate|            STEM|       >20|        null|          null|           1|            36|   1.0|
|      29725| city_40|    0.7759999999999999|  Male|No relevent exper...|      no_enrollment|       Graduate|            STEM|        15|       50-99|       Pvt Ltd|          >4|            47|   

In [None]:
# Show the inferred schema: column names, types and nullability.
df.printSchema()

root
 |-- enrollee_id: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- city_development_index: double (nullable = true)
 |-- gender: string (nullable = true)
 |-- relevent_experience: string (nullable = true)
 |-- enrolled_university: string (nullable = true)
 |-- education_level: string (nullable = true)
 |-- major_discipline: string (nullable = true)
 |-- experience: string (nullable = true)
 |-- company_size: string (nullable = true)
 |-- company_type: string (nullable = true)
 |-- last_new_job: string (nullable = true)
 |-- training_hours: integer (nullable = true)
 |-- target: double (nullable = true)



In [None]:
# Summary statistics of the dataset.
# The `count` row shows 19158 rows in total; any column with a smaller count has missing
# values, which are handled in the cells below.
df.describe().show()

+-------+------------------+-------+----------------------+------+--------------------+-------------------+---------------+----------------+-----------------+------------+-------------------+------------------+-----------------+-------------------+
|summary|       enrollee_id|   city|city_development_index|gender| relevent_experience|enrolled_university|education_level|major_discipline|       experience|company_size|       company_type|      last_new_job|   training_hours|             target|
+-------+------------------+-------+----------------------+------+--------------------+-------------------+---------------+----------------+-----------------+------------+-------------------+------------------+-----------------+-------------------+
|  count|             19158|  19158|                 19158| 14650|               19158|              18772|          18698|           16345|            19093|       13220|              13018|             18735|            19158|              19158|
|   

In [None]:
# Frequency of each major_discipline value (nulls included), most frequent first.
(
    df.groupBy("major_discipline")
    .count()
    .orderBy('count', ascending=False)
    .show()
)

+----------------+-----+
|major_discipline|count|
+----------------+-----+
|            STEM|14492|
|            null| 2813|
|      Humanities|  669|
|           Other|  381|
| Business Degree|  327|
|            Arts|  253|
|        No Major|  223|
+----------------+-----+



In [None]:
# Frequency of each gender value (nulls included), most frequent first.
(
    df.groupBy("gender")
    .count()
    .orderBy('count', ascending=False)
    .show()
)

+------+-----+
|gender|count|
+------+-----+
|  Male|13221|
|  null| 4508|
|Female| 1238|
| Other|  191|
+------+-----+



In [None]:
# Frequency of each company_type value (nulls included), most frequent first.
(
    df.groupBy("company_type")
    .count()
    .orderBy('count', ascending=False)
    .show()
)

+-------------------+-----+
|       company_type|count|
+-------------------+-----+
|            Pvt Ltd| 9817|
|               null| 6140|
|     Funded Startup| 1001|
|      Public Sector|  955|
|Early Stage Startup|  603|
|                NGO|  521|
|              Other|  121|
+-------------------+-----+



In [None]:
# Frequency of each company_size value (nulls included), most frequent first.
(
    df.groupBy("company_size")
    .count()
    .orderBy('count', ascending=False)
    .show()
)

+------------+-----+
|company_size|count|
+------------+-----+
|        null| 5938|
|       50-99| 3083|
|     100-500| 2571|
|      10000+| 2019|
|       10/49| 1471|
|   1000-4999| 1328|
|         <10| 1308|
|     500-999|  877|
|   5000-9999|  563|
+------------+-----+



In [None]:
# Class balance of the label. Knowing how skewed it is helps later when choosing the
# model's decision threshold.
(
    df.groupby('target')
    .count()
    .show()
)

+------+-----+
|target|count|
+------+-----+
|   0.0|14381|
|   1.0| 4777|
+------+-----+



The columns gender, company_size, company_type and major_discipline have a very large number of missing values. Rows containing null values are dropped before continuing.

In [None]:
# Discard every row that has a null in at least one column.
df = df.dropna(how='any')

# Summary statistics again: every column should now report the same count.
df.describe().show()

+-------+------------------+-------+----------------------+------+--------------------+-------------------+---------------+----------------+-----------------+------------+-------------------+------------------+-----------------+-------------------+
|summary|       enrollee_id|   city|city_development_index|gender| relevent_experience|enrolled_university|education_level|major_discipline|       experience|company_size|       company_type|      last_new_job|   training_hours|             target|
+-------+------------------+-------+----------------------+------+--------------------+-------------------+---------------+----------------+-----------------+------------+-------------------+------------------+-----------------+-------------------+
|  count|              8955|   8955|                  8955|  8955|                8955|               8955|           8955|            8955|             8955|        8955|               8955|              8955|             8955|               8955|
|   

In [None]:
# Class balance after dropping nulls: the imbalance is worse than before, and the
# dataset has shrunk to roughly half its original size.
(
    df.groupby('target')
    .count()
    .show()
)

+------+-----+
|target|count|
+------+-----+
|   0.0| 7472|
|   1.0| 1483|
+------+-----+



**String Indexer**

In [None]:
# Categorical (string) columns that need to be converted to numeric indices.
cols = ['city', 'gender', 'relevent_experience', 'enrolled_university', 'education_level', 'major_discipline', 'experience', 'company_size', 'company_type', 'last_new_job']

# Pipeline stages: one StringIndexer per categorical column, each writing to "<column>Ind".
# (The VectorAssembler stage is added to this list in a later cell.)
# A comprehension is used instead of a `for col in cols` loop so that no notebook-level
# variable named `col` is left behind; that name would overwrite pyspark.sql.functions.col,
# which the star import at the top of the notebook brings into scope.
stages = [
    StringIndexer(inputCol=name, outputCol=name + "Ind")
    for name in cols
]

**Vector Assembler**

In [None]:
# Numeric columns that enter the feature vector as they are.
colV = ['city_development_index', 'training_hours']

# Inputs of the feature vector: the string-indexed categorical columns followed by the
# numeric columns.
inpV = [f"{k}Ind" for k in cols] + colV
assembler = VectorAssembler(inputCols=inpV, outputCol="features")

# The assembler runs after all the indexers, so it goes at the end of the stage list.
stages = [*stages, assembler]

**Creating Pipeline**

In [None]:
# Chain the stages defined earlier (string indexers, then the vector assembler).
pipeline = Pipeline(stages=stages)

# Fit the stages on the cleaned data ...
pipelineModel = pipeline.fit(df)

# ... and apply the fitted pipeline to that same data to obtain the "features" column.
transformed_df = pipelineModel.transform(df)

**Polynomial Expansion**

In [None]:
# Expand "features" into degree-2 polynomial terms (degree 2 gave the best results).
polyExpansion = PolynomialExpansion(
    degree=2,
    inputCol="features",
    outputCol="polyFeatures",
)
poly_feature_dataframe = polyExpansion.transform(transformed_df)

In [None]:
# Peek at the expanded vectors: the 12 input features become 90 (2n + nC2).
(
    poly_feature_dataframe
    .select('polyFeatures')
    .show(10)
)

+--------------------+
|        polyFeatures|
+--------------------+
|(90,[0,1,5,6,8,27...|
|(90,[0,1,14,15,19...|
|(90,[0,1,27,28,34...|
|(90,[27,34,65,72,...|
|(90,[27,34,35,42,...|
|(90,[0,1,35,36,43...|
|(90,[0,1,27,28,34...|
|(90,[0,1,14,15,19...|
|[3.0,9.0,0.0,0.0,...|
|(90,[0,1,20,21,26...|
+--------------------+
only showing top 10 rows



**Fitting the data**

In [None]:
# 80:20 train/test split with a fixed seed.
train, test = poly_feature_dataframe.randomSplit([0.8, 0.2], seed=40)

# Standardisation together with threshold=0.4 gave the best results. Class 0 heavily
# outnumbers class 1, so lowering the threshold to 0.4 helped to increase recall.
lr = LogisticRegression(
    featuresCol='polyFeatures',
    labelCol='target',
    maxIter=10,
    standardization=True,
    threshold=0.4,
)
lrModel = lr.fit(train)

# Score the held-out rows with the fitted model.
predictions = lrModel.transform(test)

**Evaluation**

The F1 score is the harmonic mean of precision and recall, so it is penalised when either false positives or false negatives are numerous.

In [None]:
# Evaluate the predictions with two metrics: accuracy and F1 score.
# NOTE: Spark's "f1" metric is the class-weighted F1 over all classes, not the F1 of the
# positive class alone, so on this imbalanced target it is dominated by class 0.
# The evaluator is not named `eval`, because that would shadow Python's builtin eval().
evaluator = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName='accuracy')
print('Accuracy: ', evaluator.evaluate(predictions))

# The param map overrides metricName for this call only.
print('F1 score: ', evaluator.evaluate(predictions, {evaluator.metricName: "f1"}))

Accuracy:  0.8552338530066815
F1 score:  0.8502745460429203


**Using Feature Hasher**

In [None]:
# Hash the twelve predictor columns into a 6-dimensional feature vector
# (numFeatures=6 gave the best result).
hasher = FeatureHasher(
    numFeatures=6,
    inputCols=[
        'city', 'city_development_index', 'gender', 'relevent_experience',
        'enrolled_university', 'education_level', 'major_discipline', 'experience',
        'company_size', 'company_type', 'last_new_job', 'training_hours',
    ],
    outputCol="features",
)

# Add the hashed "features" column to the cleaned dataset.
hashed_dataset = hasher.transform(df)

In [None]:
# 80:20 train/test split of the hashed dataset, using the same seed as before.
train, test = hashed_dataset.randomSplit([0.8, 0.2], seed=40)

In [None]:
# Fit a logistic regression on the hashed training features.
lr = LogisticRegression(
    featuresCol='features',
    labelCol='target',
    maxIter=10,
)
lrModel = lr.fit(train)

In [None]:
# Predict on the held-out split with the trained model.
predictions = lrModel.transform(test)

In [None]:
# Evaluate the results using accuracy and F1 score.
# NOTE: Spark's "f1" metric is the class-weighted F1 over all classes, not the F1 of the
# positive class alone, so on this imbalanced target it is dominated by class 0.
# The evaluator is not named `eval`, because that would shadow Python's builtin eval().
evaluator = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName='accuracy')
print('Accuracy: ', evaluator.evaluate(predictions))

# The param map overrides metricName for this call only.
print('F1 score: ', evaluator.evaluate(predictions, {evaluator.metricName: "f1"}))

Accuracy:  0.8335189309576837
F1 score:  0.7654409466648603


**Experiment Results**

In [None]:

# without polynomial expansion
# Accuracy:  0.839086859688196
# F1 score:  0.795258555310815

# polynomial degree = 2 and threshold =0.5
# Accuracy:  0.8385300668151447
# F1 score:  0.8120707239194876

# polynomial degree = 2 and threshold =0.4
# Accuracy:  0.8552338530066815
# F1 score:  0.8502745460429203

# polynomial degree = 3
# Accuracy:  0.8368596881959911
# F1 score:  0.8048871364559362

# Feature Hasher, num columns = 6
# Accuracy:  0.8335189309576837
# F1 score:  0.7654409466648603

# Feature Hasher, num columns = 5
# Accuracy:  0.8340757238307349
# F1 score:  0.7586189643584947