In [1]:
import pandas as pd
import numpy as np
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

In [2]:
df = pd.read_csv("Zeta_Analytics Dataset.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18359 entries, 0 to 18358
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             18359 non-null  int64  
 1   city                    18359 non-null  object 
 2   city_development_index  18359 non-null  float64
 3   gender                  14261 non-null  object 
 4   relevent_experience     18359 non-null  object 
 5   enrolled_university     18017 non-null  object 
 6   education_level         17902 non-null  object 
 7   major_discipline        15521 non-null  object 
 8   experience              18300 non-null  object 
 9   company_size            13580 non-null  object 
 10  company_type            13320 non-null  object 
 11  last_new_job            17992 non-null  object 
 12  training_hours          18359 non-null  int64  
 13  target                  18359 non-null  int64  
dtypes: float64(1), int64(3), object(10)
me

## Data Preprocessing

In [3]:
df.drop(["city", "enrollee_id"], axis = 1, inplace = True)

In [4]:
cat_cols = df.select_dtypes("O").columns
num_cols = [col for col in df.columns if col not in cat_cols]
cat_na = [col for col in cat_cols if df[col].isnull().any()]
num_na = [col for col in num_cols if df[col].isnull().any()]

In [5]:
df[cat_na] = df[cat_na].fillna("NA")


In [6]:
map_company_size = {"50-99" : 3,
                    "100-500" : 4,
                    "10000+": 8,
                    "10/49  " : 2,
                    "1000-4999" : 6,
                    "<10" : 1  , 
                    "500-999"  : 5,
                    "5000-9999" : 7,
                    "NA": 0}
df["company_size"] = df["company_size"].map(map_company_size)

tx = ["experience","last_new_job"]
for col in tx:
    df[col] = df[col].astype("str").str.replace(">","").replace("<","").replace("never",0).replace("<1",0)
    print(df[col].value_counts())

20    3583
5     1309
4     1250
3     1159
6     1125
2      992
9      979
10     967
7      950
8      755
15     695
11     667
14     602
16     549
12     497
1      452
<1     416
13     412
17     347
19     308
18     286
NA      59
Name: experience, dtype: int64
1     7567
4     4377
2     2835
0     2186
3     1027
NA     367
Name: last_new_job, dtype: int64


In [15]:
df["last_new_job"].value_counts()

1     7567
4     4377
2     2835
0     2186
3     1027
NA     367
Name: last_new_job, dtype: int64

In [16]:
df["education_level"].value_counts()
map_edu_levels = {"Primary School": 1,
                  "High School": 2,
                  "Graduate": 3,
                  "Masters": 4,
                  "Phd": 5,
                  "NA": 0}

In [17]:
X = df.drop("target", axis = 1)
y = df["target"]

In [18]:
from sklearn.model_selection import train_test_split
X_train,X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 42)

In [19]:
exp_median = df["experience"].median()
last_new_job_median = df["last_new_job"].median()
## inputing numeric null values after train test split to avoid data leakage
X_train["experience"]  = X_train["experience"].fillna(exp_median)
X_train["last_new_job"] = X_train["last_new_job"].fillna(last_new_job_median)

#test data
X_test["experience"]  = X_test["experience"].fillna(exp_median)
X_test["last_new_job"] = X_test["last_new_job"].fillna(last_new_job_median)

TypeError: could not convert string to float: '<1'

In [None]:
df.target.value_counts()

0    15934
1     2425
Name: target, dtype: int64

### Define classes for our dataset and neural network

In [None]:
import torch.utils
import torch.utils.data


def evaluate_and_train_nn_model(model, optimzer, batch_size = 128 , epochs = 25, print_every = 5)
    model.train() #setting model to train

    train_loss_list = []
    test_loss_list = []
    train_auc_list = []
    test_auc_list = [] #AUC score is used as evaluation metric as target data is imbalanced
    
    ##Convert tensor data into torch based objects
    train_dataset = JobChange 