In [1]:
# importing libraries for EDA and preprocessing

import numpy as np
import pandas as pd
#from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import os
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import transformers
from transformers import BertTokenizer, BertForMaskedLM, AutoModelForMaskedLM
import torch
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler 
from hyperopt import tpe, hp, fmin, STATUS_OK,Trials
from hyperopt.pyll.base import scope
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import seaborn as sn

from encoder import CATENCODE, PIPELINE
import fet_sel
from hp_tune import hyperparameter_tuning

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [21]:
df=pd.read_csv('./layoffs_data.csv')
df.head()

Unnamed: 0,Company,Location,Industry,Date,Source,Funds_Raised,Stage,Date_Added,Country,Laid_Off_Count,Percentage,List_of_Employees_Laid_Off
0,Xiaomi,Beijing,Consumer,2022-12-19,https://technode.com/2022/12/19/chinese-phone-...,7400.0,IPO,2022-12-19 16:46:32,United States,,,Unknown
1,TuSimple,San Diego,Transportation,2022-12-16,https://www.wsj.com/articles/tusimple-plans-la...,648.0,IPO,2022-12-18 04:12:29,United States,700.0,0.5,Unknown
2,Tomorrow,Hamburg,Finance,2022-12-16,https://www.businessinsider.de/gruenderszene/f...,29.0,Unknown,2022-12-18 04:15:07,Germany,30.0,0.25,Unknown
3,Revelate,Montreal,Data,2022-12-16,https://betakit.com/layoffs-persist-at-canadia...,26.0,Series A,2022-12-19 17:17:13,Canada,24.0,0.3,Unknown
4,E Inc.,Toronto,Transportation,2022-12-16,https://betakit.com/layoffs-persist-at-canadia...,,IPO,2022-12-19 17:19:15,Canada,,,Unknown


### check missing values 

In [22]:
(df.isnull().sum()/len(df))*100

Company                        0.000000
Location                       0.000000
Industry                       0.000000
Date                           0.054645
Source                         0.000000
Funds_Raised                   7.377049
Stage                          0.000000
Date_Added                     0.000000
Country                        0.000000
Laid_Off_Count                30.163934
Percentage                    32.622951
List_of_Employees_Laid_Off     0.000000
dtype: float64

In [23]:
(df.isnull().sum())

Company                         0
Location                        0
Industry                        0
Date                            1
Source                          0
Funds_Raised                  135
Stage                           0
Date_Added                      0
Country                         0
Laid_Off_Count                552
Percentage                    597
List_of_Employees_Laid_Off      0
dtype: int64

In [18]:
(*135//10)

130

### check cardinality of categorical features 

In [24]:
cat_features = df.select_dtypes(exclude=["number","bool_"]).columns
for i in cat_features:
    print(f'unique values of {str(i)} :- {len(df[str(i)].unique())}')

unique values of Company :- 1521
unique values of Location :- 163
unique values of Industry :- 28
unique values of Date :- 417
unique values of Source :- 1648
unique values of Stage :- 15
unique values of Date_Added :- 1791
unique values of Country :- 55
unique values of List_of_Employees_Laid_Off :- 91


#### dropping few columns from analysis, intution is that it wont effect the analysis 

In [25]:
df.drop(columns=['Date', 'Company','Date_Added','List_of_Employees_Laid_Off','Source'], inplace=True)
df.head()

Unnamed: 0,Location,Industry,Funds_Raised,Stage,Country,Laid_Off_Count,Percentage
0,Beijing,Consumer,7400.0,IPO,United States,,
1,San Diego,Transportation,648.0,IPO,United States,700.0,0.5
2,Hamburg,Finance,29.0,Unknown,Germany,30.0,0.25
3,Montreal,Data,26.0,Series A,Canada,24.0,0.3
4,Toronto,Transportation,,IPO,Canada,,


### Check cardinality of selected data

In [26]:
cat_features = df.select_dtypes(exclude=["number","bool_"]).columns
for i in cat_features:
    print(f'unique values of {str(i)} :- {len(df[str(i)].unique())}')

unique values of Location :- 163
unique values of Industry :- 28
unique values of Stage :- 15
unique values of Country :- 55


## data impuatation 
### &nbsp;- Imputing data by training on remaining data and predicting the missing values 
#### &nbsp; Ref : https://ieeexplore.ieee.org/document/8987895
### &nbsp;- <span style='color:green'> Novel method :</span>
### &emsp;- categorical variable encoding is done by using bert word embedding 
### &emsp;- final encoded vector is max_len * 768 , but first 2 PCA compoenets are used to as features 

In [29]:
# lets filter out columns of interest for imputation 
# columns having missing data 
coi = [i for i in df.columns if df[str(i)].isna().sum()>0]
print(coi)

['Funds_Raised', 'Laid_Off_Count', 'Percentage']


In [31]:
# drop the columns with missing data 
#train a model on remaining data with dependent parameter predict will be the column with missing data
data= df.drop(columns=coi)
for col in coi:
    print(col)
    data[str(col)] = df[str(col)]
    print(f' features of new data are {data.columns}')
    data_train = data[~data[str(col)].isna()]
    data_train.reset_index(drop=True,inplace=True)
    data_test = data[data[str(col)].isna()]
    data_test.reset_index(drop=True,inplace=True)
    X_train = data_train.drop(columns=[str(col)])
    Y_train = data_train[str(col)]
    X_test = data_test.drop(columns=[str(col)])
    Y_test = data_test[str(col)]
    X_train, X_test = PIPELINE(1).cat_encode_pipeline(X_train, X_test)
    print(Y_train.isna().sum())
    features = fet_sel.feature_sel(X_train,Y_train,0.05)
    print(features)
    X_train = X_train[features]
    X_test = X_test[features]
    print(f' features used for training {X_train.columns}, test {X_test.columns}')
    # lets train Random forest to predict missing values 
    rm = RandomForestRegressor(n_estimators=500)
    rm.fit(X_train,Y_train)
    Y_pred = rm.predict(X_test)
    data_test[str(col)] = Y_pred
    data =pd.concat([data_train, data_test], axis=0)
    

Funds_Raised
 features of new data are Index(['Location', 'Industry', 'Stage', 'Country', 'Funds_Raised'], dtype='object')


Some weights of the model checkpoint at ./bert_base_uncased/ were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at ./bert_base_uncased/ were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions

0
5       Stage_2
0    Location_1
1    Location_2
2    Industry_1
Name: Features, dtype: object
 features used for training Index(['Stage_2', 'Location_1', 'Location_2', 'Industry_1'], dtype='object'), test Index(['Stage_2', 'Location_1', 'Location_2', 'Industry_1'], dtype='object')
Laid_Off_Count
 features of new data are Index(['Location', 'Industry', 'Stage', 'Country', 'Funds_Raised',
       'Laid_Off_Count'],
      dtype='object')


Some weights of the model checkpoint at ./bert_base_uncased/ were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at ./bert_base_uncased/ were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions

0
7       Country_1
2      Location_2
4      Industry_2
6         Stage_2
5         Stage_1
1      Location_1
3      Industry_1
0    Funds_Raised
Name: Features, dtype: object
 features used for training Index(['Country_1', 'Location_2', 'Industry_2', 'Stage_2', 'Stage_1',
       'Location_1', 'Industry_1', 'Funds_Raised'],
      dtype='object'), test Index(['Country_1', 'Location_2', 'Industry_2', 'Stage_2', 'Stage_1',
       'Location_1', 'Industry_1', 'Funds_Raised'],
      dtype='object')
Percentage
 features of new data are Index(['Location', 'Industry', 'Stage', 'Country', 'Funds_Raised',
       'Laid_Off_Count', 'Percentage'],
      dtype='object')


Some weights of the model checkpoint at ./bert_base_uncased/ were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at ./bert_base_uncased/ were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions

0
7           Stage_2
6           Stage_1
3        Location_2
2        Location_1
5        Industry_2
4        Industry_1
1    Laid_Off_Count
0      Funds_Raised
Name: Features, dtype: object
 features used for training Index(['Stage_2', 'Stage_1', 'Location_2', 'Location_1', 'Industry_2',
       'Industry_1', 'Laid_Off_Count', 'Funds_Raised'],
      dtype='object'), test Index(['Stage_2', 'Stage_1', 'Location_2', 'Location_1', 'Industry_2',
       'Industry_1', 'Laid_Off_Count', 'Funds_Raised'],
      dtype='object')


In [20]:
data.to_csv('imputed.csv', index=False)

## Training the model

In [32]:
# data = pd.read_csv('imputed_data.csv')
# data.drop(columns=['Unnamed: 0'],inplace=True)
(data.isnull().sum()/len(df))*100

Location          0.0
Industry          0.0
Stage             0.0
Country           0.0
Funds_Raised      0.0
Laid_Off_Count    0.0
Percentage        0.0
dtype: float64

In [44]:
X= data.drop(columns=['Percentage'])
Y= data['Percentage']
Xtrain,Xtest,Ytrain,Ytest= train_test_split(X,Y, test_size=0.2, random_state=42,shuffle=True)
Xtrain, Xtest = PIPELINE(batch_size=6).cat_encode_pipeline(Xtrain, Xtest)
features = fet_sel.feature_sel(Xtrain,Ytrain,0.06,verbose=True)
print(features)

Some weights of the model checkpoint at ./bert_base_uncased/ were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at ./bert_base_uncased/ were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions

7           Stage_2
6           Stage_1
3        Location_2
2        Location_1
5        Industry_2
4        Industry_1
1    Laid_Off_Count
0      Funds_Raised
Name: Features, dtype: object


In [35]:
features = fet_sel.feature_sel(Xtrain,Ytrain,0.07,verbose=True)
print(features)

3        Location_2
2        Location_1
5        Industry_2
4        Industry_1
1    Laid_Off_Count
0      Funds_Raised
Name: Features, dtype: object


In [45]:
Xtrain = Xtrain[features]
Xtest = Xtest[features]
print(f' features used for training {Xtrain.columns}, test {Xtest.columns}')

 features used for training Index(['Stage_2', 'Stage_1', 'Location_2', 'Location_1', 'Industry_2',
       'Industry_1', 'Laid_Off_Count', 'Funds_Raised'],
      dtype='object'), test Index(['Stage_2', 'Stage_1', 'Location_2', 'Location_1', 'Industry_2',
       'Industry_1', 'Laid_Off_Count', 'Funds_Raised'],
      dtype='object')


In [71]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

RF= RandomForestRegressor(n_estimators=500)
GB= GradientBoostingRegressor(n_estimators=2000)
XG= XGBRegressor(n_estimators=2000)
LR =LinearRegression()

models = [RF,GB,XG,LR]

for model in models: 
    
    model.fit(Xtrain,Ytrain)
    Ypred= model.predict(Xtest)
    mse= mean_squared_error(Ypred,Ytest)
    print(f'mean squared error is {mse}')
#     sn.regplot(x=Ypred, y=Ytest.values)
#     sn.regplot(x=Ypred,y=Ytest.values, fit_reg=True,color='green', marker='x',ci=100)
    fig = px.scatter(x=Ypred, y=Ytest.values, trendline='ols')
    fig.show()
    

RandomForestRegressor(n_estimators=500)

mean squared error is 0.052176869949337616


GradientBoostingRegressor(n_estimators=2000)

mean squared error is 0.06789270414018166


XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
             grow_policy='depthwise', importance_type=None,
             interaction_constraints='', learning_rate=0.300000012, max_bin=256,
             max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
             max_depth=6, max_leaves=0, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=2000, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, ...)

mean squared error is 0.06731231072056212


LinearRegression()

mean squared error is 0.04764021495035241


## hyperparameter tuning

In [67]:
def hyperparameter_tuning(params):
    reg = (RandomForestRegressor(**params))
    cv = RepeatedKFold(n_splits=5, n_repeats=2,random_state=101)
#     acc = cross_validate(reg, Xtrain_norm, Y,scoring='r2', cv=cv)
    acc = cross_val_score(reg, Xtrain, Ytrain,scoring="neg_mean_squared_error",cv=5).mean()
    error_std = cross_val_score(reg, Xtrain, Ytrain,scoring="neg_mean_absolute_percentage_error",cv=5).std()
    
    reg.fit(Xtrain,Ytrain)
    Y_pred=reg.predict(Xtest)
    val_acc= mean_absolute_percentage_error(Ytest,Y_pred)
#     mlflow.log_metric('val_error', val_acc)
#     metrics = mlflow.sklearn.eval_and_log_metrics(MR, Xtest_norm, Ytest, prefix="val_")
    temp={'val_acc':val_acc, 'acc_mean':-acc,'error_std':error_std,'param':params}
    res.append(temp)
    pd.DataFrame(res).to_csv('./HP_tuning_sep_2n.csv', index=False)
#     print(acc)
    return {"loss": -acc, "status": STATUS_OK}

In [68]:
hyperparameter_space = { "n_estimators": hp.choice("n_estimators", [100, 200, 300, 400,500,600]),
                         "max_depth": hp.choice('max_depth',[2,3,4,6]), 
#                          "min_child_weight": hp.choice('min_child_weight',[4,6,8,10,20]),
#                          "subsample": hp.choice('subsample',[0.5,0.75,0.8,0.9,1]),
#                         "lambda": hp.choice('lambda',[0.5,0.75, 1, 1.5,2]),
#                         "alpha": hp.choice('alpha',[0.001, 0.01, 0.1, 0.5,1]),
#                         "eta": hp.choice('eta',[0.01,0.05,0.1,0.15,0.2])
                       }

In [72]:
trials = Trials()
res=[]
argmin = fmin(
  fn=hyperparameter_tuning,
  space=hyperparameter_space,
  algo=tpe.suggest,
  max_evals=250,
  trials=trials,
  verbose=3)

100%|███████████████████████████| 250/250 [17:44<00:00,  4.26s/trial, best loss: 0.050890046854050255]


In [18]:
trials.best_trial

{'state': 2,
 'tid': 222,
 'spec': None,
 'result': {'loss': 644186750776.7102, 'status': 'ok'},
 'misc': {'tid': 222,
  'cmd': ('domain_attachment', 'FMinIter_Domain'),
  'workdir': None,
  'idxs': {'alpha': [222],
   'eta': [222],
   'lambda': [222],
   'max_depth': [222],
   'min_child_weight': [222],
   'n_estimators': [222],
   'subsample': [222]},
  'vals': {'alpha': [0],
   'eta': [4],
   'lambda': [0],
   'max_depth': [3],
   'min_child_weight': [3],
   'n_estimators': [5],
   'subsample': [0]}},
 'exp_key': None,
 'owner': None,
 'version': 0,
 'book_time': datetime.datetime(2022, 12, 21, 5, 43, 35, 980000),
 'refresh_time': datetime.datetime(2022, 12, 21, 5, 43, 45, 503000)}