In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
import warnings
warnings.filterwarnings(action = 'ignore')

In [3]:
preprocessed_data3_path = r'D:\H1B project\Final_dataset'
preprocessed_data3 = pd.read_csv(os.path.join(preprocessed_data3_path,'preprocessed_dataset2.csv'))
pd.set_option("display.max_columns", None)

In [4]:
preprocessed_data3

Unnamed: 0,CASE_STATUS,FULL_TIME_POSITION,TOTAL_WORKER_POSITIONS,AGENT_REPRESENTING_EMPLOYER,WORKSITE_STATE,H1B_DEPENDENT,WILLFUL_VIOLATOR,OCCUPATION,EMPLOYMENT_DURATION_YEARS,PREVAILING_WAGE_SCALED
0,0,1,1,1,34,0,0,14,2.998,0.019809
1,0,1,1,1,4,0,0,9,2.998,0.081801
2,0,1,1,0,15,0,0,19,2.998,0.004135
3,0,1,1,0,48,0,0,7,2.998,-0.089944
4,0,1,1,1,15,0,0,14,2.998,0.028290
...,...,...,...,...,...,...,...,...,...,...
2896706,0,1,1,1,19,0,0,25,2.998,-0.072902
2896707,0,1,1,1,19,0,0,25,2.998,-0.071095
2896708,2,1,6,0,9,0,0,25,1.999,-0.057561
2896709,0,1,6,0,9,0,0,25,2.998,-0.057561


#### Balncing this data by only reducing category 1 to 2.5 lakh rows and not up-sampling the remaining categories to avoid duplicacy in the data, as resampling creates duplicate rows.

In [5]:
preprocessed_data3['CASE_STATUS'].value_counts()

CASE_STATUS
0    2692035
1     130320
3      58194
2      16162
Name: count, dtype: int64

In [6]:
df_certified = preprocessed_data3[preprocessed_data3["CASE_STATUS"] == 0]
df_certified

Unnamed: 0,CASE_STATUS,FULL_TIME_POSITION,TOTAL_WORKER_POSITIONS,AGENT_REPRESENTING_EMPLOYER,WORKSITE_STATE,H1B_DEPENDENT,WILLFUL_VIOLATOR,OCCUPATION,EMPLOYMENT_DURATION_YEARS,PREVAILING_WAGE_SCALED
0,0,1,1,1,34,0,0,14,2.998,0.019809
1,0,1,1,1,4,0,0,9,2.998,0.081801
2,0,1,1,0,15,0,0,19,2.998,0.004135
3,0,1,1,0,48,0,0,7,2.998,-0.089944
4,0,1,1,1,15,0,0,14,2.998,0.028290
...,...,...,...,...,...,...,...,...,...,...
2896705,0,1,1,1,19,0,0,25,2.998,-0.071095
2896706,0,1,1,1,19,0,0,25,2.998,-0.072902
2896707,0,1,1,1,19,0,0,25,2.998,-0.071095
2896709,0,1,6,0,9,0,0,25,2.998,-0.057561


### Shuffle the data properly before downsampling

In [7]:
df_certified = df_certified.sample(frac = 1, random_state = 42)
df_certified = df_certified.reset_index(drop = True)  # Drop = True discards the old index
df_certified

Unnamed: 0,CASE_STATUS,FULL_TIME_POSITION,TOTAL_WORKER_POSITIONS,AGENT_REPRESENTING_EMPLOYER,WORKSITE_STATE,H1B_DEPENDENT,WILLFUL_VIOLATOR,OCCUPATION,EMPLOYMENT_DURATION_YEARS,PREVAILING_WAGE_SCALED
0,0,1,1,0,34,1,0,2,1.996,-0.042146
1,0,1,1,1,41,1,0,14,2.998,0.021800
2,0,1,1,0,4,1,0,14,2.998,-0.002134
3,0,1,1,1,38,1,0,25,2.998,-0.057451
4,0,1,1,1,15,0,0,2,2.998,0.003472
...,...,...,...,...,...,...,...,...,...,...
2692030,0,1,1,1,34,0,0,3,2.998,-0.056381
2692031,0,1,1,1,53,0,0,14,2.996,0.027074
2692032,0,1,1,1,4,0,0,14,2.998,0.019736
2692033,0,1,1,0,4,1,0,14,2.992,-0.005547


#### Downsampling the certified category

In [8]:
from sklearn.utils import resample

df_down_sampled_certified = resample(df_certified, n_samples=200000, random_state=42)
df_down_sampled_certified

Unnamed: 0,CASE_STATUS,FULL_TIME_POSITION,TOTAL_WORKER_POSITIONS,AGENT_REPRESENTING_EMPLOYER,WORKSITE_STATE,H1B_DEPENDENT,WILLFUL_VIOLATOR,OCCUPATION,EMPLOYMENT_DURATION_YEARS,PREVAILING_WAGE_SCALED
2219110,0,1,1,1,37,1,0,14,2.996,-0.002687
2229084,0,1,1,0,41,1,0,2,2.998,0.007307
2356330,0,1,1,0,10,1,0,14,2.998,-0.031930
1692743,0,1,1,1,15,1,0,14,2.998,-0.050481
110268,0,1,1,1,4,0,0,2,2.550,0.038100
...,...,...,...,...,...,...,...,...,...,...
537631,0,1,1,0,4,1,0,14,2.996,-0.005784
2483853,0,1,1,0,9,1,0,14,2.996,-0.049116
1197213,0,1,1,1,37,0,0,2,2.998,0.021062
2641836,0,1,1,1,4,0,0,8,2.998,0.076822


#### Having dataframe without certified category

In [9]:
df_remaining = preprocessed_data3[preprocessed_data3["CASE_STATUS"] != 0]
df_remaining

Unnamed: 0,CASE_STATUS,FULL_TIME_POSITION,TOTAL_WORKER_POSITIONS,AGENT_REPRESENTING_EMPLOYER,WORKSITE_STATE,H1B_DEPENDENT,WILLFUL_VIOLATOR,OCCUPATION,EMPLOYMENT_DURATION_YEARS,PREVAILING_WAGE_SCALED
84917,1,1,1,1,48,0,0,3,2.998,-0.025292
84918,1,1,1,0,45,0,0,7,2.998,-0.043976
84919,1,1,1,1,50,0,0,8,2.998,0.014477
84920,1,1,1,0,45,0,0,25,2.998,-0.089866
84921,1,1,1,1,17,0,0,14,2.998,-0.002318
...,...,...,...,...,...,...,...,...,...,...
2896670,2,0,1,1,48,0,0,25,3.000,-0.082701
2896673,2,1,1,1,12,0,0,25,2.998,-0.038900
2896678,2,1,1,0,15,1,1,25,2.002,-0.077771
2896685,1,1,1,1,19,0,0,25,2.998,-0.068255


In [10]:
balanced_dataset = pd.concat([df_remaining, df_down_sampled_certified])
balanced_dataset

Unnamed: 0,CASE_STATUS,FULL_TIME_POSITION,TOTAL_WORKER_POSITIONS,AGENT_REPRESENTING_EMPLOYER,WORKSITE_STATE,H1B_DEPENDENT,WILLFUL_VIOLATOR,OCCUPATION,EMPLOYMENT_DURATION_YEARS,PREVAILING_WAGE_SCALED
84917,1,1,1,1,48,0,0,3,2.998,-0.025292
84918,1,1,1,0,45,0,0,7,2.998,-0.043976
84919,1,1,1,1,50,0,0,8,2.998,0.014477
84920,1,1,1,0,45,0,0,25,2.998,-0.089866
84921,1,1,1,1,17,0,0,14,2.998,-0.002318
...,...,...,...,...,...,...,...,...,...,...
537631,0,1,1,0,4,1,0,14,2.996,-0.005784
2483853,0,1,1,0,9,1,0,14,2.996,-0.049116
1197213,0,1,1,1,37,0,0,2,2.998,0.021062
2641836,0,1,1,1,4,0,0,8,2.998,0.076822


### Shuffle data again

In [11]:
balanced_dataset = balanced_dataset.sample(frac = 1, random_state = 42)
balanced_dataset = balanced_dataset.reset_index(drop = True)  # Drop = True discards the old index
balanced_dataset

Unnamed: 0,CASE_STATUS,FULL_TIME_POSITION,TOTAL_WORKER_POSITIONS,AGENT_REPRESENTING_EMPLOYER,WORKSITE_STATE,H1B_DEPENDENT,WILLFUL_VIOLATOR,OCCUPATION,EMPLOYMENT_DURATION_YEARS,PREVAILING_WAGE_SCALED
0,3,1,1,0,4,1,0,2,2.996,-0.044558
1,0,1,1,1,48,0,0,14,2.998,-0.013050
2,0,1,1,0,34,1,0,2,2.998,-0.010063
3,0,1,1,1,4,0,0,14,2.998,0.168132
4,0,1,1,1,2,1,0,14,2.998,-0.038274
...,...,...,...,...,...,...,...,...,...,...
404671,0,1,1,0,54,1,0,14,2.998,-0.037758
404672,0,1,1,1,18,0,0,14,2.998,-0.018176
404673,3,1,1,1,15,1,0,14,2.998,-0.051399
404674,1,1,1,1,36,0,0,8,2.998,-0.040487


In [12]:
balanced_data_path = r'D:\H1B project\Final_dataset\model_dataset2.csv'
balanced_dataset.to_csv(balanced_data_path, index = False)