In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
import warnings
warnings.filterwarnings(action = 'ignore')

In [3]:
preprocessed_data2_path = r'D:\H1B project\Final_dataset'
preprocessed_data2 = pd.read_csv(os.path.join(preprocessed_data2_path,'tableau_dataset1.csv'))
pd.set_option("display.max_columns", None)

In [4]:
preprocessed_data2.head()

Unnamed: 0,CASE_STATUS,JOB_TITLE,FULL_TIME_POSITION,EMPLOYMENT_START_DATE,EMPLOYMENT_END_DATE,TOTAL_WORKER_POSITIONS,EMPLOYER_NAME,EMPLOYER_CITY,EMPLOYER_STATE,AGENT_REPRESENTING_EMPLOYER,WORKSITE_STATE,PREVAILING_WAGE,H1B_DEPENDENT,WILLFUL_VIOLATOR,OCCUPATION
0,CERTIFIED,DEVOPS ENGINEER II,Y,2023-03-18,2026-03-17,1,"Insurance Services Office, Inc.",Jersey City,NJ,Y,NJ,109283,N,N,IT
1,CERTIFIED,FE ENGINEER,Y,2023-06-19,2026-06-18,1,Apple Inc.,Cupertino,CA,Y,CA,144248,N,N,ELECTRONICS ENGINEERS
2,CERTIFIED,CONTROL SYS ENGINEER,Y,2023-01-16,2026-01-15,1,Caterpillar Inc,Irving,TX,N,IL,100443,N,N,MECHANICAL ENGINEERS
3,CERTIFIED,SENIOR LECTURER,Y,2023-03-15,2026-03-14,1,University of Texas at Arlington,Arlington,TX,N,TX,47380,N,N,EDUCATION
4,CERTIFIED,PRINCIPAL PAYMENT TECH DEVELOPMENT,Y,2023-01-03,2026-01-02,1,DFS Services LLC,Riverwoods,IL,Y,IL,114067,N,N,IT


In [5]:
preprocessed_data2['EMPLOYMENT_START_DATE'] = pd.to_datetime(preprocessed_data2['EMPLOYMENT_START_DATE'], format = "%Y-%m-%d")
preprocessed_data2['EMPLOYMENT_END_DATE'] = pd.to_datetime(preprocessed_data2['EMPLOYMENT_END_DATE'], format = "%Y-%m-%d")

preprocessed_data2['EMPLOYMENT_DURATION_YEARS'] = ((preprocessed_data2['EMPLOYMENT_END_DATE'] - preprocessed_data2['EMPLOYMENT_START_DATE']).dt.days/365.25).astype('float16')

preprocessed_data2["EMPLOYMENT_DURATION_YEARS"].value_counts()

EMPLOYMENT_DURATION_YEARS
2.998047    1922442
2.996094     558828
3.000000     165716
2.992188      50668
2.990234      12282
             ...   
3.019531          1
3.044922          1
3.189453          1
3.714844          1
3.093750          1
Name: count, Length: 1106, dtype: int64

In [6]:
preprocessed_data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2896711 entries, 0 to 2896710
Data columns (total 16 columns):
 #   Column                       Dtype         
---  ------                       -----         
 0   CASE_STATUS                  object        
 1   JOB_TITLE                    object        
 2   FULL_TIME_POSITION           object        
 3   EMPLOYMENT_START_DATE        datetime64[ns]
 4   EMPLOYMENT_END_DATE          datetime64[ns]
 5   TOTAL_WORKER_POSITIONS       int64         
 6   EMPLOYER_NAME                object        
 7   EMPLOYER_CITY                object        
 8   EMPLOYER_STATE               object        
 9   AGENT_REPRESENTING_EMPLOYER  object        
 10  WORKSITE_STATE               object        
 11  PREVAILING_WAGE              int64         
 12  H1B_DEPENDENT                object        
 13  WILLFUL_VIOLATOR             object        
 14  OCCUPATION                   object        
 15  EMPLOYMENT_DURATION_YEARS    float16       
dtype

### Dropping JOB_TITLE as OCCUPATION column justifies about jobs more as compared to JOB_TITLE
### Dropping EMPLOYER_CITY as EMPLOYER_STATE column is already present.

In [7]:
preprocessed_data2 = preprocessed_data2.drop(['JOB_TITLE','EMPLOYER_CITY','EMPLOYMENT_START_DATE','EMPLOYMENT_END_DATE', 'EMPLOYER_NAME', 'EMPLOYER_STATE'], axis = 1)
preprocessed_data2.head()

Unnamed: 0,CASE_STATUS,FULL_TIME_POSITION,TOTAL_WORKER_POSITIONS,AGENT_REPRESENTING_EMPLOYER,WORKSITE_STATE,PREVAILING_WAGE,H1B_DEPENDENT,WILLFUL_VIOLATOR,OCCUPATION,EMPLOYMENT_DURATION_YEARS
0,CERTIFIED,Y,1,Y,NJ,109283,N,N,IT,2.998047
1,CERTIFIED,Y,1,Y,CA,144248,N,N,ELECTRONICS ENGINEERS,2.998047
2,CERTIFIED,Y,1,N,IL,100443,N,N,MECHANICAL ENGINEERS,2.998047
3,CERTIFIED,Y,1,N,TX,47380,N,N,EDUCATION,2.998047
4,CERTIFIED,Y,1,Y,IL,114067,N,N,IT,2.998047


In [8]:
preprocessed_data2.isnull().sum()

CASE_STATUS                    0
FULL_TIME_POSITION             0
TOTAL_WORKER_POSITIONS         0
AGENT_REPRESENTING_EMPLOYER    0
WORKSITE_STATE                 0
PREVAILING_WAGE                0
H1B_DEPENDENT                  0
WILLFUL_VIOLATOR               0
OCCUPATION                     0
EMPLOYMENT_DURATION_YEARS      0
dtype: int64

In [9]:
object_columns = preprocessed_data2.select_dtypes(include=['object']).columns.tolist()
object_columns = [ column for column in preprocessed_data2.columns if preprocessed_data2[column].dtype == object]

In [10]:
object_columns

['CASE_STATUS',
 'FULL_TIME_POSITION',
 'AGENT_REPRESENTING_EMPLOYER',
 'WORKSITE_STATE',
 'H1B_DEPENDENT',
 'WILLFUL_VIOLATOR',
 'OCCUPATION']

In [11]:
preprocessed_data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2896711 entries, 0 to 2896710
Data columns (total 10 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   CASE_STATUS                  object 
 1   FULL_TIME_POSITION           object 
 2   TOTAL_WORKER_POSITIONS       int64  
 3   AGENT_REPRESENTING_EMPLOYER  object 
 4   WORKSITE_STATE               object 
 5   PREVAILING_WAGE              int64  
 6   H1B_DEPENDENT                object 
 7   WILLFUL_VIOLATOR             object 
 8   OCCUPATION                   object 
 9   EMPLOYMENT_DURATION_YEARS    float16
dtypes: float16(1), int64(2), object(7)
memory usage: 204.4+ MB


In [12]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

for column in object_columns:
    preprocessed_data2[column] = label_encoder.fit_transform(preprocessed_data2[column])

In [13]:
preprocessed_data2.head()

Unnamed: 0,CASE_STATUS,FULL_TIME_POSITION,TOTAL_WORKER_POSITIONS,AGENT_REPRESENTING_EMPLOYER,WORKSITE_STATE,PREVAILING_WAGE,H1B_DEPENDENT,WILLFUL_VIOLATOR,OCCUPATION,EMPLOYMENT_DURATION_YEARS
0,0,1,1,1,34,109283,0,0,14,2.998047
1,0,1,1,1,4,144248,0,0,9,2.998047
2,0,1,1,0,15,100443,0,0,19,2.998047
3,0,1,1,0,48,47380,0,0,7,2.998047
4,0,1,1,1,15,114067,0,0,14,2.998047


In [14]:
preprocessed_data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2896711 entries, 0 to 2896710
Data columns (total 10 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   CASE_STATUS                  int32  
 1   FULL_TIME_POSITION           int32  
 2   TOTAL_WORKER_POSITIONS       int64  
 3   AGENT_REPRESENTING_EMPLOYER  int32  
 4   WORKSITE_STATE               int32  
 5   PREVAILING_WAGE              int64  
 6   H1B_DEPENDENT                int32  
 7   WILLFUL_VIOLATOR             int32  
 8   OCCUPATION                   int32  
 9   EMPLOYMENT_DURATION_YEARS    float16
dtypes: float16(1), int32(7), int64(2)
memory usage: 127.1 MB


In [15]:
preprocessed_data2.corr()["CASE_STATUS"]

CASE_STATUS                    1.000000
FULL_TIME_POSITION            -0.017154
TOTAL_WORKER_POSITIONS        -0.019881
AGENT_REPRESENTING_EMPLOYER   -0.018040
WORKSITE_STATE                -0.009338
PREVAILING_WAGE                0.010832
H1B_DEPENDENT                 -0.027561
WILLFUL_VIOLATOR               0.013267
OCCUPATION                     0.025416
EMPLOYMENT_DURATION_YEARS     -0.048508
Name: CASE_STATUS, dtype: float64

### We have decided to build model on all the above columns 

In [16]:
from sklearn.preprocessing import StandardScaler

In [17]:
scaler = StandardScaler()
wage_scaled = scaler.fit_transform(pd.DataFrame(preprocessed_data2["PREVAILING_WAGE"]))
wage_scaled

array([[ 0.01980852],
       [ 0.08180055],
       [ 0.00413543],
       ...,
       [-0.05756051],
       [-0.05756051],
       [-0.03967475]])

In [18]:
wage_scaled_df = pd.DataFrame(wage_scaled, columns = ['PREVAILING_WAGE_SCALED'])
wage_scaled_df

Unnamed: 0,PREVAILING_WAGE_SCALED
0,0.019809
1,0.081801
2,0.004135
3,-0.089944
4,0.028290
...,...
2896706,-0.072902
2896707,-0.071095
2896708,-0.057561
2896709,-0.057561


In [19]:
preprocessed_data2 = preprocessed_data2.drop(['PREVAILING_WAGE'], axis = 1)
preprocessed_data2 = pd.concat([preprocessed_data2,wage_scaled_df], axis = 1)
preprocessed_data2

Unnamed: 0,CASE_STATUS,FULL_TIME_POSITION,TOTAL_WORKER_POSITIONS,AGENT_REPRESENTING_EMPLOYER,WORKSITE_STATE,H1B_DEPENDENT,WILLFUL_VIOLATOR,OCCUPATION,EMPLOYMENT_DURATION_YEARS,PREVAILING_WAGE_SCALED
0,0,1,1,1,34,0,0,14,2.998047,0.019809
1,0,1,1,1,4,0,0,9,2.998047,0.081801
2,0,1,1,0,15,0,0,19,2.998047,0.004135
3,0,1,1,0,48,0,0,7,2.998047,-0.089944
4,0,1,1,1,15,0,0,14,2.998047,0.028290
...,...,...,...,...,...,...,...,...,...,...
2896706,0,1,1,1,19,0,0,25,2.998047,-0.072902
2896707,0,1,1,1,19,0,0,25,2.998047,-0.071095
2896708,2,1,6,0,9,0,0,25,1.999023,-0.057561
2896709,0,1,6,0,9,0,0,25,2.998047,-0.057561


#### Saving this dataset as preprocessed_dataset2

In [20]:
preprocessed_data_path = r'D:\H1B project\Final_dataset\preprocessed_dataset2.csv'
preprocessed_data2.to_csv(preprocessed_data_path, index = False)

In [21]:
preprocessed_data2['CASE_STATUS'].value_counts()

CASE_STATUS
0    2692035
1     130320
3      58194
2      16162
Name: count, dtype: int64

### Our Data is highly imbalanced. We need to balance data properly to avoid over fitting of the model

#### Points taken into consideration for balancing the data
- No category should not be oversampled by more than 50% of its original count to maintain the authenticity of the data.
- Total count of 3 categories can be kept equal to the count of category 0.
- Data should be shuffeled properly for proper balancing.

In [22]:
df_certified = preprocessed_data2[preprocessed_data2["CASE_STATUS"] == 0]
df_certified

Unnamed: 0,CASE_STATUS,FULL_TIME_POSITION,TOTAL_WORKER_POSITIONS,AGENT_REPRESENTING_EMPLOYER,WORKSITE_STATE,H1B_DEPENDENT,WILLFUL_VIOLATOR,OCCUPATION,EMPLOYMENT_DURATION_YEARS,PREVAILING_WAGE_SCALED
0,0,1,1,1,34,0,0,14,2.998047,0.019809
1,0,1,1,1,4,0,0,9,2.998047,0.081801
2,0,1,1,0,15,0,0,19,2.998047,0.004135
3,0,1,1,0,48,0,0,7,2.998047,-0.089944
4,0,1,1,1,15,0,0,14,2.998047,0.028290
...,...,...,...,...,...,...,...,...,...,...
2896705,0,1,1,1,19,0,0,25,2.998047,-0.071095
2896706,0,1,1,1,19,0,0,25,2.998047,-0.072902
2896707,0,1,1,1,19,0,0,25,2.998047,-0.071095
2896709,0,1,6,0,9,0,0,25,2.998047,-0.057561


In [23]:
from sklearn.utils import resample

df_down_sampled_certified = resample(df_certified, n_samples=250000, random_state=42)
df_down_sampled_certified

Unnamed: 0,CASE_STATUS,FULL_TIME_POSITION,TOTAL_WORKER_POSITIONS,AGENT_REPRESENTING_EMPLOYER,WORKSITE_STATE,H1B_DEPENDENT,WILLFUL_VIOLATOR,OCCUPATION,EMPLOYMENT_DURATION_YEARS,PREVAILING_WAGE_SCALED
2388170,0,1,10,1,5,0,0,2,2.998047,-0.041777
2398350,0,1,1,0,4,1,0,14,2.992188,-0.005547
2537079,0,1,1,1,4,0,0,14,2.998047,0.019736
1832433,0,1,1,1,53,0,0,14,2.996094,0.027074
122329,0,1,1,1,34,0,0,3,2.998047,-0.056381
...,...,...,...,...,...,...,...,...,...,...
1260657,0,1,1,1,4,0,0,9,2.996094,0.105220
113902,0,1,1,1,53,0,0,14,2.998047,0.051820
805933,0,1,1,1,48,0,0,6,2.998047,-0.015188
1365184,0,1,1,1,7,0,0,23,0.613281,-0.025182


In [24]:
df_certified_withdrawn = preprocessed_data2[preprocessed_data2["CASE_STATUS"] == 1]
df_certified_withdrawn

Unnamed: 0,CASE_STATUS,FULL_TIME_POSITION,TOTAL_WORKER_POSITIONS,AGENT_REPRESENTING_EMPLOYER,WORKSITE_STATE,H1B_DEPENDENT,WILLFUL_VIOLATOR,OCCUPATION,EMPLOYMENT_DURATION_YEARS,PREVAILING_WAGE_SCALED
84917,1,1,1,1,48,0,0,3,2.998047,-0.025292
84918,1,1,1,0,45,0,0,7,2.998047,-0.043976
84919,1,1,1,1,50,0,0,8,2.998047,0.014477
84920,1,1,1,0,45,0,0,25,2.998047,-0.089866
84921,1,1,1,1,17,0,0,14,2.998047,-0.002318
...,...,...,...,...,...,...,...,...,...,...
2896510,1,1,1,0,48,0,0,25,2.998047,-0.107567
2896543,1,1,1,1,47,0,0,25,2.998047,-0.104470
2896544,1,1,1,1,5,0,0,25,2.998047,-0.104470
2896554,1,1,1,0,15,0,0,25,3.000000,-0.097132


In [25]:
from sklearn.utils import resample

df_up_sampled_certified_withdrawn = resample(df_certified_withdrawn, n_samples=170000, random_state=42)
df_up_sampled_certified_withdrawn

Unnamed: 0,CASE_STATUS,FULL_TIME_POSITION,TOTAL_WORKER_POSITIONS,AGENT_REPRESENTING_EMPLOYER,WORKSITE_STATE,H1B_DEPENDENT,WILLFUL_VIOLATOR,OCCUPATION,EMPLOYMENT_DURATION_YEARS,PREVAILING_WAGE_SCALED
2704197,1,1,1,1,30,1,0,14,3.000000,-0.031930
226625,1,1,1,1,40,0,0,17,2.998047,-0.041262
85777,1,1,1,1,4,0,0,14,2.957031,-0.063608
2111583,1,1,1,1,48,1,0,14,2.996094,-0.006744
2847795,1,1,1,1,15,0,0,10,2.998047,-0.079577
...,...,...,...,...,...,...,...,...,...,...
769799,1,1,1,1,37,0,0,2,2.998047,-0.011944
1485161,1,1,1,1,48,0,0,14,2.998047,0.051486
747044,1,1,1,1,4,0,0,14,2.996094,0.087627
212045,1,1,1,1,34,1,0,2,2.597656,-0.006228


In [26]:
df_withdrawn = preprocessed_data2[preprocessed_data2['CASE_STATUS'] == 3]
df_withdrawn

Unnamed: 0,CASE_STATUS,FULL_TIME_POSITION,TOTAL_WORKER_POSITIONS,AGENT_REPRESENTING_EMPLOYER,WORKSITE_STATE,H1B_DEPENDENT,WILLFUL_VIOLATOR,OCCUPATION,EMPLOYMENT_DURATION_YEARS,PREVAILING_WAGE_SCALED
93450,3,1,1,1,48,0,0,9,2.998047,0.018665
93451,3,1,1,1,48,0,0,14,2.998047,-0.007759
93452,3,1,1,1,4,1,0,9,2.998047,0.050270
93453,3,1,1,0,38,0,0,14,2.998047,0.008852
93454,3,1,1,1,48,1,0,9,2.998047,0.018665
...,...,...,...,...,...,...,...,...,...,...
2896362,3,1,1,1,55,0,0,25,2.998047,-0.116492
2896549,3,1,1,0,20,0,0,25,2.998047,-0.099564
2896572,3,1,10,1,9,0,0,25,2.001953,-0.103312
2896649,3,0,5,0,9,0,0,25,2.996094,-0.147140


In [27]:
from sklearn.utils import resample

df_up_sampled_withdrawn = resample(df_withdrawn, n_samples=75000, random_state=42)
df_up_sampled_withdrawn

Unnamed: 0,CASE_STATUS,FULL_TIME_POSITION,TOTAL_WORKER_POSITIONS,AGENT_REPRESENTING_EMPLOYER,WORKSITE_STATE,H1B_DEPENDENT,WILLFUL_VIOLATOR,OCCUPATION,EMPLOYMENT_DURATION_YEARS,PREVAILING_WAGE_SCALED
2838613,3,1,1,1,15,0,0,21,2.998047,-0.087322
898999,3,1,1,1,4,0,0,14,2.998047,0.058124
94310,3,1,1,0,47,1,0,14,2.998047,-0.080056
2087956,3,0,1,0,37,0,0,7,2.996094,0.066468
2771977,3,1,1,1,4,0,0,2,3.000000,0.004247
...,...,...,...,...,...,...,...,...,...,...
2003621,3,1,1,1,15,0,0,17,2.996094,-0.046940
394732,3,1,1,0,34,1,0,14,2.998047,0.003213
1906425,3,1,1,1,50,0,0,14,2.998047,-0.027948
991005,3,1,1,1,34,0,0,19,2.998047,0.008598


In [28]:
df_denied = preprocessed_data2[preprocessed_data2['CASE_STATUS'] == 2]
df_denied

Unnamed: 0,CASE_STATUS,FULL_TIME_POSITION,TOTAL_WORKER_POSITIONS,AGENT_REPRESENTING_EMPLOYER,WORKSITE_STATE,H1B_DEPENDENT,WILLFUL_VIOLATOR,OCCUPATION,EMPLOYMENT_DURATION_YEARS,PREVAILING_WAGE_SCALED
93151,2,1,2,0,30,0,0,25,2.998047,-0.138103
93152,2,1,1,1,25,0,0,2,2.998047,-0.033148
93153,2,0,1,1,3,0,0,19,1.999023,-0.091993
93154,2,1,1,0,48,0,0,5,2.998047,-0.040781
93155,2,1,1,0,4,0,0,14,2.998047,0.055897
...,...,...,...,...,...,...,...,...,...,...
2896663,2,1,1,1,14,0,0,25,3.000000,-0.030751
2896670,2,0,1,1,48,0,0,25,3.000000,-0.082701
2896673,2,1,1,1,12,0,0,25,2.998047,-0.038900
2896678,2,1,1,0,15,1,1,25,2.001953,-0.077771


In [29]:
from sklearn.utils import resample

df_up_sampled_denied = resample(df_denied, n_samples=22000, random_state=42)
df_up_sampled_denied

Unnamed: 0,CASE_STATUS,FULL_TIME_POSITION,TOTAL_WORKER_POSITIONS,AGENT_REPRESENTING_EMPLOYER,WORKSITE_STATE,H1B_DEPENDENT,WILLFUL_VIOLATOR,OCCUPATION,EMPLOYMENT_DURATION_YEARS,PREVAILING_WAGE_SCALED
1633887,2,1,1,0,37,0,0,25,2.998047,-0.071115
2881501,2,1,1,1,54,0,0,21,2.998047,0.194831
261748,2,1,1,1,30,0,0,19,2.998047,-0.040965
1313438,2,1,1,1,25,0,0,9,2.996094,0.038359
2591715,2,1,1,1,47,0,0,14,2.998047,-0.033849
...,...,...,...,...,...,...,...,...,...,...
1178723,2,1,1,0,13,0,0,25,2.998047,-0.075005
1941217,2,1,1,0,5,0,0,17,2.996094,0.047689
2426911,2,1,1,0,30,1,0,14,2.998047,-0.050332
527768,2,1,1,1,15,0,0,17,2.998047,-0.078912


In [30]:
balanced_dataset = pd.concat([df_down_sampled_certified, df_up_sampled_certified_withdrawn, df_up_sampled_withdrawn, df_up_sampled_denied])
balanced_dataset

Unnamed: 0,CASE_STATUS,FULL_TIME_POSITION,TOTAL_WORKER_POSITIONS,AGENT_REPRESENTING_EMPLOYER,WORKSITE_STATE,H1B_DEPENDENT,WILLFUL_VIOLATOR,OCCUPATION,EMPLOYMENT_DURATION_YEARS,PREVAILING_WAGE_SCALED
2388170,0,1,10,1,5,0,0,2,2.998047,-0.041777
2398350,0,1,1,0,4,1,0,14,2.992188,-0.005547
2537079,0,1,1,1,4,0,0,14,2.998047,0.019736
1832433,0,1,1,1,53,0,0,14,2.996094,0.027074
122329,0,1,1,1,34,0,0,3,2.998047,-0.056381
...,...,...,...,...,...,...,...,...,...,...
1178723,2,1,1,0,13,0,0,25,2.998047,-0.075005
1941217,2,1,1,0,5,0,0,17,2.996094,0.047689
2426911,2,1,1,0,30,1,0,14,2.998047,-0.050332
527768,2,1,1,1,15,0,0,17,2.998047,-0.078912


#### Shuffling the dataframe for the model to train properly

In [31]:
balanced_dataset = balanced_dataset.sample(frac = 1, random_state = 42)
balanced_dataset = balanced_dataset.reset_index(drop = True)  # Drop = True discards the old index
balanced_dataset[:60]

Unnamed: 0,CASE_STATUS,FULL_TIME_POSITION,TOTAL_WORKER_POSITIONS,AGENT_REPRESENTING_EMPLOYER,WORKSITE_STATE,H1B_DEPENDENT,WILLFUL_VIOLATOR,OCCUPATION,EMPLOYMENT_DURATION_YEARS,PREVAILING_WAGE_SCALED
0,3,1,1,1,4,0,0,9,2.998047,0.096477
1,3,1,1,1,48,0,0,17,2.998047,-0.004162
2,0,1,1,1,30,0,0,21,2.880859,0.196085
3,1,1,1,1,34,0,0,14,2.998047,-0.007738
4,1,1,1,1,48,0,0,14,2.998047,0.001407
5,0,1,1,1,37,0,0,17,2.998047,-0.001618
6,0,1,1,1,4,0,0,14,2.998047,0.082648
7,1,1,1,1,36,0,0,2,2.996094,-0.065674
8,1,1,1,1,34,1,0,14,2.998047,-0.015704
9,0,1,1,0,48,1,0,14,2.998047,0.001407


In [32]:
balanced_dataset

Unnamed: 0,CASE_STATUS,FULL_TIME_POSITION,TOTAL_WORKER_POSITIONS,AGENT_REPRESENTING_EMPLOYER,WORKSITE_STATE,H1B_DEPENDENT,WILLFUL_VIOLATOR,OCCUPATION,EMPLOYMENT_DURATION_YEARS,PREVAILING_WAGE_SCALED
0,3,1,1,1,4,0,0,9,2.998047,0.096477
1,3,1,1,1,48,0,0,17,2.998047,-0.004162
2,0,1,1,1,30,0,0,21,2.880859,0.196085
3,1,1,1,1,34,0,0,14,2.998047,-0.007738
4,1,1,1,1,48,0,0,14,2.998047,0.001407
...,...,...,...,...,...,...,...,...,...,...
516995,1,1,1,1,20,0,0,7,2.996094,-0.051612
516996,1,1,1,0,38,0,0,7,2.996094,0.085132
516997,0,1,1,1,37,0,0,7,2.998047,-0.014380
516998,0,1,1,1,19,0,0,21,2.998047,0.119969


In [33]:
balanced_data_path = r'D:\H1B project\Final_dataset\model_dataset1.csv'
balanced_dataset.to_csv(balanced_data_path, index = False)