In [91]:
## TESTING AREA
import pandas as pd
import numpy as np

### function to check column values

In [92]:
birth_list = ['France', 'UK', np.nan, 'UK', 'Spain']
cit_list = ['France', 'UK', 'Switzerland', 'UK', 'Germany']
tups = list(zip(birth_list, cit_list))

In [93]:
df = pd.DataFrame(tups, columns=['birth', 'citizenship'])

In [94]:
df.head()

Unnamed: 0,birth,citizenship
0,France,France
1,UK,UK
2,,Switzerland
3,UK,UK
4,Spain,Germany


In [99]:
def ColValueCheck(col_1, col_2):
    '''Creates a new column from values of two other columns 
    dependent on whether columns have the same value
    
    Inputs: 2 columns from dataframe
    
    Assumptions: Assumes if there is a NaN value, it would be the same
    as the other column value'''
    col_1 = col_1.fillna('None')
    col_2 = col_2.fillna('None')
    
    new_col = []
    
    for idx, value in enumerate(col_1):
        if col_1[idx] == 'None' or col_2[idx] == 'None':
            new_col.append(1)
        elif col_1[idx] == col_2[idx]:
            new_col.append(1)
        else:
            new_col.append(0)
    
    return new_col

In [100]:
df['birth'][1]

'UK'

In [101]:
df['same_diff'] = ColValueCheck(df['birth'], df['citizenship'])

In [102]:
df

Unnamed: 0,birth,citizenship,same_diff
0,France,France,1
1,UK,UK,1
2,,Switzerland,1
3,UK,UK,1
4,Spain,Germany,0


In [348]:
df.columns

Index(['birth', 'citizenship', 'same_diff'], dtype='object')

In [346]:
list(df.columns)

['birth', 'citizenship', 'same_diff']

In [350]:
df.columns = list(map(lambda header : header.upper(), df.columns))

In [351]:
df.columns

Index(['BIRTH', 'CITIZENSHIP', 'SAME_DIFF'], dtype='object')

In [352]:
df.columns.str.replace('_', ' ')

Index(['BIRTH', 'CITIZENSHIP', 'SAME DIFF'], dtype='object')

---

### change states to abbreviations

In [107]:
us_state_abbrev = {
    'alabama': 'AL',
    'alaska': 'AK',
    'arizona': 'AZ',
    'arkansas': 'AR',
    'california': 'CA',
    'colorado': 'CO',
    'connecticut': 'CT',
    'delaware': 'DE',
    'district of columbia' : 'DC',
    'florida': 'FL',
    'georgia': 'GA',
    'hawaii': 'HI',
    'idaho': 'ID',
    'illinois': 'IL',
    'indiana': 'IN',
    'iowa': 'IA',
    'kansas': 'KS',
    'kentucky': 'KY',
    'louisiana': 'LA',
    'maine': 'ME',
    'maryland': 'MD',
    'massachusetts': 'MA',
    'michigan': 'MI',
    'minnesota': 'MN',
    'mississippi': 'MS',
    'missouri': 'MO',
    'montana': 'MT',
    'nebraska': 'NE',
    'nevada': 'NV',
    'new hampshire': 'NH',
    'new jersey': 'NJ',
    'new mexico': 'NM',
    'new york': 'NY',
    'north carolina': 'NC',
    'north dakota': 'ND',
    'ohio': 'OH',
    'oklahoma': 'OK',
    'oregon': 'OR',
    'pennsylvania': 'PA',
    'rhode island': 'RI',
    'south carolina': 'SC',
    'south dakota': 'SD',
    'tennessee': 'TN',
    'texas': 'TX',
    'utah': 'UT',
    'vermont': 'VT',
    'virginia': 'VA',
    'washington': 'WA',
    'west virginia': 'WV',
    'wisconsin': 'WI',
    'wyoming': 'WY',
}

In [109]:
state_list = ['ALABAMA', 'WA', np.nan, 'WASHINGTON', 'OH', 'District of Columbia', 'FL']


In [110]:
state_df = pd.DataFrame(state_list, columns=['state'])

In [125]:
state_df['state']

0                 ALABAMA
1                      WA
2                     NaN
3              WASHINGTON
4                      OH
5    District of Columbia
6                      FL
Name: state, dtype: object

In [140]:
def StateAbbreviation(state_col):
    
    '''Abbreviates any full length state names into their corresponding
    two letter code'''
    
    state_col = state_col.fillna(str('None')) # Fills NaN values with 'None'
    state_col = list(map(str, state_col)) # Make all items a string
    state_col = list(map(lambda state:state.lower(), state_col)) # Makes all items lowercase

    abbrev_list = []
    
    for item in state_col:
        
        if item in us_state_abbrev:
            abbrev_list.append(us_state_abbrev[item].upper())
        else:
            abbrev_list.append(item.upper())

    return abbrev_list

In [141]:
abbrev_list

['AL', 'WA', 'NONE', 'WA', 'OH', 'DC', 'FL']

---

### Separate letter from case number - processing centre

In [146]:
case_numbers = ['A-08271-91262', 'C-08721-94962', 'A-05671-91262', 'A-00941-91462', 'C-00471-93462']

In [147]:
case_df = pd.DataFrame(case_numbers, columns=['case_nos'])

In [148]:
case_df

Unnamed: 0,case_nos
0,A-08271-91262
1,C-08721-94962
2,A-05671-91262
3,A-00941-91462
4,C-00471-93462


In [150]:
NPC = []
for item in case_df['case_nos']:
    NPC.append(str(item)[0])

In [151]:
NPC

['A', 'C', 'A', 'A', 'C']

---

### Fill nans with column mean

In [153]:
random_list = [14,2, np.nan, 54, 23, 65, 3, np.nan, 21, 39]

In [155]:
random_df = pd.DataFrame(random_list, columns=['rand'])
random_df

Unnamed: 0,rand
0,14.0
1,2.0
2,
3,54.0
4,23.0
5,65.0
6,3.0
7,
8,21.0
9,39.0


In [160]:
# fill employer_num_employees - with mean
random_df['rand'] = random_df['rand'].fillna(random_df['rand'].mean(axis=0))
random_df

Unnamed: 0,rand
0,14.0
1,2.0
2,27.625
3,54.0
4,23.0
5,65.0
6,3.0
7,27.625
8,21.0
9,39.0


## Cleaning

### Add fiscal year 

In [163]:
## Do on import

### Case number - extract letter

In [166]:
def ProcessingCenter(df_col):
    '''Extracts the processing centre from case number'''
    nat_pros_center = []

    for item in df_col:
        nat_pros_center.append(str(item)[0])

    return nat_pros_center

In [388]:
def ProcessingCenter2(df_col):
    '''Extracts the processing centre from case number'''
    return str(df_col)[0]

In [389]:
example_df['extracted']=example_df['state'].apply(ProcessingCenter2)

In [391]:
example_df

Unnamed: 0,visa,state,rand,extracted
0,H1b,ALABAMA,14.0,A
1,H1-b,WA,2.0,W
2,H2-a,,,n
3,K-1,WASHINGTON,54.0,W
4,K-1,OH,23.0,O
5,j1,District of Columbia,65.0,D
6,,FL,3.0,F
7,H1-b,Washington,,W
8,J1,Illinois,21.0,I
9,H2,oHio,39.0,o


In [None]:
perm_to_edit['nat_processing_center'] = ProcessingCenter(perm_to_edit['case_number'])
# Drop case number column

### Case status - Y

In [265]:
# Check values of each 
values, counts = np.unique(perm_to_edit['case_number'], return_counts=True)
dict(zip(values, counts))

{'Certified': 3, 'Certified-Expired': 1, 'Denied': 1, 'Withdrawn': 2}

In [386]:
# Remove withdrawn cases 
perm_to_edit = perm_to_edit.drop(perm_to_edit[perm_to_edit['case_number'] == 'Withdrawn'].index).reset_index(drop=True)
perm_to_edit

NameError: name 'perm_to_edit' is not defined

In [None]:
for item in 

In [None]:
## CHANGE TO 1S AND 0S FOR CLASSIFICATION ##

### Clean up class of admission data

In [290]:
# Remove '-' in visa classes e.g. H1-b = H1b
class_admission = ['H1b', 'H1-b', 'H2-a', 'K-1', 'K-1', 'j1', 'j1']

class_admission = list(map(str, class_admission))
class_admission = list(map(lambda x:x.lower(),class_admission))
class_admission = [x.replace('-', '') for x in class_admission]

In [291]:
class_admission

['h1b', 'h1b', 'h2a', 'k1', 'k1', 'j1', 'j1']

In [280]:
class_admission = list(map(str, class_admission))

In [281]:
class_admission

['H1b', 'H1-b', 'H2-a', 'K-1', 'K-1', 'j1', 'j1', '1', '34', '0.2', 'True']

In [285]:
class_admission = list(map(lambda x:x.lower(),class_admission))

In [286]:
class_admission

['h1b', 'h1-b', 'h2-a', 'k-1', 'k-1', 'j1', 'j1', '1', '34', '0.2', 'true']

---

### Fill na values and drop cols not of use anymore

In [335]:
def MergeDrop(dataframe, col_to_keep, col_to_merge):
    '''Merges columns with same data and different headers
    Inputs: 
    Dataframe 
    col_to_keep - string of column name e.g. "column1"
    col_to_merge - string of column name e.g. "column3"
    '''
    
    dataframe[col_to_keep].fillna(dataframe[col_to_merge], inplace = True)
    dataframe.drop(col_to_merge, axis=1)
    return dataframe

---

## Converting lists of columns into new df

In [362]:
class_admission = ['H1b', 'H1-b', 'H2-a', 'K-1', 'K-1', 'j1', np.nan, 'H1-b', 'J1', 'H2']
state_list = ['ALABAMA', 'WA', np.nan, 'WASHINGTON', 'OH', 'District of Columbia', 'FL', 'Washington', 'Illinois', 'oHio']
random_list = [14,2, np.nan, 54, 23, 65, 3, np.nan, 21, 39]

In [363]:
for_df = list(zip(class_admission, state_list, random_list))

In [364]:
example_df = pd.DataFrame(for_df, columns=['visa', 'state', 'rand'])

In [365]:
example_df

Unnamed: 0,visa,state,rand
0,H1b,ALABAMA,14.0
1,H1-b,WA,2.0
2,H2-a,,
3,K-1,WASHINGTON,54.0
4,K-1,OH,23.0
5,j1,District of Columbia,65.0
6,,FL,3.0
7,H1-b,Washington,
8,J1,Illinois,21.0
9,H2,oHio,39.0


In [481]:
example_df['rand'] = list(map(str, example_df['rand']))

In [482]:
nan_num = len(example_df) - example_df['rand'].count()

In [483]:
nan_num

0

In [382]:
new_df_list = []

In [383]:
# append columns to new list
new_df_list.append(example_df['state'])
new_df_list.append(example_df['rand'])

In [384]:
# create new df from list of columns & transpose
new_df = pd.DataFrame(new_df_list)
new_df = new_df.transpose()

In [385]:
new_df

Unnamed: 0,state,rand
0,ALABAMA,14.0
1,WA,2.0
2,,
3,WASHINGTON,54.0
4,OH,23.0
5,District of Columbia,65.0
6,FL,3.0
7,Washington,
8,Illinois,21.0
9,oHio,39.0


In [None]:
# Dealing with pay
# Find out whether it's hourly or salary 
# if hourly - multiply by 2080
# if bi weekly - multiply by 26
# if monthly - multiply by 12
# if weekly - multiply by 52
# if none 

In [474]:
wage = [6.5, '100,000', '500', 7.23, 45.89, 0]
how = ['hourly', 'year', 'bi-weekly', 'hourly', 'HR', 'hour']
wage_df = (pd.DataFrame(list(zip(wage, how)), columns=['wage', 'how']))

In [475]:
wage_df

Unnamed: 0,wage,how
0,6.5,hourly
1,100000.0,year
2,500.0,bi-weekly
3,7.23,hourly
4,45.89,HR
5,0.0,hour


In [476]:
def WageFunction(col1, col2):
    
    # Clean col1
    col1 = list(map(str, col1))
    col1 = [x.replace(',', '') for x in col1]
    col1 = [x.replace('#', '0') for x in col1]
    col1 = list(map(float, col1))
    
    # Clean col2
    col2 = list(map(lambda x:x.lower(), col2))
    col2 = [i[0] for i in col2]
    col2 = list(map(str, col2))
    
    tups = list(zip(col1, col2))
    
    total_salary = []
    
    for idx, value in enumerate(tups):
        if tups[idx][1] == 'h':
            total_salary.append((tups[idx][0]*2080))
        elif tups[idx][1] == 'w':
            total_salary.append((tups[idx][0]*52))
        elif tups[idx][1] == 'b':
            total_salary.append((tups[idx][0]*26))
        elif tups[idx][1] == 'm':
            total_salary.append((tups[idx][0]*12))
        else:
            total_salary.append(tups[idx][0])
    
    rounded_list = [ '%.2f' % elem for elem in total_salary]
    rounded_list = list(map(float, rounded_list))
    rounded_list = [59039 if x == 0 else x for x in rounded_list]
    
    return rounded_list
        

In [477]:
wage_df['year_sal'] = WageFunction(wage_df['wage'], wage_df['how'])

In [478]:
wage_df

Unnamed: 0,wage,how,year_sal
0,6.5,hourly,13520.0
1,100000.0,year,100000.0
2,500.0,bi-weekly,13000.0
3,7.23,hourly,15038.4
4,45.89,HR,95451.2
5,0.0,hour,59039.0


In [466]:
tuples

[13520.0, 100000.0, 13000.0, 15038.4, 95451.2, 0.0]

In [405]:
list_of_tups = [(6.5, 'hourly'), ('100,000', 'year')]

In [440]:
list_of_tups[0][0]

6.5

In [479]:
# Check values
values, counts = np.unique(perm_df['case_status'], return_counts=True)
dict(zip(values, counts))

NameError: name 'perm_df' is not defined