## Load packages

In [1]:
import pandas as pd
import numpy as np
import random
import re
import os

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"



## Load and inspect debar data 

In [4]:
debar = pd.read_csv("debar.csv", low_memory=False)

debar.head()
debar.shape
debar.info

Unnamed: 0,Name,"City, State",Violation,Duration,Start date,End date
0,J&J Harvesting,"Leads, ND",Failure to respond to audit (partial response),2 years,1/19/2014,1/18/2016
1,"Stahlman Apiaries, Inc","Selby, SD",Failure to respond to audit (partial response),1 year,2/19/2015,2/14/2016
2,Trust Nursery,"Pulaski, NY",Failure to respond to audit (partial response),1 year,3/21/2014,3/20/2015
3,Anton Fertilizer Inc.,"Dighton, KS",Failure to respond to audit (no response),2 years,3/30/2014,3/29/2016
4,"Great Plains Fluid Service, Inc.","Greensburg, KS",Failure to respond to audit (no response),2 years,3/30/2014,3/29/2016


(114, 6)

<bound method DataFrame.info of                                             Name       City, State  \
0                                 J&J Harvesting         Leads, ND   
1                         Stahlman Apiaries, Inc         Selby, SD   
2                                  Trust Nursery       Pulaski, NY   
3                          Anton Fertilizer Inc.       Dighton, KS   
4               Great Plains Fluid Service, Inc.    Greensburg, KS   
..                                           ...               ...   
109                             Dove Creek Farms  Mount Vernon, TX   
110                                Jesus Ledesma      Mulberry, FL   
111                                 Turner Farms         Healy, KS   
112  B & R Harvesting and Paul Cruz (individual)  Collins, Georgia   
113                                  Delia Rojas    Lyons, Georgia   

                                             Violation  Duration Start date  \
0       Failure to respond to audit (partial res

## Make indicator for violation number


In [6]:

#where indicator takes value of viol for first row/potential violation 
#and viol2 if the second row/potential violation

debar["viol_number"] = debar.groupby("Name").cumcount() + 1
debar["viol_number"] = debar["viol_number"].apply(lambda viol: f"viol{viol}" if viol > 1 else "viol")


## Clean up state numbers


In [7]:
#read in state names from a complete name/abbreviation crosswalk
crosswalk_url = 'http://app02.clerk.org/menu/ccis/Help/CCIS%20Codes/state_codes.html'

#create a df with the state names and their code
state_code_names_df = pd.read_html(crosswalk_url)[0]

#change the City, State column datatype to string
debar["City, State"] = debar["City, State"].astype(str)

#Create city and state columns 
debar[["City", "State"]] = debar["City, State"].str.split(", ", expand = True)

#check if a state has more than one violation
grouped_by_state = debar["State"].value_counts()
potential_violations = grouped_by_state[grouped_by_state > 1]

In [6]:
#function to convert improper state names to state codes
def convert_to_state_code(state_name):
    if state_name is None:
        return None
    #clean the input string
    clean_state_name = re.sub(r'[^\w\s]', '', state_name).strip()

    
    if not state_code_names_df["Description"].str.contains(clean_state_name).any():
        return clean_state_name
    return state_code_names_df.loc[state_code_names_df["Description"].str.contains(clean_state_name, case=False), "Code"].iloc[0]
debar.head()
debar["City, State"] = debar["City"] + ", " + debar["State"].apply(convert_to_state_code)
debar.head()

print(debar["City, State"])



Unnamed: 0,Name,"City, State",Violation,Duration,Start date,End date,viol_number,City,State
0,J&J Harvesting,"Leads, ND",Failure to respond to audit (partial response),2 years,1/19/2014,1/18/2016,viol,Leads,ND
1,"Stahlman Apiaries, Inc","Selby, SD",Failure to respond to audit (partial response),1 year,2/19/2015,2/14/2016,viol,Selby,SD
2,Trust Nursery,"Pulaski, NY",Failure to respond to audit (partial response),1 year,3/21/2014,3/20/2015,viol,Pulaski,NY
3,Anton Fertilizer Inc.,"Dighton, KS",Failure to respond to audit (no response),2 years,3/30/2014,3/29/2016,viol,Dighton,KS
4,"Great Plains Fluid Service, Inc.","Greensburg, KS",Failure to respond to audit (no response),2 years,3/30/2014,3/29/2016,viol,Greensburg,KS


Unnamed: 0,Name,"City, State",Violation,Duration,Start date,End date,viol_number,City,State
0,J&J Harvesting,"Leads, ND",Failure to respond to audit (partial response),2 years,1/19/2014,1/18/2016,viol,Leads,ND
1,"Stahlman Apiaries, Inc","Selby, SD",Failure to respond to audit (partial response),1 year,2/19/2015,2/14/2016,viol,Selby,SD
2,Trust Nursery,"Pulaski, NY",Failure to respond to audit (partial response),1 year,3/21/2014,3/20/2015,viol,Pulaski,NY
3,Anton Fertilizer Inc.,"Dighton, KS",Failure to respond to audit (no response),2 years,3/30/2014,3/29/2016,viol,Dighton,KS
4,"Great Plains Fluid Service, Inc.","Greensburg, KS",Failure to respond to audit (no response),2 years,3/30/2014,3/29/2016,viol,Greensburg,KS


0             Leads, ND
1             Selby, SD
2           Pulaski, NY
3           Dighton, KS
4        Greensburg, KS
             ...       
109    Mount Vernon, TX
110        Mulberry, FL
111           Healy, KS
112         Collins, GA
113           Lyons, GA
Name: City, State, Length: 114, dtype: object


##  Create a new column is_repeated in debar, that shows us whether an employer is repeated > 1 times.


In [8]:

#create a new column is_repeated, that indicates whether an employer (Name) is repeated > 1 times
debar["is_repeated"] = debar.duplicated(subset="Name", keep=False)

#Print the rows where is_repeated == True
debar[debar["is_repeated"] == True] 

##Interpretation: Repeated rows indicate cases of multiple violations, or just represent duplicate offenses (double counted because of misspellings, etc). 

#fix case where Altheimer, AR was changed to Altheimer AK 
debar.at[17, "City, State"] = "Altheimer, AR"
#fix case where Brownsville, TX was changed to Brownsfield
debar.at[25, "City, State"] = "Brownsville, TX"

#create a new dataframe for the rows where is_repeated == True
mult_debar = debar[debar["is_repeated"] == True] 

#print mult_debar head and shape
mult_debar.head()
mult_debar.shape



Unnamed: 0,Name,"City, State",Violation,Duration,Start date,End date,viol_number,City,State,is_repeated
6,Annabella Land & Cattle,"Annabella, UT",Non Payment,1 year,5/9/2014,5/9/2015,viol,Annabella,UT,True
7,Autumn Hill Orchard,"Groton, MA",Failure to respond to audit (no response),2 years,7/6/2014,7/5/2016,viol,Groton,MA,True
8,"Caddo Creek Ranch, dba Paradise Ranch","Caddo, TX",Failure to respond to audit (partial response),2 years,7/20/2014,7/19/2016,viol,Caddo,TX,True
11,Loewen Harvesting LLC,"Brownsville, TX",Failure to respond to audit (partial response),1 year,8/20/2014,8/19/2015,viol,Brownsville,TX,True
12,Rollo Farm Labor Contractor,"Miami, FL",Failure to respond to audit (no response),2 years,8/23/2014,8/22/2016,viol,Miami,FL,True
14,Sharon Mathis,"Tifton, GA",Failure to respond to audit (no response),2 years,11/16/2014,11/15/2016,viol,Tifton,GA,True
15,SRT Farms,"Morton, TX",Failure to respond to audit (no response),2 years,11/16/2014,11/15/2016,viol,Morton,TX,True
16,Mark Duncan,"Roosevelt, UT",Failure to respond to audit (no response),2 years,11/16/2014,11/15/2016,viol,Roosevelt,UT,True
17,"Maple Ridge Custom Services, LLC","Altheimer, AK",Failure to respond to audit (partial response),2 years,11/16/2014,11/15/2016,viol,Altheimer,AK,True
18,F&W Farms,"Ingalls, KS",Failure to respond to audit (partial response),2 years,12/10/2014,12/9/2016,viol,Ingalls,KS,True


Unnamed: 0,Name,"City, State",Violation,Duration,Start date,End date,viol_number,City,State,is_repeated
6,Annabella Land & Cattle,"Annabella, UT",Non Payment,1 year,5/9/2014,5/9/2015,viol,Annabella,UT,True
7,Autumn Hill Orchard,"Groton, MA",Failure to respond to audit (no response),2 years,7/6/2014,7/5/2016,viol,Groton,MA,True
8,"Caddo Creek Ranch, dba Paradise Ranch","Caddo, TX",Failure to respond to audit (partial response),2 years,7/20/2014,7/19/2016,viol,Caddo,TX,True
11,Loewen Harvesting LLC,"Brownsville, TX",Failure to respond to audit (partial response),1 year,8/20/2014,8/19/2015,viol,Brownsville,TX,True
12,Rollo Farm Labor Contractor,"Miami, FL",Failure to respond to audit (no response),2 years,8/23/2014,8/22/2016,viol,Miami,FL,True


(32, 10)

## Reshape `mult_debar` to wide format to begin filtering out duplicates 



In [9]:
mult_debar_wide = pd.pivot(mult_debar, index = ["Name","City, State"], columns="viol_number", values = "Start date").reset_index()
mult_debar_wide.rename(columns = {"viol" : "start_date_viol1", "viol2" : "start_date_viol2"}, inplace = True)


## Filter out duplicates from original debar data 


In [10]:
mult_debar_wide["is_dup"] = mult_debar_wide["start_date_viol1"] == mult_debar_wide["start_date_viol2"]
mult_debar_wide

#get list of dupped names
dupped_names = mult_debar_wide.Name[mult_debar_wide.is_dup]

#create a df with only dupped names
dupped_df = debar[debar.Name.isin(dupped_names)].copy()

#keep only dupped names where violnum == viol
dupped_df_to_keep = dupped_df[dupped_df.viol_number == "viol"]

#create a df for all of the not dupped employers
not_dupped = debar[~debar.Name.isin(dupped_names)].copy()


type(not_dupped)
type(dupped_df_to_keep)

debar_clean = pd.concat([dupped_df_to_keep, not_dupped], axis=0)

#delete columns city, state and is_repeated from debar_clean
debar_clean = debar_clean.drop(labels  = ["City", "State", "is_repeated"], axis = 1)

debar_clean

# print the shape and # of unique employer names
debar_clean.shape
debar_clean["Name"].nunique()



viol_number,Name,"City, State",start_date_viol1,start_date_viol2,is_dup
0,Annabella Land & Cattle,"Annabella, UT",5/9/2014,,False
1,Annabella Land & Cattle,"Annabella, Utah",,5/9/2014,False
2,Autumn Hill Orchard,"Groton, MA",7/6/2014,7/6/2014,True
3,"Caddo Creek Ranch, dba Paradise Ranch","Caddo, TX",7/20/2014,,False
4,"Caddo Creek Ranch, dba Paradise Ranch","Caddo, Texas",,7/20/2014,False
5,Cisco Produce Inc.,"Cairo, GA",12/10/2014,12/10/2015,False
6,Dove Creek Farms,"Mount Vernon, TX",2/9/2018,2/9/2018,True
7,F&W Farms,"Ingalls, KS",12/10/2014,12/10/2014,True
8,Loewen Harvesting LLC,"Brownsville, TX",8/20/2014,8/20/2014,True
9,Macky and Brad Farms,"Plains, TX",2/13/2015,2/13/2015,True


pandas.core.frame.DataFrame

pandas.core.frame.DataFrame

Unnamed: 0,Name,"City, State",Violation,Duration,Start date,End date,viol_number
7,Autumn Hill Orchard,"Groton, MA",Failure to respond to audit (no response),2 years,7/6/2014,7/5/2016,viol
11,Loewen Harvesting LLC,"Brownsville, TX",Failure to respond to audit (partial response),1 year,8/20/2014,8/19/2015,viol
12,Rollo Farm Labor Contractor,"Miami, FL",Failure to respond to audit (no response),2 years,8/23/2014,8/22/2016,viol
14,Sharon Mathis,"Tifton, GA",Failure to respond to audit (no response),2 years,11/16/2014,11/15/2016,viol
15,SRT Farms,"Morton, TX",Failure to respond to audit (no response),2 years,11/16/2014,11/15/2016,viol
...,...,...,...,...,...,...,...
107,Walker Place,"Danville, IL",Failure to comply with the employer's obligati...,2 months,11/19/2019,1/26/2020,viol
108,County Fair Farm (company) and Andrew Williams...,"Jefferson, ME",WHD Debarment,3 years,3/8/2017,3/8/2020,viol
110,Jesus Ledesma,"Mulberry, FL",Failure to Respond to Audit Request,2 years,2/8/18,2/8/20,viol
112,B & R Harvesting and Paul Cruz (individual),"Collins, Georgia",WHD Debarment,3 years,4/9/17,4/9/20,viol


(103, 7)

98

## Load data on job postings 

In [12]:
jobs = pd.read_csv("jobs.csv", low_memory=False)

jobs.head
jobs.info
jobs.shape

<bound method NDFrame.head of              CASE_NUMBER                           CASE_STATUS  \
0     H-300-20199-721302      Determination Issued - Withdrawn   
1     H-300-20231-773906  Determination Issued - Certification   
2     H-300-20231-774123  Determination Issued - Certification   
3     H-300-20231-774151  Determination Issued - Certification   
4     H-300-20231-774508  Determination Issued - Certification   
...                  ...                                   ...   
2715  H-300-20351-963307  Determination Issued - Certification   
2716  H-300-20351-963399  Determination Issued - Certification   
2717  H-300-20351-964097  Determination Issued - Certification   
2718  H-300-20351-965435  Determination Issued - Certification   
2719  H-300-20352-967311  Determination Issued - Certification   

                RECEIVED_DATE            DECISION_DATE  \
0     2020-07-17 14:50:40.840  2020-10-01 00:00:00.000   
1     2020-08-20 10:38:15.620  2020-10-01 00:00:00.000   
2  

<bound method DataFrame.info of              CASE_NUMBER                           CASE_STATUS  \
0     H-300-20199-721302      Determination Issued - Withdrawn   
1     H-300-20231-773906  Determination Issued - Certification   
2     H-300-20231-774123  Determination Issued - Certification   
3     H-300-20231-774151  Determination Issued - Certification   
4     H-300-20231-774508  Determination Issued - Certification   
...                  ...                                   ...   
2715  H-300-20351-963307  Determination Issued - Certification   
2716  H-300-20351-963399  Determination Issued - Certification   
2717  H-300-20351-964097  Determination Issued - Certification   
2718  H-300-20351-965435  Determination Issued - Certification   
2719  H-300-20352-967311  Determination Issued - Certification   

                RECEIVED_DATE            DECISION_DATE  \
0     2020-07-17 14:50:40.840  2020-10-01 00:00:00.000   
1     2020-08-20 10:38:15.620  2020-10-01 00:00:00.000   
2

(2720, 138)

##  2.2 Try inner join on employer name  (2 points)

- Use the `EMPLOYER_NAME` field of the `jobs` dataset
- Use the `Name` field of the `debar_clean` dataset 

A. Use pd.merge with an inner join on those fields to see whether there are any exact matches. 

B. If there are exact matches, print the row(s) with exact matches



In [11]:
## your code here
merged_df = pd.merge(debar_clean, jobs, left_on = "Name", right_on = "EMPLOYER_NAME", how = "inner")

## Printing rows with exact matches 

print(merged_df[merged_df["EMPLOYER_NAME"] == merged_df["Name"]])

## Need to change this to debar_clean after finished with question 1

             Name  City, State                         Violation Duration  \
0  Rafael Barajas  Sebring, FL  Non-payment of certification fee   1 year   

  Start date   End date viol_number         CASE_NUMBER  \
0  9/23/2016  9/22/2017        viol  H-300-20287-876656   

                            CASE_STATUS            RECEIVED_DATE  ...  \
0  Determination Issued - Certification  2020-10-20 09:20:32.010  ...   

  ADDENDUM_B_HOUSING_ATTACHED TOTAL_HOUSING_RECORDS MEALS_PROVIDED  \
0                           Y                     3              Y   

  MEALS_CHARGED MEAL_REIMBURSEMENT_MINIMUM MEAL_REIMBURSEMENT_MAXIMUM  \
0         12.68                      12.68                       55.0   

  PHONE_TO_APPLY EMAIL_TO_APPLY               WEBSITE_TO_APPLY  \
0    18632732686            NaN  https://seasonaljobs.dol.gov/   

  TOTAL_ADDENDUM_A_RECORDS  
0                        7  

[1 rows x 145 columns]


## Targeted regex to convert employer_name and name to uppercase, clean up punctuation and clean up employer names

In [13]:
## turn employeer name and name to upper case

jobs["EMPLOYER_NAME_UC"] = [x.upper() for x in jobs["EMPLOYER_NAME"]]

debar_clean["Name_UC"] = [x.upper() for x in debar_clean["Name"]]


In [14]:
## assign the uppercase names back to the data

jobs["EMPLOYER_NAME"] = jobs["EMPLOYER_NAME_UC"]
debar_clean["Name"] = debar_clean["Name_UC"]


In [15]:
## regex pattern to clean company names 

regex = r"(INC|LLC|CO)\."


In [16]:
## clean the names

jobs["name_clean"] = [re.sub(regex, r"\1", x) for x in jobs["EMPLOYER_NAME"]]
debar_clean["name_clean"] =  [re.sub(regex, r"\1", x) for x in debar_clean["Name"]]



## Join jobs and debar again and clean the names again


In [17]:
## Conduct an inner join between `jobs` and `debar_clean` using the `name_clean` column
new_merge = pd.merge(jobs, debar_clean, on = "name_clean", how = "inner")
new_merge

#clean names again 

regex2 = r"(\.)|( LLP)"

debar_clean["name_clean_2"] = [re.sub(regex2, "", x) for x in debar_clean["name_clean"]]


Unnamed: 0,CASE_NUMBER,CASE_STATUS,RECEIVED_DATE,DECISION_DATE,TYPE_OF_EMPLOYER_APPLICATION,H2A_LABOR_CONTRACTOR,NATURE_OF_TEMPORARY_NEED,EMERGENCY_FILING,EMPLOYER_NAME,TRADE_NAME_DBA,...,EMPLOYER_NAME_UC,name_clean,Name,"City, State",Violation,Duration,Start date,End date,viol_number,Name_UC
0,H-300-20287-876656,Determination Issued - Certification,2020-10-20 09:20:32.010,2020-11-09 00:00:00.000,Individual Employer,Y,Seasonal,Y,RAFAEL BARAJAS,,...,RAFAEL BARAJAS,RAFAEL BARAJAS,RAFAEL BARAJAS,"Sebring, Florida",Non-payment of certification fee,1 year,9/23/2016,9/22/2017,viol,RAFAEL BARAJAS


# Use regex to separate companies from individuals 



In [20]:
#regex pattern that
#Captures the pattern that occurs before COMPANY if (COMPANY) is in string
#Captures the pattern that occurs before INDIVIDUAL if (INDIVIDUAL) is also in string
regex_co_ind_pattern = r"(.*)\s\(COMPANY\)\sAND\s(.*)\s\(INDIVIDUAL\)\*"


In [21]:
# your code here
#create co_name and ind_name columns and set them to name_clean 
debar_clean["co_name"] = debar_clean["name_clean"]
debar_clean["ind_name"] = debar_clean["name_clean"]


#Iterate over the name_clean column in debar and use regex to create two new columns in debar_clean
for name in debar_clean["name_clean"]:
    #check if regex pattern returns a result 
    results = re.findall(regex_co_ind_pattern, name)
    if len(results) > 0 :
        #set the co_name column to the proper regex return value
        debar_clean.loc[debar_clean["name_clean"] == name, "co_name"] = results[0][0]
        #set the ind_name column to the proper regex return value
        debar_clean.loc[debar_clean["name_clean"] == name, "ind_name"] = results[0][1]
 