In [1232]:
import pandas as pd
import numpy as np

# from sklearn.experimental import enable_iterative_imputer
# from sklearn.impute import IterativeImputer
# from sklearn.ensemble import RandomForestRegressor

# Data Imputation Methods

Considering the number of missing values found within the dataset, a variety of imputation methods were considered to address data sparsity. To choose among them, key characteristics of the dataset were examined, including the fact that features are highly correlated and whether the data are missing at random, missing completely at random, or missing not at random.

Missing Completely at Random, MCAR, means there is no relationship between the missingness of the data and any values, observed or missing. Those missing data points are a random subset of the data. There is nothing systematic going on that makes some data more likely to be missing than others.

Missing at Random, MAR, means there is a systematic relationship between the propensity of missing values and the observed data, but not the missing data. Whether an observation is missing has nothing to do with the missing values, but it does have to do with the values of an individual’s observed variables. So, for example, if men are more likely to tell you their weight than women, weight is MAR.

Missing Not at Random, MNAR, means there is a relationship between the propensity of a value to be missing and its values. This is a case where the people with the lowest education are missing on education or the sickest people are most likely to drop out of the study.

## Quick Note:
To get more accurate results, text data must be cleaned first by correcting typos in the text fields (e.g., misspelled county names such as "Henepin" or "Ramesy").

## Identifying Features with missing data and conduct exploratory analysis in relation to the missing data to identify how to address data sparsity

In [1233]:
# Load the data dictionary and normalise it in one chain:
#   * rename the two columns used downstream ('Field Type' -> field_type, 'Field Name' -> Feature)
#   * store field_type as strings
#   * drop the dictionary rows at index labels 1 and 2 (fields not present in the Novice data)
#   * renumber the remaining rows from 0
dfdict = (
    pd.read_excel("../../data/DataDictionary.xlsx")
    .rename(columns={'Field Type': 'field_type', 'Field Name': 'Feature'})
    .astype({'field_type': str})
    .drop(index=[1, 2])
    .reset_index(drop=True)
)
dfdict.head()

Unnamed: 0,Field ID,Feature,Short Description,Long Description,field_type,Additional Notes
0,1,Match ID 18Char,Unique ID for each Big/Little Match,A unique identifier for each Big/Little match,ID,
1,4,Stage,Match status,Match status as of 2025-02-27,Categorical,This field deleted from test set
2,5,Little ID,Little's Unique ID,A unique identifier for each Little,ID,
3,6,Big ID,Big's Unique ID,A unique identifier for each Big,ID,
4,7,Big County,County of the Big,County of the Big at time of Match,Text,


In [1234]:

# Read the raw Novice data once and keep it untouched in `ogdf`;
# all later transformations operate on the independent copy `df`.
ogdf = pd.read_excel("../../data/Novice.xlsx")
df = ogdf.copy(deep=True)

In [1235]:
# Number of NaN values in each column
df.isna().sum(axis="index")

Match ID 18Char                                 0
Stage                                           0
Little ID                                       0
Big ID                                          0
Big County                                    655
                                             ... 
Little Birthdate                                0
Little Mailing Address Census Block Group     966
Big Home Census Block Group                  1005
Big Employer/School Census Block Group       3158
Match Length                                    0
Length: 66, dtype: int64

In [1236]:
# Number of distinct (non-missing) values in each column
df.nunique(axis=0, dropna=True)

Match ID 18Char                              3275
Stage                                           3
Little ID                                    2998
Big ID                                       3080
Big County                                     54
                                             ... 
Little Birthdate                              165
Little Mailing Address Census Block Group     959
Big Home Census Block Group                  1163
Big Employer/School Census Block Group         91
Match Length                                  701
Length: 66, dtype: int64

In [1237]:
# Percentage of missing values per column, as a two-column frame
# (Feature, prcnt_missing). The frame itself stays indexed by column name;
# only the displayed copy is renumbered.
percent_missing = (df.isnull().sum() * 100 / len(df)).to_frame('prcnt_missing')
percent_missing.insert(0, 'Feature', percent_missing.index)
percent_missing.reset_index(drop=True)

Unnamed: 0,Feature,prcnt_missing
0,Match ID 18Char,0.000000
1,Stage,0.000000
2,Little ID,0.000000
3,Big ID,0.000000
4,Big County,20.000000
...,...,...
61,Little Birthdate,0.000000
62,Little Mailing Address Census Block Group,29.496183
63,Big Home Census Block Group,30.687023
64,Big Employer/School Census Block Group,96.427481


In [1238]:
# Attach each feature's field_type from the data dictionary, then discard
# the dictionary columns that are not needed for the missingness summary.
percent_missing = (
    percent_missing
    .merge(dfdict, on='Feature')
    .drop(columns=['Field ID', 'Short Description', 'Long Description', 'Additional Notes'])
)
percent_missing

Unnamed: 0,Feature,prcnt_missing,field_type
0,Match ID 18Char,0.000000,ID
1,Stage,0.000000,Categorical
2,Little ID,0.000000,ID
3,Big ID,0.000000,ID
4,Big County,20.000000,Text
...,...,...,...
60,Little Birthdate,0.000000,Date
61,Little Mailing Address Census Block Group,29.496183,ID
62,Big Home Census Block Group,30.687023,ID
63,Big Employer/School Census Block Group,96.427481,ID


In [1239]:
# Features with no missing values at all
percent_missing.loc[percent_missing['prcnt_missing'].eq(0)]

Unnamed: 0,Feature,prcnt_missing,field_type
0,Match ID 18Char,0.0,ID
1,Stage,0.0,Categorical
2,Little ID,0.0,ID
3,Big ID,0.0,ID
5,Big Age,0.0,Numerical
11,Big Birthdate,0.0,Date
13,Program,0.0,Text
14,Program Type,0.0,Categorical
18,Match Activation Date,0.0,Date
60,Little Birthdate,0.0,Date


In [1240]:

# Keep only the features that have at least some missing data
missing = (
    percent_missing
    .loc[percent_missing['prcnt_missing'].ne(0)]
    .reset_index(drop=True)
)
missing.head()

Unnamed: 0,Feature,prcnt_missing,field_type
0,Big County,20.0,Text
1,Big Occupation,9.923664,Categorical
2,Big Approved Date,8.885496,Categorical
3,Big Level of Education,86.259542,Categorical
4,Big Languages,46.167939,"Categorical, List"


In [1241]:
# For every feature with missing data, record how many distinct values it has
# and how many entries are missing, then show the table sorted from the
# highest to the lowest percentage missing.
nuniq = [df[feature].nunique() for feature in missing.Feature]
nmiss = [df[feature].isna().sum() for feature in missing.Feature]
missing['nuniq'] = nuniq
missing['nmissing'] = nmiss
missing.sort_values(by=['prcnt_missing'], ascending=False)

Unnamed: 0,Feature,prcnt_missing,field_type,nuniq,nmissing
23,Big Contact: Interest Finder - Sports,99.175573,"Categorical, List",23,3248
26,Big Contact: Interest Finder - Entertainment,99.175573,"Categorical, List",20,3248
24,Big Contact: Interest Finder - Places To Go,99.175573,"Categorical, List",11,3248
25,Big Contact: Interest Finder - Hobbies,99.175573,"Categorical, List",22,3248
44,Little Contact: Interest Finder - Other Interests,99.022901,Text,20,3243
19,Big Open to Cross-Gender Match,98.870229,Categorical,2,3238
35,Little Contact: Language(s) Spoken,98.839695,"Categorical, List",3,3237
22,Big Contact: Former Big/Little,98.045802,Categorical,2,3211
48,Little Contact: Interest Finder - Three Wishes,97.801527,Text,41,3203
46,Little Contact: Interest Finder - Career,97.801527,Text,41,3203


In [1242]:
# Total number of missing entries per field type
table = missing.pivot_table(values=['nmissing'], index=['field_type'], aggfunc="sum")
table

Unnamed: 0_level_0,nmissing
field_type,Unnamed: 1_level_1
Categorical,22747
"Categorical, List",43243
Date,22705
ID,5129
Numerical,6725
Text,15504


In [1243]:
# Median percentage missing per field type
table = missing.pivot_table(values=['prcnt_missing'], index=['field_type'], aggfunc="median")
table

Unnamed: 0_level_0,prcnt_missing
field_type,Unnamed: 1_level_1
Categorical,58.961832
"Categorical, List",97.557252
Date,58.137405
ID,30.687023
Numerical,73.618321
Text,66.412214


In [1244]:
### Data Imputation across Different Data Types

##### Text Data: (Not Specified)
1. County -> use info on census group
2. Employer -> Not specified?
3. Closure Details/Closure Reason -> Not Specified
4. Rationale for Match -> Not Specified
5. Combine last four in little interest column

In [1245]:
# Features with missing data whose field type is Text
missing.loc[missing["field_type"].eq('Text')]

Unnamed: 0,Feature,prcnt_missing,field_type,nuniq,nmissing
0,Big County,20.0,Text,54,655
6,Big Employer,16.824427,Text,1624,551
9,Closure Details,42.473282,Text,1756,1391
11,Rationale for Match,9.129771,Text,2803,299
44,Little Contact: Interest Finder - Other Interests,99.022901,Text,20,3243
45,Little Other Interests,90.351145,Text,276,2959
46,Little Contact: Interest Finder - Career,97.801527,Text,41,3203
48,Little Contact: Interest Finder - Three Wishes,97.801527,Text,41,3203


##### Categorical
1. Impute all except (Re-Enroll=0)

In [1246]:
# Features with missing data whose field type is Categorical
missing.loc[missing["field_type"].eq('Categorical')]

Unnamed: 0,Feature,prcnt_missing,field_type,nuniq,nmissing
1,Big Occupation,9.923664,Categorical,87,325
2,Big Approved Date,8.885496,Categorical,1238,291
3,Big Level of Education,86.259542,Categorical,9,2825
5,Big Gender,0.030534,Categorical,6,1
8,Closure Reason,24.0,Categorical,33,786
12,Big Enrollment: Record Type,58.076336,Categorical,5,1902
15,Big Car Access,96.427481,Categorical,3,3158
19,Big Open to Cross-Gender Match,98.870229,Categorical,2,3238
20,Big Re-Enroll,58.076336,Categorical,2,1902
21,Big Contact: Preferred Communication Type,96.122137,Categorical,3,3148


##### Date
1. Remove redundant date (Little RTBM Date in MF)

In [1247]:
# Features with missing data whose field type is Date
missing.loc[missing["field_type"].eq('Date')]

Unnamed: 0,Feature,prcnt_missing,field_type,nuniq,nmissing
10,Match Closure Meeting Date,77.679389,Date,451,2544
13,Big Assessment Uploaded,58.10687,Date,816,1903
14,Big Acceptance Date,58.10687,Date,712,1903
27,Big Contact: Created Date,58.076336,Date,860,1902
28,Big Enrollment: Created Date,58.076336,Date,777,1902
31,Little RTBM Date in MF,92.427481,Date,112,3027
32,Little RTBM in Matchforce,58.137405,Date,2,1904
33,Little Moved to RTBM in MF,58.137405,Date,1,1904
34,Little Application Received,58.137405,Date,766,1904
36,Little Interview Date,58.229008,Date,731,1907


##### ID
1. Impute

In [1248]:
# Features with missing data whose field type is ID
missing.loc[missing["field_type"].eq('ID')]

Unnamed: 0,Feature,prcnt_missing,field_type,nuniq,nmissing
51,Little Mailing Address Census Block Group,29.496183,ID,959,966
52,Big Home Census Block Group,30.687023,ID,1163,1005
53,Big Employer/School Census Block Group,96.427481,ID,91,3158


##### Categorical List
1. Combine interests into column
2. use both language and ethnicity to impute each other
3. impute little race/ethnicity from little contact languages spoken
4. impute gender

In [1249]:
# Features with missing data whose field type is "Categorical, List"
missing.loc[missing["field_type"].eq('Categorical, List')]

Unnamed: 0,Feature,prcnt_missing,field_type,nuniq,nmissing
4,Big Languages,46.167939,"Categorical, List",29,1512
7,Big Race/Ethnicity,1.007634,"Categorical, List",20,33
23,Big Contact: Interest Finder - Sports,99.175573,"Categorical, List",23,3248
24,Big Contact: Interest Finder - Places To Go,99.175573,"Categorical, List",11,3248
25,Big Contact: Interest Finder - Hobbies,99.175573,"Categorical, List",22,3248
26,Big Contact: Interest Finder - Entertainment,99.175573,"Categorical, List",20,3248
29,Big Contact: Volunteer Availability,91.541985,"Categorical, List",13,2998
35,Little Contact: Language(s) Spoken,98.839695,"Categorical, List",3,3237
38,Little Contact: Interest Finder - Sports,97.557252,"Categorical, List",46,3195
39,Little Contact: Interest Finder - Outdoors,97.557252,"Categorical, List",41,3195


##### Numerical

In [1250]:
# Features with missing data whose field type is Numerical
missing.loc[missing["field_type"].eq('Numerical')]

Unnamed: 0,Feature,prcnt_missing,field_type,nuniq,nmissing
16,Big Days Acceptance to Match,73.618321,Numerical,266,2411
17,Big Days Interview to Acceptance,58.10687,Numerical,173,1903
18,Big Days Interview to Match,73.618321,Numerical,306,2411


## Addressing Missing Data

### STEPS IN DEALING WITH MISSING DATA
1. Non-Imputation Methods:
- Recode Missingness: Combine Big and Little Contact Interest Finder into one large interest list observation  - Accounts for 15 features with missing data
- Check if any missing values can be computed through other values (i.e. county from census code)
-  DROP ANY WITH MORE THAN 90% Missing
- Label missing Text data as missing
2. Imputational Methods:
- Convert Data to Categorical or Numerical
- Perform MICE
- plot distributions of imputed data to assess effectiveness

#### 1. Non-Imputation Methods
This part primarily deals with text data

In [1251]:
# Combine the Big and Little "Interest Finder" columns into one interest
# string per person, then label missing free-text fields as 'Not Specified'.


def _combine_text_columns(frame, columns, sep="; ", placeholder="Not Specified"):
    """Join the non-missing values of `columns` row by row into one string.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding the columns to combine.
    columns : list of str
        Column names whose values are joined, in the given order.
    sep : str
        Separator placed between values coming from different columns.
        The original cell concatenated the columns with no separator, which
        fused the last item of one column with the first item of the next.
    placeholder : str
        Value used for rows in which every listed column is missing or blank.

    Returns
    -------
    pandas.Series
        One combined string per row, aligned on `frame.index`.
    """
    def _join(values):
        # Skip NaN/None and blank strings so they do not produce empty items.
        parts = [str(value).strip() for value in values if not pd.isna(value)]
        return sep.join(part for part in parts if part)

    joined = pd.Series(
        [_join(row) for row in frame[columns].itertuples(index=False, name=None)],
        index=frame.index,
        dtype=object,
    )
    return joined.replace('', placeholder)


big_interest_cols = [
    "Big Contact: Interest Finder - Entertainment",
    "Big Contact: Interest Finder - Hobbies",
    "Big Contact: Interest Finder - Places To Go",
    "Big Contact: Interest Finder - Sports",
]

little_interest_cols = [
    "Little Contact: Interest Finder - Arts",
    "Little Contact: Interest Finder - Career",
    "Little Contact: Interest Finder - Entertainment",
    "Little Contact: Interest Finder - Hobbies",
    "Little Contact: Interest Finder - Other Interests",
    "Little Contact: Interest Finder - Outdoors",
    "Little Contact: Interest Finder - Personality",
    "Little Contact: Interest Finder - Places To Go",
    "Little Contact: Interest Finder - Sports",
    "Little Contact: Interest Finder - Three Wishes",
    "Little Other Interests",
]

# combine big / little interests (rows with no interests at all -> 'Not Specified')
df['big_interest'] = _combine_text_columns(df, big_interest_cols)
df['little_interest'] = _combine_text_columns(df, little_interest_cols)

# drop the individual interest columns now that they are combined
df = df.drop(columns=big_interest_cols + little_interest_cols)

# dealing with missing text data: by indicating Not Specified
for text_col in ['Big Employer', 'Closure Details', 'Rationale for Match']:
    df[text_col] = df[text_col].fillna('Not Specified')

df.head()


Unnamed: 0,Match ID 18Char,Stage,Little ID,Big ID,Big County,Big Age,Big Occupation,Big: Military,Big Approved Date,Big Level of Education,...,Little Acceptance Date,Little Gender,Little Participant: Race/Ethnicity,Little Birthdate,Little Mailing Address Census Block Group,Big Home Census Block Group,Big Employer/School Census Block Group,Match Length,big_interest,little_interest
0,a1v2J0000028pRvQAI,Closed,0032J00003PLe29QAD,0032J00003PhDOI,Hennepin,40,Unemployed,,NaT,Masters Degree,...,NaT,Female,Black or African American,2004-01-01,270530300000.0,270530200000.0,,9.0,Not Specified,Not Specified
1,a1v2J000002uR0JQAU,Closed,0032J00003PfZ6OQAV,0032J00003PgoV1,Washington,65,Tech: Research/Design,,2018-04-11,,...,NaT,Female,Black or African American; White or Caucasian,2006-06-01,271630700000.0,271630700000.0,,46.1,Not Specified,Not Specified
2,a1v2J0000027NsOQAU,Closed,0032J00003PLeoRQAT,0032J00003Ph0MT,Ramsey,45,Military,,NaT,Bachelors Degree,...,NaT,Male,Black or African American; White or Caucasian,2007-01-01,270030500000.0,271230400000.0,,6.2,Not Specified,Not Specified
3,a1v2J0000027dtOQAQ,Active,0032J00003PLeoRQAT,0032J00003Ph14N,Hennepin,61,Finance: Banking,,2018-01-02,,...,NaT,Male,Black or African American; White or Caucasian,2007-01-01,270030500000.0,,,85.6,Not Specified,Not Specified
4,a1v2J0000028enKQAQ,Closed,0032J00003PfZ6QQAV,0032J00003Ph14j,Ramsey,29,Human Services: Non-Profit,,2018-03-12,Bachelors Degree,...,NaT,Female,Hispanic,2005-01-01,271630700000.0,270531100000.0,,28.3,Not Specified,Not Specified


In [1252]:
# Re-check how much of each combined interest column is still unspecified
# (printed as a percentage of all rows).
for interest_col in ('big_interest', 'little_interest'):
    print((df[interest_col] == 'Not Specified').sum() * 100 / len(df))

99.17557251908397
88.61068702290076


In [1253]:
# Drop a hand-picked set of columns with more than 90% missing values.
# NOTE(review): the missingness summary above also lists
# 'Big Open to Cross-Gender Match', 'Little Contact: Language(s) Spoken' and
# 'Big Contact: Former Big/Little' at >90% missing; they are not dropped
# here -- confirm that keeping them is intentional.
df = df.drop(
    labels=[
        'Big Car Access',
        'Big Contact: Preferred Communication Type',
        'Big Contact: Volunteer Availability',
        'Big Employer/School Census Block Group',
        'Little RTBM Date in MF',
    ],
    axis="columns",
)
df.head()

Unnamed: 0,Match ID 18Char,Stage,Little ID,Big ID,Big County,Big Age,Big Occupation,Big: Military,Big Approved Date,Big Level of Education,...,Little Interview Date,Little Acceptance Date,Little Gender,Little Participant: Race/Ethnicity,Little Birthdate,Little Mailing Address Census Block Group,Big Home Census Block Group,Match Length,big_interest,little_interest
0,a1v2J0000028pRvQAI,Closed,0032J00003PLe29QAD,0032J00003PhDOI,Hennepin,40,Unemployed,,NaT,Masters Degree,...,NaT,NaT,Female,Black or African American,2004-01-01,270530300000.0,270530200000.0,9.0,Not Specified,Not Specified
1,a1v2J000002uR0JQAU,Closed,0032J00003PfZ6OQAV,0032J00003PgoV1,Washington,65,Tech: Research/Design,,2018-04-11,,...,NaT,NaT,Female,Black or African American; White or Caucasian,2006-06-01,271630700000.0,271630700000.0,46.1,Not Specified,Not Specified
2,a1v2J0000027NsOQAU,Closed,0032J00003PLeoRQAT,0032J00003Ph0MT,Ramsey,45,Military,,NaT,Bachelors Degree,...,NaT,NaT,Male,Black or African American; White or Caucasian,2007-01-01,270030500000.0,271230400000.0,6.2,Not Specified,Not Specified
3,a1v2J0000027dtOQAQ,Active,0032J00003PLeoRQAT,0032J00003Ph14N,Hennepin,61,Finance: Banking,,2018-01-02,,...,NaT,NaT,Male,Black or African American; White or Caucasian,2007-01-01,270030500000.0,,85.6,Not Specified,Not Specified
4,a1v2J0000028enKQAQ,Closed,0032J00003PfZ6QQAV,0032J00003Ph14j,Ramsey,29,Human Services: Non-Profit,,2018-03-12,Bachelors Degree,...,NaT,NaT,Female,Hispanic,2005-01-01,271630700000.0,270531100000.0,28.3,Not Specified,Not Specified


In [1254]:
# Dealing with County: before deriving county from the census code,
# inspect how many matches fall in each recorded county value.
count = df.loc[:, "Big County"].value_counts()
count

Big County
Hennepin                    1485
Ramsey                       592
Dakota                       157
Anoka                        139
Washington                    95
Scott                         42
Carver                        27
Wright                        14
Chisago                        8
Isanti                         5
Other                          4
Middlesex                      3
Waukesha                       2
Olmsted                        2
New Castle County              2
Pierce                         2
Sherburne                      2
Dane                           2
St. Croix                      2
Story                          1
King                           1
Okanogan                       1
McLeod                         1
Ozaukee                        1
Lake                           1
Outagamie                      1
Brown                          1
LeSeuer                        1
Hudson                         1
Fond du Lac                    1

In [1255]:
# County values that occur exactly once (candidates for typos / invalid entries)
count.loc[count.eq(1)].index

Index(['Story', 'King', 'Okanogan', 'McLeod', 'Ozaukee', 'Lake', 'Outagamie',
       'Brown', 'LeSeuer', 'Hudson', 'Fond du Lac', 'Freeborn', 'Sawyer',
       'St. Croix County', 'Stearns', 'Marathon', 'Lincoln', 'Polk',
       'Testing County', 'California', 'St. Croix County, WI', 'Fulton', 'mn',
       'US', 'United States', 'United States of America', 'Outside state',
       'Rice', 'Hennepin County', 'Henepin', 'MN', 'Ramesy', 'Hennpin',
       'Sarpy', 'Rice County'],
      dtype='object', name='Big County')

In [1256]:
#remove counties that are invalid
# invalid_counties=['California','Testing County','mn','US','United States','United States of America','Outside state','MN']

In [1257]:
# Dealing with County: compute county from census code


# for i in df['Big Home Census Block Group']:
#     if df['Big County'][i].isna():
#         if '27053' in  df['Big Home Census Block Group'][i]:
#             df['Big County'][i]='____'
#         elif '_____' in  df['Big Home Census Block Group'][i]:
#             df['Big County'][i]='___'
#         elif '____' in  df['Big Home Census Block Group'][i]:
#             df['Big County'][i]='____'

    

#### 2. Imputational Methods
This part deals with categorical, numerical, and dates

In [1258]:
# MISSING DATE DATA
# Replace every date column that contains missing values with a numeric
# "R <column>" feature: the number of days between the date and a fixed
# reference date. Missing dates stay missing in the numeric column.
# NOTE(review): 'Big Enrollment: Created Date' also appears in the Date
# summary above but is not part of this list -- confirm that is intentional.

Date_columns = [
    'Match Closure Meeting Date',
    'Big Assessment Uploaded',
    'Big Acceptance Date',
    'Big Contact: Created Date',
    'Little RTBM in Matchforce',
    'Little Moved to RTBM in MF',
    'Little Application Received',
    'Little Interview Date',
    'Little Acceptance Date',
]

# Reference date for the day counts (you can change this to any date)
reference_date = pd.to_datetime('2017-01-01')

for date_col in Date_columns:
    # 1. parse to datetime, 2. express as whole days since the reference date
    parsed_dates = pd.to_datetime(df[date_col])
    df[f'R {date_col}'] = (parsed_dates - reference_date).dt.days

# Remove the original date columns from df
df = df.drop(columns=Date_columns)


In [1259]:
# NON-MISSING DATE DATA
# Convert each fully populated date column into a numeric "R <column>"
# feature: the number of days since `reference_date`, which is defined in
# the missing-date cell above.
#
# Fix: the original cell computed 'R Match Activation Date' twice and never
# created 'R Little Birthdate', so the Little's birthdate was lost when the
# raw date columns were dropped. Looping over the list keeps the converted
# and dropped columns in sync.

notMissing_Date_columns = ['Big Birthdate', 'Match Activation Date', 'Little Birthdate']

for date_col in notMissing_Date_columns:
    # parse to datetime, then express as whole days since the reference date
    df[f'R {date_col}'] = (pd.to_datetime(df[date_col]) - reference_date).dt.days

# Remove the original date columns from df
df = df.drop(columns=notMissing_Date_columns)


In [1260]:
# Remove columns that cannot be fed to the imputer:
#   * the Match / Little / Big identifiers
#   * free-text columns (the 'Program' column is not dropped here)
text_cols = ['Big Employer', 'Closure Details', 'Rationale for Match', 'big_interest', 'little_interest']

df = df.drop(columns=['Match ID 18Char', 'Little ID', 'Big ID'] + text_cols)

# TODO: convert categorical data (e.g. Big County) into numerical data,
# for example with pd.get_dummies(df, columns=cat_cols)

In [1261]:
#Plot Heat Map (MICE works best for highly correlated predictors)

In [1262]:
# Conduct MICE on the dataset 
# missing_mask = df.isna()
# imputer = IterativeImputer(max_iter=10, random_state=0)
# imputed_values = imputer.fit_transform(df)
# df[missing_mask] = imputed_values[missing_mask]

# Merge Imputed Novice Data Set with Training set and create Imputed_Train