In [1135]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

# from sklearn.experimental import enable_iterative_imputer
# from sklearn.impute import IterativeImputer
# from sklearn.impute import RandomForestRegressor

# Data Imputation Methods

Considering the number of missing values found within the dataset a variety of imputation methods were considered to take care of data sparsity. To address this, key features of the data set were considered including the fact that feetures are highly correlated in addition to whether the data is missing at random, missing completley at random, or missing not at random. 

Missing Completely at Random, MCAR, means there is no relationship between the missingness of the data and any values, observed or missing. Those missing data points are a random subset of the data. There is nothing systematic going on that makes some data more likely to be missing than others.

Missing at Random, MAR, means there is a systematic relationship between the propensity of missing values and the observed data, but not the missing data. Whether an observation is missing has nothing to do with the missing values, but it does have to do with the values of an individual’s observed variables. So, for example, if men are more likely to tell you their weight than women, weight is MAR.

Missing Not at Random, MNAR, means there is a relationship between the propensity of a value to be missing and its values. This is a case where the people with the lowest education are missing on education or the sickest people are most likely to drop out of the study.

## Quick Note:
To get more accurate results Text Data must be Cleaned First by dealing with Typos in Text Data (Counties).

## Identifying Features with missing data and conduct exploratory analysis in relation to the missing data to identify how to address data sparsity

In [1136]:
dfdict = pd.read_excel("../../data/DataDictionary.xlsx")

#rename field
dfdict= dfdict.rename(columns={'Field Type': 'field_type','Field Name':'Feature'})
dfdict.field_type = dfdict.field_type.astype(str)

#remove field not in novice
dfdict=dfdict.drop(index=[1,2]).reset_index(drop=True)
#dfdict.reset_index(drop=True)
dfdict.head()

Unnamed: 0,Field ID,Feature,Short Description,Long Description,field_type,Additional Notes
0,1,Match ID 18Char,Unique ID for each Big/Little Match,A unique identifier for each Big/Little match,ID,
1,4,Stage,Match status,Match status as of 2025-02-27,Categorical,This field deleted from test set
2,5,Little ID,Little's Unique ID,A unique identifier for each Little,ID,
3,6,Big ID,Big's Unique ID,A unique identifier for each Big,ID,
4,7,Big County,County of the Big,County of the Big at time of Match,Text,


In [1137]:

ogdf = pd.read_excel("../../data/Novice.xlsx") #load novice data

df=ogdf.copy()

In [1138]:
df.isna().sum() #count number of NaN values per column

Match ID 18Char                                 0
Stage                                           0
Little ID                                       0
Big ID                                          0
Big County                                    655
                                             ... 
Little Birthdate                                0
Little Mailing Address Census Block Group     966
Big Home Census Block Group                  1005
Big Employer/School Census Block Group       3158
Match Length                                    0
Length: 66, dtype: int64

In [1139]:
df.nunique() #count the number of unique values

Match ID 18Char                              3275
Stage                                           3
Little ID                                    2998
Big ID                                       3080
Big County                                     54
                                             ... 
Little Birthdate                              165
Little Mailing Address Census Block Group     959
Big Home Census Block Group                  1163
Big Employer/School Census Block Group         91
Match Length                                  701
Length: 66, dtype: int64

In [1140]:
#Calculate the percent of missing values
percent_missing = df.isnull().sum() * 100 / len(df)

percent_missing =pd.DataFrame({'Feature':percent_missing.index,'prcnt_missing':percent_missing})
percent_missing.reset_index(drop=True)

Unnamed: 0,Feature,prcnt_missing
0,Match ID 18Char,0.000000
1,Stage,0.000000
2,Little ID,0.000000
3,Big ID,0.000000
4,Big County,20.000000
...,...,...
61,Little Birthdate,0.000000
62,Little Mailing Address Census Block Group,29.496183
63,Big Home Census Block Group,30.687023
64,Big Employer/School Census Block Group,96.427481


In [1141]:
percent_missing = pd.merge(percent_missing, dfdict,on='Feature')
percent_missing =percent_missing.drop(columns=['Field ID','Short Description','Long Description','Additional Notes'])
percent_missing

Unnamed: 0,Feature,prcnt_missing,field_type
0,Match ID 18Char,0.000000,ID
1,Stage,0.000000,Categorical
2,Little ID,0.000000,ID
3,Big ID,0.000000,ID
4,Big County,20.000000,Text
...,...,...,...
60,Little Birthdate,0.000000,Date
61,Little Mailing Address Census Block Group,29.496183,ID
62,Big Home Census Block Group,30.687023,ID
63,Big Employer/School Census Block Group,96.427481,ID


In [1142]:
#non-missing features
percent_missing[percent_missing.prcnt_missing==0]

Unnamed: 0,Feature,prcnt_missing,field_type
0,Match ID 18Char,0.0,ID
1,Stage,0.0,Categorical
2,Little ID,0.0,ID
3,Big ID,0.0,ID
5,Big Age,0.0,Numerical
11,Big Birthdate,0.0,Date
13,Program,0.0,Text
14,Program Type,0.0,Categorical
18,Match Activation Date,0.0,Date
60,Little Birthdate,0.0,Date


In [1143]:

#Identify number of features with Missing Data
missing=percent_missing[percent_missing.prcnt_missing!=0].reset_index(drop=True)
missing.head()

Unnamed: 0,Feature,prcnt_missing,field_type
0,Big County,20.0,Text
1,Big Occupation,9.923664,Categorical
2,Big Approved Date,8.885496,Categorical
3,Big Level of Education,86.259542,Categorical
4,Big Languages,46.167939,"Categorical, List"


In [1144]:
#report the number of unique values for each missing feature and sort from lowest to highest percent missing
nuniq=[]
nmiss=[]
for i in missing.Feature:
    nuniq.append(df[i].nunique())  
    nmiss.append(df[i].isna().sum())
missing['nuniq']=nuniq
missing['nmissing']=nmiss
missing.sort_values(by=['prcnt_missing'],ascending=False)

Unnamed: 0,Feature,prcnt_missing,field_type,nuniq,nmissing
23,Big Contact: Interest Finder - Sports,99.175573,"Categorical, List",23,3248
26,Big Contact: Interest Finder - Entertainment,99.175573,"Categorical, List",20,3248
24,Big Contact: Interest Finder - Places To Go,99.175573,"Categorical, List",11,3248
25,Big Contact: Interest Finder - Hobbies,99.175573,"Categorical, List",22,3248
44,Little Contact: Interest Finder - Other Interests,99.022901,Text,20,3243
19,Big Open to Cross-Gender Match,98.870229,Categorical,2,3238
35,Little Contact: Language(s) Spoken,98.839695,"Categorical, List",3,3237
22,Big Contact: Former Big/Little,98.045802,Categorical,2,3211
48,Little Contact: Interest Finder - Three Wishes,97.801527,Text,41,3203
46,Little Contact: Interest Finder - Career,97.801527,Text,41,3203


In [1145]:
table = pd.pivot_table(missing, values=['nmissing'], index=['field_type'], aggfunc="sum")
table

Unnamed: 0_level_0,nmissing
field_type,Unnamed: 1_level_1
Categorical,22747
"Categorical, List",43243
Date,22705
ID,5129
Numerical,6725
Text,15504


In [1146]:
table = pd.pivot_table(missing, values=['prcnt_missing'], index=['field_type'], aggfunc="median")
table

Unnamed: 0_level_0,prcnt_missing
field_type,Unnamed: 1_level_1
Categorical,58.961832
"Categorical, List",97.557252
Date,58.137405
ID,30.687023
Numerical,73.618321
Text,66.412214


In [1147]:
### Data Imputation across Different Data Types

##### Text Data: (Not Specified)
1. County -> use info on census group
2. Employer -> Not specified?
3. Closure Details/Closure Reason -> Not Specified
4. Rationale for Match -> Not Specified
5. Combine last four in little interest column

In [1148]:
missing[missing["field_type"] == 'Text']

Unnamed: 0,Feature,prcnt_missing,field_type,nuniq,nmissing
0,Big County,20.0,Text,54,655
6,Big Employer,16.824427,Text,1624,551
9,Closure Details,42.473282,Text,1756,1391
11,Rationale for Match,9.129771,Text,2803,299
44,Little Contact: Interest Finder - Other Interests,99.022901,Text,20,3243
45,Little Other Interests,90.351145,Text,276,2959
46,Little Contact: Interest Finder - Career,97.801527,Text,41,3203
48,Little Contact: Interest Finder - Three Wishes,97.801527,Text,41,3203


##### Categorical

In [1149]:
missing[missing["field_type"] == 'Categorical']


Unnamed: 0,Feature,prcnt_missing,field_type,nuniq,nmissing
1,Big Occupation,9.923664,Categorical,87,325
2,Big Approved Date,8.885496,Categorical,1238,291
3,Big Level of Education,86.259542,Categorical,9,2825
5,Big Gender,0.030534,Categorical,6,1
8,Closure Reason,24.0,Categorical,33,786
12,Big Enrollment: Record Type,58.076336,Categorical,5,1902
15,Big Car Access,96.427481,Categorical,3,3158
19,Big Open to Cross-Gender Match,98.870229,Categorical,2,3238
20,Big Re-Enroll,58.076336,Categorical,2,1902
21,Big Contact: Preferred Communication Type,96.122137,Categorical,3,3148


##### Date

In [1150]:
missing[missing["field_type"] == 'Date']

Unnamed: 0,Feature,prcnt_missing,field_type,nuniq,nmissing
10,Match Closure Meeting Date,77.679389,Date,451,2544
13,Big Assessment Uploaded,58.10687,Date,816,1903
14,Big Acceptance Date,58.10687,Date,712,1903
27,Big Contact: Created Date,58.076336,Date,860,1902
28,Big Enrollment: Created Date,58.076336,Date,777,1902
31,Little RTBM Date in MF,92.427481,Date,112,3027
32,Little RTBM in Matchforce,58.137405,Date,2,1904
33,Little Moved to RTBM in MF,58.137405,Date,1,1904
34,Little Application Received,58.137405,Date,766,1904
36,Little Interview Date,58.229008,Date,731,1907


##### ID

In [1151]:
missing[missing["field_type"] == 'ID']

Unnamed: 0,Feature,prcnt_missing,field_type,nuniq,nmissing
51,Little Mailing Address Census Block Group,29.496183,ID,959,966
52,Big Home Census Block Group,30.687023,ID,1163,1005
53,Big Employer/School Census Block Group,96.427481,ID,91,3158


##### Categorical List
1. Combine interests into column
2. use both language and ethnicity to impute each other
3. impute little race/ethnicity from little contaxt languages spoken 
4. impute gender

In [1152]:
missing[missing["field_type"] == 'Categorical, List']

Unnamed: 0,Feature,prcnt_missing,field_type,nuniq,nmissing
4,Big Languages,46.167939,"Categorical, List",29,1512
7,Big Race/Ethnicity,1.007634,"Categorical, List",20,33
23,Big Contact: Interest Finder - Sports,99.175573,"Categorical, List",23,3248
24,Big Contact: Interest Finder - Places To Go,99.175573,"Categorical, List",11,3248
25,Big Contact: Interest Finder - Hobbies,99.175573,"Categorical, List",22,3248
26,Big Contact: Interest Finder - Entertainment,99.175573,"Categorical, List",20,3248
29,Big Contact: Volunteer Availability,91.541985,"Categorical, List",13,2998
35,Little Contact: Language(s) Spoken,98.839695,"Categorical, List",3,3237
38,Little Contact: Interest Finder - Sports,97.557252,"Categorical, List",46,3195
39,Little Contact: Interest Finder - Outdoors,97.557252,"Categorical, List",41,3195


##### Numerical

In [1153]:
missing[missing["field_type"] == 'Numerical']

Unnamed: 0,Feature,prcnt_missing,field_type,nuniq,nmissing
16,Big Days Acceptance to Match,73.618321,Numerical,266,2411
17,Big Days Interview to Acceptance,58.10687,Numerical,173,1903
18,Big Days Interview to Match,73.618321,Numerical,306,2411


## Addressing Missing Data

### STEPS IN DEALING WITH MISSING DATA
0. Drop irrelevant columns
- Drop any columns that are not important for modeling
1. Data Cleaning
- Receode Missingnes
- Check if any missing values can be computed through other values (i.e. county from census code)
- Label missing Text data as missing
2. Imputational Methods:
- Convert Data to Categorical or Numerical
- Perform MICE
- plot distributions of imputed data to assess effectiveness

#### 0. Drop Irrelevant Missing Columns

In [1154]:
#Non-Missing Columns
# note: Convert Program to Categorical variable and use date,Categorical,and numerical to impute (remove ID for data imputation)

# Match ID
# Stage	- Categorical
# Little ID
# Big ID
# Big Age
# Big Birthdate
# Program -	Text
# Program Type -Categorical
# Match Activation Date	- Date
# Little Birthdate	- 	Date
# Match Length	-	Numerical


#Missing Columns
# drop irrelevant text fields
# 	NOT DROPPED: Big County, Rationale for Match
#note: clean county prior to imputing missing data
# fill Rationale for Match missing values with "missing" and remove for data imputation
df=df.drop(columns=["Big Employer","Closure Details"])


#drop interest finder fields
df=df.drop(columns=["Big Contact: Interest Finder - Entertainment","Big Contact: Interest Finder - Hobbies","Big Contact: Interest Finder - Places To Go",
                    "Big Contact: Interest Finder - Sports","Little Contact: Interest Finder - Arts","Little Contact: Interest Finder - Career",
                    "Little Contact: Interest Finder - Entertainment", "Little Contact: Interest Finder - Hobbies", "Little Contact: Interest Finder - Other Interests",
                    "Little Contact: Interest Finder - Outdoors","Little Contact: Interest Finder - Personality","Little Contact: Interest Finder - Places To Go",
                    "Little Contact: Interest Finder - Sports","Little Contact: Interest Finder - Three Wishes","Little Other Interests",
                    ])

#drop irrelevant categorical fields
# NOT DROPPED:  Big Occupation, Big Gender, Big Contact: Marital Status, Closure Reason

df=df.drop(columns=["Big Level of Education",'Big Enrollment: Record Type',"Big Car Access", "Big Open to Cross-Gender Match", "Big Re-Enroll", 
                    "Big Contact: Preferred Communication Type", "Big Contact: Former Big/Little","Big Approved Date","Big: Military"])


#drop irrelevant dates
# NOT DROPPED: 
# note: create Match Closure Date which is computed from the Match length

df=df.drop(columns=["Big Assessment Uploaded","Big Acceptance Date","Big Contact: Created Date","Big Enrollment: Created Date",
                    "Little RTBM Date in MF","Little RTBM in Matchforce","Little Moved to RTBM in MF","Little Application Received","Little Interview Date",
                    "Little Acceptance Date","Match Closure Meeting Date"])

#drop irrelevant IDs
# NOT DROPPED: Little Mailing Address Census Block Group (convert to little county then drop)
# NOT DROPPED: "Big Home Census Block Group" (use to fill missing county values then remove)

df=df.drop(columns=["Big Employer/School Census Block Group"])

#drop irrelevant categorical lists
# NOT DROPPED: Big Languages,Big Race/Ethnicity,Little Gender

df=df.drop(columns=["Big Contact: Volunteer Availability", "Little Contact: Language(s) Spoken"])

#drop irrelevant numerical  fields
# Not Dropped: Days Acceptance to Match,Big Days Interview to Acceptance,Big Days Interview to Match


# FOR NOW DROP GEOGRAPHICAL MARKERS
df=df.drop(columns=["Big County","Little Mailing Address Census Block Group","Big Home Census Block Group"])

pd.options.display.max_columns = None
df.head()


Unnamed: 0,Match ID 18Char,Stage,Little ID,Big ID,Big Age,Big Occupation,Big Languages,Big Gender,Big Birthdate,Program,Program Type,Big Race/Ethnicity,Closure Reason,Match Activation Date,Rationale for Match,Big Days Acceptance to Match,Big Days Interview to Acceptance,Big Days Interview to Match,Big Contact: Marital Status,Little Gender,Little Participant: Race/Ethnicity,Little Birthdate,Match Length
0,a1v2J0000028pRvQAI,Closed,0032J00003PLe29QAD,0032J00003PhDOI,40,Unemployed,No Preference,Female,1985-02-01,General Community,Community,Black or African American;White or Caucasian;,Volunteer: Health,2017-03-03,,,,,,Female,Black or African American,2004-01-01,9.0
1,a1v2J000002uR0JQAU,Closed,0032J00003PfZ6OQAV,0032J00003PgoV1,65,Tech: Research/Design,No Preference,Female,1959-05-01,General Community,Community,White or Caucasian;,Child/Family: Feels incompatible with volunteer,2018-04-12,Their shared interests include spending time o...,,,,,Female,Black or African American; White or Caucasian,2006-06-01,46.1
2,a1v2J0000027NsOQAU,Closed,0032J00003PLeoRQAT,0032J00003Ph0MT,45,Military,Chinese,Male,1979-07-01,General Community,Community,Asian;,Volunteer: Moved,2017-03-23,,,,,,Male,Black or African American; White or Caucasian,2007-01-01,6.2
3,a1v2J0000027dtOQAQ,Active,0032J00003PLeoRQAT,0032J00003Ph14N,61,Finance: Banking,No Preference,Male,1963-11-01,General Community,Community,White or Caucasian;,,2018-01-11,B_first_name and L_first_name were matched bec...,,,,,Male,Black or African American; White or Caucasian,2007-01-01,85.6
4,a1v2J0000028enKQAQ,Closed,0032J00003PfZ6QQAV,0032J00003Ph14j,29,Human Services: Non-Profit,No Preference,Female,1996-01-01,General Community,Community,White or Caucasian;,Child/Family: Moved,2018-04-13,Shared interests like being creative and tryin...,,,,,Female,Hispanic,2005-01-01,28.3


#### 1. Data Cleaning

##### TEXT DATA
Rationale for Match: Convert all NaN values to "Not Specified"

In [1155]:
#Rationale for Match
df['Rationale for Match']=df['Rationale for Match'].fillna('Not Specified')
df.head()

Unnamed: 0,Match ID 18Char,Stage,Little ID,Big ID,Big Age,Big Occupation,Big Languages,Big Gender,Big Birthdate,Program,Program Type,Big Race/Ethnicity,Closure Reason,Match Activation Date,Rationale for Match,Big Days Acceptance to Match,Big Days Interview to Acceptance,Big Days Interview to Match,Big Contact: Marital Status,Little Gender,Little Participant: Race/Ethnicity,Little Birthdate,Match Length
0,a1v2J0000028pRvQAI,Closed,0032J00003PLe29QAD,0032J00003PhDOI,40,Unemployed,No Preference,Female,1985-02-01,General Community,Community,Black or African American;White or Caucasian;,Volunteer: Health,2017-03-03,Not Specified,,,,,Female,Black or African American,2004-01-01,9.0
1,a1v2J000002uR0JQAU,Closed,0032J00003PfZ6OQAV,0032J00003PgoV1,65,Tech: Research/Design,No Preference,Female,1959-05-01,General Community,Community,White or Caucasian;,Child/Family: Feels incompatible with volunteer,2018-04-12,Their shared interests include spending time o...,,,,,Female,Black or African American; White or Caucasian,2006-06-01,46.1
2,a1v2J0000027NsOQAU,Closed,0032J00003PLeoRQAT,0032J00003Ph0MT,45,Military,Chinese,Male,1979-07-01,General Community,Community,Asian;,Volunteer: Moved,2017-03-23,Not Specified,,,,,Male,Black or African American; White or Caucasian,2007-01-01,6.2
3,a1v2J0000027dtOQAQ,Active,0032J00003PLeoRQAT,0032J00003Ph14N,61,Finance: Banking,No Preference,Male,1963-11-01,General Community,Community,White or Caucasian;,,2018-01-11,B_first_name and L_first_name were matched bec...,,,,,Male,Black or African American; White or Caucasian,2007-01-01,85.6
4,a1v2J0000028enKQAQ,Closed,0032J00003PfZ6QQAV,0032J00003Ph14j,29,Human Services: Non-Profit,No Preference,Female,1996-01-01,General Community,Community,White or Caucasian;,Child/Family: Moved,2018-04-13,Shared interests like being creative and tryin...,,,,,Female,Hispanic,2005-01-01,28.3


Big County: Clean County Column (Complete Later)

In [1156]:
#all unique county inputs
# df["Big County"].unique()


In [1157]:
#fix spelling/format errors
# df = df.replace(to_replace=['Hennepin County','Henepin','Hennpin'],value='Hennepin')
# df = df.replace(to_replace=['St. Croix County, WI', 'St. Croix County'],value='St. Croix')
# df = df.replace('New Castle County','New Castle')
# df = df.replace('Ramesy','Ramsey')
# df = df.replace('Rice County','Rice')
# df = df.replace(to_replace=['St. Croix County, WI', 'St. Croix County'],value='St. Croix')


# #remove invalid counties
# invalid_counties=['Testing County', 'MN', 'Outside state', 'mn', 'US', 'United States',
#        'United States of America', 'California', 'Other']

# df = df.replace(to_replace=invalid_counties,value=np.nan)

# df["Big County"].unique()



In [1158]:
# LATER Compute missing counties from census code
# import requests
# import csv

# def getCounties():
#     "Function to return a dict of FIPS codes (keys) of U.S. counties (values)"
#     d = {}
#     r = requests.get("http://www2.census.gov/geo/docs/reference/codes/files/national_county.txt")
#     reader = csv.reader(r.text.splitlines(), delimiter=',')    
#     for line in reader:
#         d[line[1] + line[2]] = line[3].replace(" County","")    
#     return d


##### CATEGORICAL DATA

Closure Reason: If a match is Active write "Still Active"

In [1159]:

df.loc[df['Stage'] == 'Active', 'Closure Reason'] = 'Still Active'

df.head()


Unnamed: 0,Match ID 18Char,Stage,Little ID,Big ID,Big Age,Big Occupation,Big Languages,Big Gender,Big Birthdate,Program,Program Type,Big Race/Ethnicity,Closure Reason,Match Activation Date,Rationale for Match,Big Days Acceptance to Match,Big Days Interview to Acceptance,Big Days Interview to Match,Big Contact: Marital Status,Little Gender,Little Participant: Race/Ethnicity,Little Birthdate,Match Length
0,a1v2J0000028pRvQAI,Closed,0032J00003PLe29QAD,0032J00003PhDOI,40,Unemployed,No Preference,Female,1985-02-01,General Community,Community,Black or African American;White or Caucasian;,Volunteer: Health,2017-03-03,Not Specified,,,,,Female,Black or African American,2004-01-01,9.0
1,a1v2J000002uR0JQAU,Closed,0032J00003PfZ6OQAV,0032J00003PgoV1,65,Tech: Research/Design,No Preference,Female,1959-05-01,General Community,Community,White or Caucasian;,Child/Family: Feels incompatible with volunteer,2018-04-12,Their shared interests include spending time o...,,,,,Female,Black or African American; White or Caucasian,2006-06-01,46.1
2,a1v2J0000027NsOQAU,Closed,0032J00003PLeoRQAT,0032J00003Ph0MT,45,Military,Chinese,Male,1979-07-01,General Community,Community,Asian;,Volunteer: Moved,2017-03-23,Not Specified,,,,,Male,Black or African American; White or Caucasian,2007-01-01,6.2
3,a1v2J0000027dtOQAQ,Active,0032J00003PLeoRQAT,0032J00003Ph14N,61,Finance: Banking,No Preference,Male,1963-11-01,General Community,Community,White or Caucasian;,Still Active,2018-01-11,B_first_name and L_first_name were matched bec...,,,,,Male,Black or African American; White or Caucasian,2007-01-01,85.6
4,a1v2J0000028enKQAQ,Closed,0032J00003PfZ6QQAV,0032J00003Ph14j,29,Human Services: Non-Profit,No Preference,Female,1996-01-01,General Community,Community,White or Caucasian;,Child/Family: Moved,2018-04-13,Shared interests like being creative and tryin...,,,,,Female,Hispanic,2005-01-01,28.3


##### DATES
Match Closure Date: Use match length to create a match closure date

In [1160]:
#LATER

##### IDs

Little Mailing Address Census Block: Use to create Little County

In [1161]:
#LATER

#### 2. Imputational Methods

Remove ID and Text Columns

In [1162]:
df=df.drop(columns=['Match ID 18Char','Little ID','Big ID', 'Rationale for Match'])

Convert all Dates into Numerical Values

In [1163]:
#Convert every Date into a numerical value which denotes a count from a specific start date
from datetime import datetime
from datetime import date
# import datetime                  
Date_columns=['Big Birthdate','Match Activation Date','Little Birthdate']


for col in Date_columns:
    df[col]= pd.to_datetime(df[col])


#create Activation year month date
df['Match Activation Year']=df['Match Activation Date'].dt.year
df['Match Activation Month']=df['Match Activation Date'].dt.month
df['Match Activation Day']=df['Match Activation Date'].dt.day

#calculate little  age
# def calculate_age(born,match):
#     born=datetime.strptime(born,"%Y-%m-%d").date()
#     match=datetime.strptime(match,"%Y-%m-%d").date()
#     return match.year -born.year - ((match.month,match.day)<(born.month,born.day))

# for birth in df['Little Birthday']
# df['Little Age']=calculate_age()

#df.head()


#FOR NOW: DROP DATES
df=df.drop(columns=['Match Activation Date',"Big Birthdate",'Little Birthdate'])
df.head()


Unnamed: 0,Stage,Big Age,Big Occupation,Big Languages,Big Gender,Program,Program Type,Big Race/Ethnicity,Closure Reason,Big Days Acceptance to Match,Big Days Interview to Acceptance,Big Days Interview to Match,Big Contact: Marital Status,Little Gender,Little Participant: Race/Ethnicity,Match Length,Match Activation Year,Match Activation Month,Match Activation Day
0,Closed,40,Unemployed,No Preference,Female,General Community,Community,Black or African American;White or Caucasian;,Volunteer: Health,,,,,Female,Black or African American,9.0,2017,3,3
1,Closed,65,Tech: Research/Design,No Preference,Female,General Community,Community,White or Caucasian;,Child/Family: Feels incompatible with volunteer,,,,,Female,Black or African American; White or Caucasian,46.1,2018,4,12
2,Closed,45,Military,Chinese,Male,General Community,Community,Asian;,Volunteer: Moved,,,,,Male,Black or African American; White or Caucasian,6.2,2017,3,23
3,Active,61,Finance: Banking,No Preference,Male,General Community,Community,White or Caucasian;,Still Active,,,,,Male,Black or African American; White or Caucasian,85.6,2018,1,11
4,Closed,29,Human Services: Non-Profit,No Preference,Female,General Community,Community,White or Caucasian;,Child/Family: Moved,,,,,Female,Hispanic,28.3,2018,4,13


Convert all Categorical Data into Numerical Data

In [1164]:

 
# #Convert Categorical Data into Numerical data
category_cols=['Stage','Big Occupation','Big Languages','Big Gender','Program','Program Type','Big Race/Ethnicity','Closure Reason','Big Contact: Marital Status',
               'Little Gender','Little Participant: Race/Ethnicity','Closure Reason','Match Activation Year', 'Match Activation Month']

# savedVals=pd.DataFrame()

for col in category_cols:
    df[col]=df[col].astype('category')
    # my_list = df[col].cat.codes
    # my_values = df[col].tolist()    
    df[col]=df[col].cat.codes

df = df.replace(to_replace=-1,value=np.nan)


#Change the feature names
df.columns = df.columns.str.replace('[/,:, ]', '_', regex=True)

df.head()

Unnamed: 0,Stage,Big_Age,Big_Occupation,Big_Languages,Big_Gender,Program,Program_Type,Big_Race_Ethnicity,Closure_Reason,Big_Days_Acceptance_to_Match,Big_Days_Interview_to_Acceptance,Big_Days_Interview_to_Match,Big_Contact__Marital_Status,Little_Gender,Little_Participant__Race_Ethnicity,Match_Length,Match_Activation_Year,Match_Activation_Month,Match_Activation_Day
0,1,40,85.0,23.0,0.0,9,0,10.0,25,,,,,0.0,10.0,9.0,0,2,3
1,1,65,78.0,23.0,0.0,9,0,19.0,5,,,,,0.0,16.0,46.1,1,3,12
2,1,45,61.0,7.0,2.0,9,0,5.0,30,,,,,2.0,16.0,6.2,0,2,23
3,0,61,29.0,23.0,2.0,9,0,19.0,20,,,,,2.0,16.0,85.6,1,0,11
4,1,29,40.0,23.0,0.0,9,0,19.0,10,,,,,0.0,17.0,28.3,1,3,13


Conduct Data Imputation

In [1165]:
# Conduct MICE on the dataset 
import miceforest as mf

# Create kernel. 
kds = mf.ImputationKernel(
  df,
  random_state=100
)

# Run the MICE algorithm for 2 iterations
kds.mice(2)

# Return the completed dataset.
df_imputed = kds.complete_data()

df_imputed.head()

Unnamed: 0,Stage,Big_Age,Big_Occupation,Big_Languages,Big_Gender,Program,Program_Type,Big_Race_Ethnicity,Closure_Reason,Big_Days_Acceptance_to_Match,Big_Days_Interview_to_Acceptance,Big_Days_Interview_to_Match,Big_Contact__Marital_Status,Little_Gender,Little_Participant__Race_Ethnicity,Match_Length,Match_Activation_Year,Match_Activation_Month,Match_Activation_Day
0,1,40,85.0,23.0,0.0,9,0,10.0,25,332.0,219.0,105.0,5.0,0.0,10.0,9.0,0,2,3
1,1,65,78.0,23.0,0.0,9,0,19.0,5,64.0,197.0,212.0,3.0,0.0,16.0,46.1,1,3,12
2,1,45,61.0,7.0,2.0,9,0,5.0,30,78.0,78.0,265.0,3.0,2.0,16.0,6.2,0,2,23
3,0,61,29.0,23.0,2.0,9,0,19.0,20,99.0,57.0,128.0,3.0,2.0,16.0,85.6,1,0,11
4,1,29,40.0,23.0,0.0,9,0,19.0,10,55.0,51.0,121.0,6.0,0.0,17.0,28.3,1,3,13


In [1166]:
#convert numerical categories back to categories

Revert Dates, Categorical variables, and add text columns to create final clean dataset

# Merge Imputed Novice Data Set with Training set and create Imputed_Train

Merge the cleaned novice data wit the training set