In [132]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import warnings


from missforest import MissForest

warnings.simplefilter("ignore")

In [133]:
df = pd.read_excel("../../data/Novice.xlsx")
df.head(3)

Unnamed: 0,Match ID 18Char,Stage,Little ID,Big ID,Big County,Big Age,Big Occupation,Big: Military,Big Approved Date,Big Level of Education,...,Little Contact: Interest Finder - Career,Little Contact: Interest Finder - Personality,Little Contact: Interest Finder - Three Wishes,Little Gender,Little Participant: Race/Ethnicity,Little Birthdate,Little Mailing Address Census Block Group,Big Home Census Block Group,Big Employer/School Census Block Group,Match Length
0,a1v2J0000028pRvQAI,Closed,0032J00003PLe29QAD,0032J00003PhDOI,Hennepin,40,Unemployed,,NaT,Masters Degree,...,,,,Female,Black or African American,2004-01-01,270530300000.0,270530200000.0,,9.0
1,a1v2J000002uR0JQAU,Closed,0032J00003PfZ6OQAV,0032J00003PgoV1,Washington,65,Tech: Research/Design,,2018-04-11,,...,,,,Female,Black or African American; White or Caucasian,2006-06-01,271630700000.0,271630700000.0,,46.1
2,a1v2J0000027NsOQAU,Closed,0032J00003PLeoRQAT,0032J00003Ph0MT,Ramsey,45,Military,,NaT,Bachelors Degree,...,,,,Male,Black or African American; White or Caucasian,2007-01-01,270030500000.0,271230400000.0,,6.2


In [134]:
# columns to be modified
# todo: remove all "contact" and use _
# these columns are either not relevant for match length or missing enormous amount of values (might use then for eda)
to_be_deleted = [
    "Big Employer/School Census Block Group", "Big Enrollment: Created Date", "Big Acceptance Date",
    "Big Contact: Created Date","Big Days Acceptance to Match", "Big Days Interview to Acceptance","Big Days Interview to Match",
    "Big Contact: Preferred Communication Type", "Big Assessment Uploaded", "Big Enrollment: Created Date",
    "Big Employer", "Big Employer/School Census Block Group", "Big Approved Date", "Big Home Census Block Group", "Big Enrollment: Record Type",

    "Big Contact: Interest Finder - Entertainment","Big Contact: Interest Finder - Hobbies","Big Contact: Interest Finder - Places To Go",
    "Big Contact: Interest Finder - Sports","Little Contact: Interest Finder - Arts","Little Contact: Interest Finder - Career",
    "Little Contact: Interest Finder - Entertainment", "Little Contact: Interest Finder - Hobbies", "Little Contact: Interest Finder - Other Interests",
    "Little Contact: Interest Finder - Outdoors","Little Contact: Interest Finder - Personality","Little Contact: Interest Finder - Places To Go",
    "Little Contact: Interest Finder - Sports","Little Contact: Interest Finder - Three Wishes","Little Other Interests",
    "Little Contact: Language(s) Spoken", "Big Contact: Former Big/Little", "Big Level of Education", "Big: Military",
    "Big Languages", "Big Car Access", "Big Open to Cross-Gender Match", "Big Contact: Volunteer Availability", "Big Contact: Marital Status", "Big Re-Enroll", "Big County",
                    

    "Big ID", "Little ID", "Stage", "Closure Reason", "Closure Details", "Match Closure Meeting Date", "Big Assessment Uploaded", 
    "Little Mailing Address Census Block Group", "Little Interview Date", "Little Acceptance Date", "Little Application Received", "Little Moved to RTBM in MF",
    "Little RTBM Date in MF", "Little RTBM in Matchforce", "Little Moved to RTBM in MF", "Little Interview Date", "Little Acceptance Date", "Little RTBM in Matchforce",
]

yes_no = [
    "Big Open to Cross-Gender Match", "Big Re-Enroll", "Big Contact: Marital Status", "Big Gender", "Big Military", "Big Car Access",
    "Big Contact: Former Big/Little", 
] 

In [135]:
df.drop(to_be_deleted, axis=1, inplace=True)
df.shape

(3275, 14)

In [136]:
df.columns

Index(['Match ID 18Char', 'Big Age', 'Big Occupation', 'Big Gender',
       'Big Birthdate', 'Program', 'Program Type', 'Big Race/Ethnicity',
       'Match Activation Date', 'Rationale for Match', 'Little Gender',
       'Little Participant: Race/Ethnicity', 'Little Birthdate',
       'Match Length'],
      dtype='object')

In [137]:
df.isna().sum()

Match ID 18Char                         0
Big Age                                 0
Big Occupation                        325
Big Gender                              1
Big Birthdate                           0
Program                                 0
Program Type                            0
Big Race/Ethnicity                     33
Match Activation Date                   0
Rationale for Match                   299
Little Gender                           1
Little Participant: Race/Ethnicity    105
Little Birthdate                        0
Match Length                            0
dtype: int64

In [138]:
df.dtypes

Match ID 18Char                               object
Big Age                                        int64
Big Occupation                                object
Big Gender                                    object
Big Birthdate                         datetime64[ns]
Program                                       object
Program Type                                  object
Big Race/Ethnicity                            object
Match Activation Date                 datetime64[ns]
Rationale for Match                           object
Little Gender                                 object
Little Participant: Race/Ethnicity            object
Little Birthdate                      datetime64[ns]
Match Length                                 float64
dtype: object

In [139]:
# convert all column to lower case
df.columns = (
    df.columns.str.strip()  # Remove leading/trailing spaces
               .str.replace(':', '_', regex=True)  # Replace colons (`:`) with underscores
               .str.replace(r'\s+', '_', regex=True)  # Replace any spaces with underscores
)
df.columns = [
    x.lower().\
        replace("contact: ", "").\
        replace("finder - ", "").\
        replace("/","_").\
        replace("_18char","").\
        replace(" ", "_")
    for x in df.columns]
df.columns

Index(['match_id', 'big_age', 'big_occupation', 'big_gender', 'big_birthdate',
       'program', 'program_type', 'big_race_ethnicity',
       'match_activation_date', 'rationale_for_match', 'little_gender',
       'little_participant__race_ethnicity', 'little_birthdate',
       'match_length'],
      dtype='object')

In [140]:
# Identify categorical and datetime columns
categorical_cols = df.select_dtypes(include=['object']).columns
datetime_cols = df.select_dtypes(include=['datetime64']).columns

# Store the datetime column separately
datetime_df = df[datetime_cols]

# Drop the datetime column before imputation
df = df.drop(columns=datetime_cols)

# Convert categorical columns to 'category' dtype
for col in categorical_cols:
    df[col] = df[col].astype('category')

# Save category mappings for conversion after imputation
category_mappings = {col: dict(enumerate(df[col].cat.categories)) for col in categorical_cols}

# Convert categorical columns to numerical codes
for col in categorical_cols:
    df[col] = df[col].cat.codes
    df[col].replace(-1, np.nan, inplace=True)  # Keep missing values as NaN

# Initialize MissForest imputer
imputer = MissForest()

# Perform imputation
imputed_df = imputer.fit_transform(df)

# Convert back to dfFrame
imputed_df = pd.DataFrame(imputed_df, columns=df.columns)

# Convert categorical columns back to original categories
for col in categorical_cols:
    imputed_df[col] = imputed_df[col].round().astype(int)
    imputed_df[col] = imputed_df[col].map(category_mappings[col])

# Add the datetime column back
imputed_df = pd.concat([imputed_df, datetime_df.reset_index(drop=True)], axis=1)

  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:43<00:00,  8.70s/it]
100%|██████████| 5/5 [00:00<00:00, 35.74it/s]


In [141]:
imputed_df.isna().sum()

match_id                              0
big_age                               0
big_occupation                        0
big_gender                            0
program                               0
program_type                          0
big_race_ethnicity                    2
rationale_for_match                   0
little_gender                         0
little_participant__race_ethnicity    0
match_length                          0
big_birthdate                         0
match_activation_date                 0
little_birthdate                      0
dtype: int64

In [142]:
imputed_df.dtypes

match_id                                      object
big_age                                        int64
big_occupation                                object
big_gender                                    object
program                                       object
program_type                                  object
big_race_ethnicity                            object
rationale_for_match                           object
little_gender                                 object
little_participant__race_ethnicity            object
match_length                                 float64
big_birthdate                         datetime64[ns]
match_activation_date                 datetime64[ns]
little_birthdate                      datetime64[ns]
dtype: object

In [143]:
imputed_df['big_race_ethnicity'].dropna(inplace=True, axis=0)
imputed_df.shape

(3275, 14)

In [144]:
imputed_df.head()

Unnamed: 0,match_id,big_age,big_occupation,big_gender,program,program_type,big_race_ethnicity,rationale_for_match,little_gender,little_participant__race_ethnicity,match_length,big_birthdate,match_activation_date,little_birthdate
0,a1v2J0000028pRvQAI,40,Unemployed,Female,General Community,Community,Black or African American;White or Caucasian;,"Distance: 2.3 mi 9 min. Shared Traits: nice, e...",Female,Black or African American,9.0,1985-02-01,2017-03-03,2004-01-01
1,a1v2J000002uR0JQAU,65,Tech: Research/Design,Female,General Community,Community,White or Caucasian;,Their shared interests include spending time o...,Female,Black or African American; White or Caucasian,46.1,1959-05-01,2018-04-12,2006-06-01
2,a1v2J0000027NsOQAU,45,Military,Male,General Community,Community,Asian;,Both B_first_name and L_first_name have positi...,Male,Black or African American; White or Caucasian,6.2,1979-07-01,2017-03-23,2007-01-01
3,a1v2J0000027dtOQAQ,61,Finance: Banking,Male,General Community,Community,White or Caucasian;,B_first_name and L_first_name were matched bec...,Male,Black or African American; White or Caucasian,85.6,1963-11-01,2018-01-11,2007-01-01
4,a1v2J0000028enKQAQ,29,Human Services: Non-Profit,Female,General Community,Community,White or Caucasian;,Shared interests like being creative and tryin...,Female,Hispanic,28.3,1996-01-01,2018-04-13,2005-01-01


In [145]:
imputed_df.to_csv("../../data/interim/imputed_df.xlsx", index=False)

In [146]:
data = pd.read_excel("../../data/Training.xlsx")

In [147]:
data.head()

Unnamed: 0,Match ID 18Char,Completion Date,Match Support Contact Notes,Stage,Little ID,Big ID,Big County,Big Age,Big Occupation,Big: Military,...,Little Contact: Interest Finder - Career,Little Contact: Interest Finder - Personality,Little Contact: Interest Finder - Three Wishes,Little Gender,Little Participant: Race/Ethnicity,Little Birthdate,Little Mailing Address Census Block Group,Big Home Census Block Group,Big Employer/School Census Block Group,Match Length
0,a1v2J0000028pRvQAI,2017-11-30,Question: Activities: Answer: See ms...,Closed,0032J00003PLe29QAD,0032J00003PhDOI,Hennepin,40,Unemployed,,...,,,,Female,Black or African American,2004-01-01,270530300000.0,270530200000.0,,9.0
1,a1v2J0000028pRvQAI,2017-10-31,Question: Activities: Answer: See MS...,Closed,0032J00003PLe29QAD,0032J00003PhDOI,Hennepin,40,Unemployed,,...,,,,Female,Black or African American,2004-01-01,270530300000.0,270530200000.0,,9.0
2,a1v2J0000028pRvQAI,2017-12-01,Question: Activities: Answer: Match ...,Closed,0032J00003PLe29QAD,0032J00003PhDOI,Hennepin,40,Unemployed,,...,,,,Female,Black or African American,2004-01-01,270530300000.0,270530200000.0,,9.0
3,a1v2J0000028pRvQAI,2017-05-29,Question: Activities: Answer: Msc as...,Closed,0032J00003PLe29QAD,0032J00003PhDOI,Hennepin,40,Unemployed,,...,,,,Female,Black or African American,2004-01-01,270530300000.0,270530200000.0,,9.0
4,a1v2J0000028pRvQAI,2017-04-30,Question: Activities: Answer: Msc as...,Closed,0032J00003PLe29QAD,0032J00003PhDOI,Hennepin,40,Unemployed,,...,,,,Female,Black or African American,2004-01-01,270530300000.0,270530200000.0,,9.0


In [156]:
sample = data[['Match ID 18Char', "Completion Date", "Match Support Contact Notes"]]
sample.rename({
    'Match ID 18Char': "match_id",
    'Completion Date':'completion_date',
    "Match Support Contact Notes":"contact_notes"
},axis=1, inplace=True)
sample.head()

Unnamed: 0,match_id,completion_date,contact_notes
0,a1v2J0000028pRvQAI,2017-11-30,Question: Activities: Answer: See ms...
1,a1v2J0000028pRvQAI,2017-10-31,Question: Activities: Answer: See MS...
2,a1v2J0000028pRvQAI,2017-12-01,Question: Activities: Answer: Match ...
3,a1v2J0000028pRvQAI,2017-05-29,Question: Activities: Answer: Msc as...
4,a1v2J0000028pRvQAI,2017-04-30,Question: Activities: Answer: Msc as...


In [157]:
sample.isna().sum()

match_id              0
completion_date     147
contact_notes      1454
dtype: int64

In [158]:
sample.dropna(axis=0, inplace=True)


In [159]:
sample.isna().sum()

match_id           0
completion_date    0
contact_notes      0
dtype: int64

In [160]:
join_df = pd.merge(sample, imputed_df, how='left', on='match_id')

In [161]:
join_df.head()

Unnamed: 0,match_id,completion_date,contact_notes,big_age,big_occupation,big_gender,program,program_type,big_race_ethnicity,rationale_for_match,little_gender,little_participant__race_ethnicity,match_length,big_birthdate,match_activation_date,little_birthdate
0,a1v2J0000028pRvQAI,2017-11-30,Question: Activities: Answer: See ms...,40,Unemployed,Female,General Community,Community,Black or African American;White or Caucasian;,"Distance: 2.3 mi 9 min. Shared Traits: nice, e...",Female,Black or African American,9.0,1985-02-01,2017-03-03,2004-01-01
1,a1v2J0000028pRvQAI,2017-10-31,Question: Activities: Answer: See MS...,40,Unemployed,Female,General Community,Community,Black or African American;White or Caucasian;,"Distance: 2.3 mi 9 min. Shared Traits: nice, e...",Female,Black or African American,9.0,1985-02-01,2017-03-03,2004-01-01
2,a1v2J0000028pRvQAI,2017-12-01,Question: Activities: Answer: Match ...,40,Unemployed,Female,General Community,Community,Black or African American;White or Caucasian;,"Distance: 2.3 mi 9 min. Shared Traits: nice, e...",Female,Black or African American,9.0,1985-02-01,2017-03-03,2004-01-01
3,a1v2J0000028pRvQAI,2017-05-29,Question: Activities: Answer: Msc as...,40,Unemployed,Female,General Community,Community,Black or African American;White or Caucasian;,"Distance: 2.3 mi 9 min. Shared Traits: nice, e...",Female,Black or African American,9.0,1985-02-01,2017-03-03,2004-01-01
4,a1v2J0000028pRvQAI,2017-04-30,Question: Activities: Answer: Msc as...,40,Unemployed,Female,General Community,Community,Black or African American;White or Caucasian;,"Distance: 2.3 mi 9 min. Shared Traits: nice, e...",Female,Black or African American,9.0,1985-02-01,2017-03-03,2004-01-01


In [163]:
join_df.isna().sum()

match_id                              0
completion_date                       0
contact_notes                         0
big_age                               0
big_occupation                        0
big_gender                            0
program                               0
program_type                          0
big_race_ethnicity                    6
rationale_for_match                   0
little_gender                         0
little_participant__race_ethnicity    0
match_length                          0
big_birthdate                         0
match_activation_date                 0
little_birthdate                      0
dtype: int64

In [164]:
join_df.to_excel("../../data/interim/final.xlsx", index=False)