In [60]:
import pandas as pd

def parse_gedcom(file_path):
    individuals = {}
    current_individual = None
    current_individual_data = {}

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if line.startswith('0 @I'):
                if current_individual is not None:
                    individuals[current_individual] = current_individual_data
                    current_individual_data = {}
                current_individual = line.split('@')[1]
            elif line.startswith('1'):
                current_tag = line.split(' ')[1]
                value = line.split(' ')[2:]
                current_individual_data[current_tag] = value
                
            elif line.startswith('2'):
                add_tag = line.split(' ')[1]
                current_tag = current_tag + add_tag
                value = line.split(' ')[2:]
                current_individual_data[current_tag] = value              
                
            else:
                continue

        if current_individual is not None:
            individuals[current_individual] = current_individual_data

    return individuals

individuals = parse_gedcom('BothHarmonSniderTree.ged')

individual_data = []
for individual_id, individual in individuals.items():
    data = {'ID': individual_id}
    for tag, values in individual.items():
        data[tag] = ' '.join(values)
    individual_data.append(data)

individual_df = pd.DataFrame(individual_data)

date_columns = individual_df.columns[individual_df.columns.str.contains('DATE', case=False)]
for column in date_columns:
    individual_df[column] = individual_df[column].apply(lambda x: pd.to_datetime(x, format='%d %b %Y', errors='coerce') if pd.notna(x) and pd.to_datetime(x, format='%d %b %Y', errors='coerce').year != 1900 else x)
    individual_df[column] = individual_df[column].apply(lambda x: pd.to_datetime(x, format='%Y', errors='coerce') if pd.isnull(x) and (pd.isnull(x) or len(str(x)) == 4) else x)
    
# Get the list of datatypes for each column
column_datatypes = individual_df.dtypes

# Print the column names and their corresponding datatypes
for column, dtype in column_datatypes.items():
    print(f"Column: {column}, Datatype: {dtype}")

individual_df.to_excel('gedcom_data.xlsx', index=False)
individual_df.to_csv('gedcom_data.csv', index=False)

Column: ID, Datatype: object
Column: NAME, Datatype: object
Column: NAMEGIVN, Datatype: object
Column: NAMEGIVNSURN, Datatype: object
Column: SEX, Datatype: object
Column: _UID, Datatype: object
Column: _FSFTID, Datatype: object
Column: CHAN, Datatype: object
Column: CHANDATE, Datatype: datetime64[ns]
Column: BIRT, Datatype: object
Column: BIRTDATE, Datatype: object
Column: BIRTDATEPLAC, Datatype: datetime64[ns]
Column: FAMS, Datatype: object
Column: FAMC, Datatype: object
Column: BAPL, Datatype: object
Column: BAPLDATE, Datatype: object
Column: CONL, Datatype: object
Column: CONLDATE, Datatype: object
Column: WAC, Datatype: object
Column: WACDATE, Datatype: object
Column: WACDATETEMP, Datatype: datetime64[ns]
Column: ENDL, Datatype: object
Column: ENDLDATE, Datatype: object
Column: ENDLDATETEMP, Datatype: datetime64[ns]
Column: SLGC, Datatype: object
Column: SLGCDATE, Datatype: object
Column: SLGCDATETEMP, Datatype: datetime64[ns]
Column: DEAT, Datatype: object
Column: DEATDATE, Datat