# Data processing script for "Japanese-American Internee Data File"

This is a script that tries to parse the fixed-width, 80-character data file ```RG210.JAPAN.WRA26.txt``` found at this page: [NAID: 1264228 https://catalog.archives.gov/id/1264228](https://catalog.archives.gov/id/1264228)

The data contains 109,400 rows of detailed data about the people of Japanese descent who were incarcerated against their will in ten "Internment camps" run by the US War Relocation Authority between 1942 and 1946. 

You can read the technical documentation in [this file](https://s3.amazonaws.com/NARAprodstorage/lz/electronic-records/rg-210/wra/102.1DP.pdf)

This is a work in progress! - Jon Keegan [https://github.com/jonkeegan](https://github.com/jonkeegan)

In [39]:
#!/usr/bin/env python3
import pandas as pd
import io

In [40]:
# NOTE: This is a work in progress!

# I don't have a good solution yet for the PI and ALPHA characters that are used as lookup codes in some cases. Working on this.

# These are the fixed-width character delineations from the technical documentation
# https://s3.amazonaws.com/NARAprodstorage/lz/electronic-records/rg-210/wra/102.1DP.pdf
# https://catalog.archives.gov/id/1264228

# Define field settings in one place
# Column layout of one record: (field_name, start, end).
# Offsets are zero-based and end-exclusive, i.e. they are used directly as
# Python slice bounds (line[start:end]).
FIELD_SPECS = [
    # --- name ---
    ("last_name", 0, 10),
    ("first_name", 10, 18),
    ("middle_initial", 18, 19),
    # --- centers and prior address ---
    ("relocation_center", 19, 20),
    ("assembly_center", 20, 21),
    ("previous_address", 21, 26),
    # --- parents ---
    ("country_of_birth", 26, 27),
    ("fathers_occupation_in_us", 27, 28),
    ("fathers_occupation_abroad", 28, 29),
    # --- schooling and time in Japan ---
    ("total_years_schooling_japan", 29, 30),
    ("years_schooling_in_japan_during", 30, 31),
    ("major_subject", 31, 32),
    ("year_first_arrival_us_foreign_born", 32, 34),
    ("total_time_in_japan", 34, 35),
    ("number_times_in_japan", 35, 36),
    ("age_at_time_in_japan", 36, 37),
    ("military_service_pensions_defects", 37, 38),
    # --- individual ---
    ("file_number", 38, 44),
    ("sex_marital_status", 44, 45),
    ("year_of_birth", 46, 48),
    ("birthplace", 48, 50),
    ("alien_reg_ssn_japanese_lang_school", 50, 51),
    ("highest_grade_completed", 51, 52),
    ("language", 52, 53),
    # Listed out of positional order (column 45 sits between
    # sex_marital_status and year_of_birth); the list order is kept as-is.
    ("race_individual_and_spouse", 45, 46),
    ("religion", 53, 54),
]


In [41]:
def parse_line(line, field_specs=None):
    """Parse one fixed-width record into a dict of stripped field values.

    Args:
        line: A single raw record (a string) from the data file.
        field_specs: Optional iterable of ``(field_name, start, end)``
            tuples giving zero-based, end-exclusive slice bounds. An ``end``
            of ``None`` means "through the end of the line". When omitted,
            the module-level ``FIELD_SPECS`` layout is used.

    Returns:
        A dict mapping each field name to its slice of ``line`` with leading
        and trailing whitespace removed. A field that lies beyond the end of
        a short line yields an empty string.
    """
    specs = FIELD_SPECS if field_specs is None else field_specs
    # line[start:None] is the same as line[start:], so an open-ended field
    # needs no special case.
    return {name: line[start:end].strip() for name, start, end in specs}

def parse_with_manual_method(file_path):
    """Read the data file and slice every non-blank line with parse_line.

    Args:
        file_path: Path of the fixed-width text file to read.

    Returns:
        A list with one parse_line() result per line that contains
        something other than whitespace, in file order.
    """
    with open(file_path, 'r') as handle:
        # Whitespace-only lines are dropped before parsing.
        return [parse_line(raw) for raw in handle if raw.strip()]

def name_lookup(df, first_name, last_name):
    """Return the rows of ``df`` matching a first and last name.

    Both the query strings and the ``first_name`` / ``last_name`` columns are
    stripped of surrounding whitespace and lower-cased before comparison.

    Args:
        df: DataFrame with string columns ``first_name`` and ``last_name``.
        first_name: First name to search for.
        last_name: Last name to search for.

    Returns:
        The subset of ``df`` (original values, original index) where both
        names match.
    """
    wanted_first = first_name.strip().lower()
    wanted_last = last_name.strip().lower()

    first_matches = df["first_name"].str.strip().str.lower() == wanted_first
    last_matches = df["last_name"].str.strip().str.lower() == wanted_last

    return df[first_matches & last_matches]


def parse_with_pandas(file_path, field_specs=None):
    """Load the fixed-width data file into a DataFrame and add lookup columns.

    Every field is read as a string (blank fields stay as empty strings
    rather than NaN), stripped of surrounding whitespace, and then several
    coded columns are translated through the code tables defined below.
    Codes with no entry in a table produce a missing value in the
    corresponding ``*_lookup`` column.

    Args:
        file_path: Path of the fixed-width text file to read.
        field_specs: Optional list of ``(field_name, start, end)`` tuples
            (zero-based, end-exclusive; ``end`` of ``None`` means "up to
            column 80"). Defaults to the module-level ``FIELD_SPECS``. It
            must contain the columns mapped below (``birthplace``,
            ``relocation_center``, ``assembly_center``, ``country_of_birth``,
            ``race_individual_and_spouse``, ``language``,
            ``sex_marital_status``, ``file_number``, ``previous_address``).

    Returns:
        A DataFrame with one row per record, one string column per field
        (ordered by start offset), plus the derived columns
        ``place_of_birth_lookup``, ``relocation_center_lookup``,
        ``assembly_center_lookup``, ``country_of_birth_lookup``,
        ``race_individual_and_spouse_lookup``, ``language_lookup``,
        ``sex_marital_status_lookup``, ``file_number_base`` and
        ``previous_address_state_county``.
    """
    specs = FIELD_SPECS if field_specs is None else field_specs

    # Code lookups - These are the code lookups from the technical documentation
    # https://s3.amazonaws.com/NARAprodstorage/lz/electronic-records/rg-210/wra/102.1DP.pdf
    # https://catalog.archives.gov/id/1264228

    # NOTE: the documentation uses special punch characters for some codes.
    # They are written here as "PI" and "_o_" / "_0_". Keys spelled that way
    # are longer than the fields they belong to, so they are placeholders
    # that cannot match raw file values until the real characters are known.
    # NOTE(review): the lowercase "o" keys in several tables presumably stand
    # for the digit zero - confirm against the data before changing them.

    # described on page 65
    # https://s3.amazonaws.com/NARAprodstorage/lz/electronic-records/rg-210/wra/102.1DP.pdf
    # https://catalog.archives.gov/id/1264228

    place_of_birth_dict = {
        "00": "UNITED STATES - UNSPECIFIED",
        "11": "WASHINGTON",
        "12": "OREGON",
        "13": "CALIFORNIA",
        "21": "NEW MEXICO",
        "22": "COLORADO",
        "23": "IDAHO",
        "24": "MONTANA",
        "25": "NEVADA",
        "26": "ARIZONA",
        "27": "UTAH",
        "28": "WYOMING",
        "31": "ALABAMA",
        "32": "ARKANSAS",
        "33": "KENTUCKY",
        "34": "LOUISIANA",
        "35": "MISSISSIPPI",
        "36": "OKLAHOMA",
        "37": "TENNESSEE",
        "38": "TEXAS",
        "41": "ILLINOIS",
        "42": "INDIANA",
        "43": "IOWA",
        "44": "KANSAS",
        "45": "MICHIGAN",
        "46": "MINNESOTA",
        "47": "MISSOURI",
        "48": "NEBRASKA",
        "49": "OHIO",
        "51": "DISTRICT OF COLUMBIA",
        "52": "DELAWARE",
        "53": "FLORIDA",
        "54": "GEORGIA",
        "55": "MARYLAND",
        "56": "NEW JERSEY",
        "57": "NEW YORK",
        "58": "NORTH CAROLINA",
        "59": "PENNSYLVANIA",
        "50": "SOUTH CAROLINA",
        "5 PI": "VIRGINIA",
        "5 _o_": "WEST VIRGINIA",
        "61": "CONNECTICUT",
        "62": "MAINE",
        "63": "MASSACHUSETTS",
        "64": "NEW HAMPSHIRE",
        "65": "RHODE ISLAND",
        "66": "VERMONT",
        "70": "HAWAII, UNSPECIFIED",
        "71": "HAWAII COUNTY",
        "72": "HONOLULU COUNTY",
        "73": "KAUAI COUNTY",
        "74": "MAUI COUNTY",
        "81": "ALASKA",
        "82": "CANADA",
        "83": "MEXICO",
        "84": "SOUTH AMERICA",
        "85": "AMERICAN SAMOA",
        "86": "GUAM",
        "87": "PUERTO RICO",
        "88": "VIRGIN ISLANDS",
        "89": "WAKE ISLAND",
        "80": "POLAND",
        "8 PI": "GERMANY",
        "8 _o_": "IRELAND",
        "90": "JAPAN, UNSPECIFIED",
        "91": "SAKHALIN or SAGHALIEN or KARAFUTO IS",
        "92": "HOKKAIDO or YEZU IS. HONSAU or HONSHU IS.",
        "93": "NORTHERN DIVISION",
        # NOTE(review): "94" and "95" carry the same label - check the
        # documentation; one of them is probably a different division.
        "94": "CENTRAL DIVISION",
        "95": "CENTRAL DIVISION",
        "96": "SOUTHERN DIVISION",
        "97": "URBAN PREFECTURES (KYOTO, OSAKA and TOKYO)",
        "98": "SHIKOKU IS.",
        "99": "KIUSHIU or KYUSHU IS.",
        "9 PI": "FORMOSA or TAIWAN IS.",
        "9 _0_": "CHOSEN or KOREA",
        "PI PI": "OTHER",
        "_0_ _0_": "UNKNOWN",
        "--": "NONE",
    }

    # described on page 53
    # https://s3.amazonaws.com/NARAprodstorage/lz/electronic-records/rg-210/wra/102.1DP.pdf
    # https://catalog.archives.gov/id/1264228

    relocation_center_dict = {
        "1": "Manzanar",
        "2": "Colorado River (Poston, Parker)",
        "3": "Gila River (Pima, Sacaton)",
        "4": "Tule Lake",
        "5": "Minidoka (Gooding)",
        "6": "Central Utah (Abraham)",
        "7": "Heart Mountain",
        "8": "Granada (X-Y Granada)",
        "9": "Rohwer",
        "o": "Jerome",
    }

    # described on page 53
    # https://s3.amazonaws.com/NARAprodstorage/lz/electronic-records/rg-210/wra/102.1DP.pdf
    # https://catalog.archives.gov/id/1264228

    assembly_center_dict = {
        "o": "None",
        "1": "Manzanar (up to 6/1/42)",
        "2": "Fresno",
        "3": "Marysville (Arboga)",
        "4": "Mayer",
        "5": "Merced",
        "6": "Pinedale",
        "7": "Pomona",
        "8": "Portland (Pacific Int. Exposition Grounds)",
        "9": "Puyallup",
        "PI": "Sacramento (Walerga)",
        "A": "Salinas",
        "B": "Santa Anita",
        "C": "Stockton",
        "D": "Tanforan",
        "E": "Tulare",
        "F": "Turlock",
    }

    # described on page 53
    # https://s3.amazonaws.com/NARAprodstorage/lz/electronic-records/rg-210/wra/102.1DP.pdf
    # https://catalog.archives.gov/id/1264228

    country_of_birth_dict = {
        "W": "Mother: Unknown, Father: Unknown",
        "B": "Mother: Unknown, Father: Japan",
        "K": "Mother: Unknown, Father: US",
        "T": "Mother: Unknown, Father: Hawaii",
        "Y": "Mother: Unknown, Father: Other",
        "A": "Mother: Japan, Father: Unknown",
        "1": "Mother: Japan, Father: Japan",
        "2": "Mother: Japan, Father: US",
        "3": "Mother: Japan, Father: Hawaii",
        "D": "Mother: Japan, Father: Other",
        "J": "Mother: US, Father: Unknown",
        "4": "Mother: US, Father: Japan",
        "5": "Mother: US, Father: US",
        "6": "Mother: US, Father: Hawaii",
        "M": "Mother: US, Father: Other",
        "S": "Mother: Hawaii, Father: Unknown",
        "7": "Mother: Hawaii, Father: Japan",
        "8": "Mother: Hawaii, Father: US",
        "9": "Mother: Hawaii, Father: Hawaii",
        "V": "Mother: Hawaii, Father: Other",
        "X": "Mother: Other, Father: Unknown",
        "C": "Mother: Other, Father: Japan",
        "L": "Mother: Other, Father: US",
        "U": "Mother: Other, Father: Hawaii",
        "Z": "Mother: Other, Father: Other",
    }

    # described on page 57
    # https://s3.amazonaws.com/NARAprodstorage/lz/electronic-records/rg-210/wra/102.1DP.pdf
    # https://catalog.archives.gov/id/1264228

    sex_marital_status_dict = {
        "1": "Male single",
        "2": "Male married",
        "3": "Male widowed",
        "4": "Male divorced",
        "5": "Male separated",
        "o": "Male unknown",
        "6": "Female single",
        "7": "Female married",
        "8": "Female widowed",
        "9": "Female divorced",
        "PI": "Female separated",
        "_o_": "Female unknown",
    }

    # described on page 54
    # https://s3.amazonaws.com/NARAprodstorage/lz/electronic-records/rg-210/wra/102.1DP.pdf
    # https://catalog.archives.gov/id/1264228

    # TODO: this table is defined for reference but not applied to any
    # column yet.
    fathers_occupation_in_us_dict = {
        "1": "Professional & semiprofessional",
        "2": "Managerial and official (except farm)",
        "3": "Clerical and sales",
        "4": "Service",
        "5": "Farm operators and managers",
        "6": "Farm laborers including foremen",
        "7": "Fishermen",
        "8": "Skilled craftsmen and foremen; Semi-skilled operators (except farm)",
        "9": "Unskilled laborers (except farm)",
        "PI": "Unrevised forms (not on schedule)",
        "_o_": "Blank, unknown, none dash",
    }

    # described on page 54
    # https://s3.amazonaws.com/NARAprodstorage/lz/electronic-records/rg-210/wra/102.1DP.pdf
    # https://catalog.archives.gov/id/1264228

    # TODO: this table is defined for reference but not applied to any
    # column yet.
    total_years_schooling_in_japan_dict = {
        "o": "None",
        "_o_": "Unknown",
        "1": "1 year",
        "2": "2 years",
        "3": "3 years",
        "4": "4 years",
        "5": "5 years",
        "6": "6 years",
        "7": "7 years",
        "8": "8 years",
        "9": "9 years",
        "A": "10 years",
        "B": "11 years",
        "C": "12 years",
        "D": "13 years",
        "E": "14 years",
        "F": "15 years",
        "G": "16 years",
        "H": "17 years",
        "I": "18 years",
        "J": "19 years",
        "K": "20 years",
    }

    # described on page 60
    # https://s3.amazonaws.com/NARAprodstorage/lz/electronic-records/rg-210/wra/102.1DP.pdf
    # https://catalog.archives.gov/id/1264228

    language_dict = {
        "1": "Only Japanese-speak",
        "2": "Only Japanese-speak, read and write",
        "3": "Japanese-speak; English-speak",
        "4": "Japanese-speak, read and write; English-speak",
        "5": "Only English-speak, read and write",
        "6": "Japanese-speak; English-speak, read and write",
        "7": "Japanese-speak, read and write; English-speak, read and write",
        # NOTE(review): the labels for "A" through "D" repeat sex/marital
        # status text and look like a copy-paste slip; the correct language
        # labels must be taken from the documentation. Left unchanged here.
        "A": "Female married",
        "B": "Female widowed",
        "C": "Female divorced",
        "D": "Female divorced",
        "E": "Japanese-speak, read and write; Other-speak",
        "F": "Japanese-speak, read and write; Other read and write",
        "G": "Japanese-speak, read and write; Other-speak, read and write",
        "H": "Other-speak; English-speak",
        "I": "Japanese-speak, read and write; English read and write",
        "J": "Other-speak;  English-speak, read and write",
        "L": "Other read and write; English-speak, read and write",
        "M": "Other-speak, read and write; English-speak",
        "N": "Other-speak, read and write; English-speak, read and write",
        "O": "Japanese-speak; English-speak, read and write; Other-speak",
        "P": "Japanese-speak, read and write; English read and write; Other-speak",
        "K": "Japanese-speak, read and write; English read and write; Other-speak, read and write",
        "Q": "Japanese-speak; English-speak, read and write; Other read and write",
        "R": "Japanese-speak; English-speak, read and write; Other-speak, read and write",
        "S": "Japanese-speak, read and write; English-speak; Other-speak",
        "U": "Japanese-speak, read and write; English-speak; Other-read and write",
        "V": "Japanese-speak, read and write; English-speak; Other-speak, read and write",
        "W": "Japanese-speak, read and write; English-speak, read and write; Other-speak",
        "Y": "Japanese-speak, read and write; English-speak, read and write; Other read and write",
        "Z": "Japanese-speak, read and write; English-speak, read and write; Other-speak, read and write",
        "PI": "Only English-speak",
        "_o_": "Unknown",
    }

    # described on page 58
    # https://s3.amazonaws.com/NARAprodstorage/lz/electronic-records/rg-210/wra/102.1DP.pdf
    # https://catalog.archives.gov/id/1264228

    # NOTE(review): the two-character keys below cannot match the
    # one-character race_individual_and_spouse field as it is sliced now.
    race_individual_and_spouse_dict = {
        "4": "Individual: J, No spouse",
        "5": "Individual: J&W, No spouse",
        "6": "Individual: J&O, No spouse",
        "8": "Individual: W, No spouse",
        "17": "Individual: O, No spouse",
        "7": "Individual: J, Spouse: J",
        "J": "Individual: J&W, Spouse: J",
        "K": "Individual: J&O, Spouse: J",
        "S": "Individual: W, Spouse: J",
        "1": "Individual: O, Spouse: J",
        "2": "Individual: W&O, Spouse: J",
        "L": "Individual: J, Spouse: J&W",
        "M": "Individual: J&W, Spouse: J&W",
        "N": "Individual: J&O, Spouse: J&W",
        "T": "Individual: W, Spouse: J&W",
        "12": "Individual: O, Spouse: J&W",
        "23": "Individual: W&O, Spouse: J&W",
        "O": "Individual: J, Spouse: J&O",
        "P": "Individual: J&W, Spouse: J&O",
        "Q": "Individual: J&O, Spouse: J&O",
        "U": "Individual: W, Spouse: J&O",
        "13": "Individual: O, Spouse: J&O",
        "24": "Individual: W&O, Spouse: J&O",
        "V": "Individual: J, Spouse: W",
        "W": "Individual: J&W, Spouse: W",
        "X": "Individual: J&O, Spouse: W",
        "14": "Individual: J, Spouse: O",
        "15": "Individual: J&W, Spouse: O",
        "16": "Individual: J&O, Spouse: O",
        "25": "Individual: J, Spouse: W&O",
        "26": "Individual: J&W, Spouse: W&O",
        "27": "Individual: J&O, Spouse: W&O",
        "3": "Individual: J, Spouse: Unknown",
        "34": "Individual: J&W, Spouse: Unknown",
        "35": "Individual: J&O, Spouse: Unknown",
        "36": "Individual: W, Spouse: Unknown",
        "37": "Individual: O, Spouse: Unknown",
        "38": "Individual: W&O, Spouse: Unknown",
    }

    # Still empty: until it is filled in, previous_address_state_county
    # contains only missing values.
    address_dict = {}

    MAX_WIDTH = 80

    # read_fwf gets the columns in positional order, so sort by start index
    # (second element of each tuple).
    sorted_field_specs = sorted(specs, key=lambda spec: spec[1])

    # Build colspecs and names for pd.read_fwf, replacing None with MAX_WIDTH
    colspecs = [
        (start, end if end is not None else MAX_WIDTH)
        for (_, start, end) in sorted_field_specs
    ]
    names = [name for (name, _, _) in sorted_field_specs]

    # dtype=str keeps codes such as "00" intact; na_filter=False keeps blank
    # fields as empty strings instead of NaN.
    df = pd.read_fwf(
        file_path,
        colspecs=colspecs,
        header=None,
        names=names,
        dtype=str,
        na_filter=False,
    )

    # Strip whitespace from every (string) column. Done column-wise with the
    # .str accessor because DataFrame.applymap is deprecated in recent pandas.
    df = df.apply(lambda column: column.str.strip())

    # Apply dict for place of birth. The two-character codes live in the
    # "birthplace" field, not in the one-character "country_of_birth" field.
    df["place_of_birth_lookup"] = df["birthplace"].map(place_of_birth_dict)

    # Apply dict for relocation center
    df["relocation_center_lookup"] = df["relocation_center"].map(relocation_center_dict)

    # Apply dict for assembly center
    df["assembly_center_lookup"] = df["assembly_center"].map(assembly_center_dict)

    # Apply dict for country_of_birth
    df["country_of_birth_lookup"] = df["country_of_birth"].map(country_of_birth_dict)

    # Apply dict for race_individual_and_spouse
    df["race_individual_and_spouse_lookup"] = df["race_individual_and_spouse"].map(
        race_individual_and_spouse_dict
    )

    # Apply dict for language
    df["language_lookup"] = df["language"].map(language_dict)

    # Apply dict for sex_marital_status
    df["sex_marital_status_lookup"] = df["sex_marital_status"].map(sex_marital_status_dict)

    # this will grab the 'family' id (first five characters of file_number)
    df["file_number_base"] = df["file_number"].str[:5]

    # look up the previous address state and county (first four characters)
    # in the dictionary
    df["previous_address_state_county"] = df["previous_address"].str[:4].map(address_dict)

    return df



In [42]:

    # Location of the raw fixed-width data file.
    file_path = "RG210.JAPAN.WRA26.txt"  # Replace with your data file name/path

    # Parse the file into a DataFrame (kept in `df` for the export cell
    # below) and print it.
    df = parse_with_pandas(file_path)
    print(df)

         last_name first_name middle_initial relocation_center  \
0       KAHASHI TA   KEO    4              0                 1   
1       ZUKI    CH   IEKO   0              0                 7   
2       A AS IMA M        SAO              5                     
3       AKASA    A       KIRA              3                 B   
4       ATO      S   HIGEKO J              1                 1   
...            ...        ...            ...               ...   
109371     9MAMOTO   YOSHINOR                                1   
109372     9NATOMI    CHARLES              T                 3   
109373     9SHIOKA    SHIZUKO              P                 1   
109374        9WAI     YUSAKO              L                 1   
109375     9WASAKI      AGNES              S                 4   

       assembly_center previous_address country_of_birth  \
0                    1            21315                &   
1                    1            --011                &   
2                    1     

In [43]:
# Export the parsed DataFrame to a CSV file named "processed.csv" in the
# current working directory.
df.to_csv("processed.csv")