# STA 663 Project 1: 
- Huggingface: mastergopote44/Long-Term-Care-Aggregated-Data
- Name: Justin Kao
- School: Duke University

## Load dataset from SOA website

In [1]:
import requests
from io import BytesIO
from zipfile import ZipFile
import pandas as pd

# Directory to store the extracted files
desired_directory = '/Users/justinkao/Desktop/Durham NC/Duke University/Courses/S_2024_STA 663 Statistical Computing and Computation(Dr. Ouwen Huang)/Project 1/STA663_Project_1'  # Replace with your actual path

# URLs of the zip files
zip_files = {
    'incidence': 'https://www.soa.org/4a33aa/globalassets/assets/files/resources/experience-studies/2020/2000-2016-ltc-incidence.zip',
    'termination': 'https://www.soa.org/4a2e5d/globalassets/assets/files/resources/experience-studies/2020/2000-2016-ltc-termination.zip'
}

# DataFrames dictionary
dataframes = {}

# Download and unzip the files
for name, zip_url in zip_files.items():
    response = requests.get(zip_url)
    # Check if the request was successful
    if response.ok:
        with ZipFile(BytesIO(response.content)) as thezip:
            # Extract all files from the zip into a directory
            thezip.extractall(desired_directory)
            # Loop through each file in the zip
            for zipinfo in thezip.infolist():
                # Construct the full path to the extracted file
                file_path = f'{desired_directory}/{zipinfo.filename}'
                # Assuming the file is a tab-separated txt file
                df = pd.read_csv(file_path, sep='\t', low_memory=False)  # Adjust sep if necessary
                # Assign the DataFrame to the corresponding variable
                dataframes[name] = df
    else:
        print(f"Failed to retrieve {zip_url}")

In [13]:
incidence_df = dataframes['incidence']
incidence_df

Unnamed: 0,Group_Indicator,Gender,Issue_Age_Bucket,Incurred_Age_Bucket,Issue_Year_Bucket,Policy_Year,Marital_Status,Premium_Class,Underwriting_Type,Coverage_Type_Bucket,...,ALF_EP_Bucket,HHC_EP_Bucket,Region,Active_Exposure,Total_Exposure,Claim_Count,Count_NH,Count_ALF,Count_HHC,Count_Unk
0,Group,Female,55-59,55-59,2003-2005,1-3 years,Single,Standard,Other,Comprehensive,...,0,0,Unknown,37.583332,37.583332,0,0,0,0,0
1,Individual,Female,60-64,70-74,1997-1999,10-12 years,Single,Preferred,Other,Comprehensive,...,0,0,02: Northeast,14.500000,14.500000,0,0,0,0,0
2,Group,Female,50-54,60-64,2000-2002,10-12 years,Unknown,Standard,Unknown,Comprehensive,...,0,0,Unknown,2250.000000,2252.000000,0,0,0,0,0
3,Individual,Female,55-59,70-74,1994-1996,13-15 years,Married,Standard,Other,Comprehensive,...,0,0,01: Mid-West,19.000000,19.000000,0,0,0,0,0
4,Individual,Male,60-64,65-69,2003-2005,4-6 years,Married,Substandard,Full underwriting,Comprehensive,...,90,20,02: Northeast,2.000000,2.000000,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2089318,Individual,Male,55-59,60-64,2012-2014,1-3 years,Married,Substandard,Full underwriting,Comprehensive,...,0,0,01: Mid-West,7.083333,7.083333,0,0,0,0,0
2089319,Individual,Male,65-69,70-74,1997-1999,7-9 years,Married,Standard,Full underwriting,Comprehensive,...,180,180,02: Northeast,2.000000,2.000000,0,0,0,0,0
2089320,Individual,Female,65-69,75-79,1994-1996,7-9 years,Single,Preferred,Other,Comprehensive,...,0,0,01: Mid-West,6.000000,6.000000,0,0,0,0,0
2089321,Individual,Female,50-54,55-59,1994-1996,7-9 years,Single,Standard,Unknown,Comprehensive,...,0,0,Unknown,6.000000,6.000000,0,0,0,0,0


## Inspect for the presence of outliers and missing values in the variable

#### 1. Check out the dataframe at first and make sure the unique vlaue of each variable is reasonable 

In [25]:
termination_df = dataframes['termination']
termination_df

Unnamed: 0,Gender,Incurred_Age_Bucket,Incurred_Year_Bucket,Claim_Type,Region,Diagnosis_Category,Claim_Duration,Exposure,Deaths,Recovery,Terminations,Benefit_Expiry,Others_Terminations
0,Female,Unknown,2009-2010,HCC,Unknown,Unknown,80,77,0,2,2,1,0
1,Female,Unknown,2003-2004,NH,Unknown,Unknown,86,49,1,0,1,0,0
2,Female,Unknown,2005-2006,Other,Unknown,Unknown,88,16,1,0,1,0,0
3,Female,Unknown,2003-2004,ALF,Unknown,Unknown,114,29,1,0,1,0,0
4,Female,Unknown,< 2001,NH,Unknown,Unknown,4,56,5,1,6,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
627185,Male,85-89,2007-2008,NH,Unknown,05: Injury,33,2,0,0,0,0,0
627186,Male,75-79,2007-2008,HCC,03: South,Unknown,31,62,3,0,3,0,0
627187,Female,80-84,2013-2014,HCC,01: Mid-West,07: Stroke,23,6,0,0,0,0,0
627188,Male,70-74,2001-2002,HCC,02: Northeast,Unknown,30,7,0,0,0,0,0


In [24]:
unique_values = {col: incidence_df[col].unique() for col in incidence_df.columns}
unique_values

{'Group_Indicator': array(['Group', 'Individual'], dtype=object),
 'Gender': array(['Female', 'Male'], dtype=object),
 'Issue_Age_Bucket': array(['55-59', '60-64', '50-54', '65-69', '70-74', ' 0-49', '75-79',
        '80-84', '85-89', 'Unknown', '90+'], dtype=object),
 'Incurred_Age_Bucket': array(['55-59', '70-74', '60-64', '65-69', '50-54', '75-79', ' 0-49',
        '80-84', '85-89', '90+', 'Unknown'], dtype=object),
 'Issue_Year_Bucket': array(['2003-2005', '1997-1999', '2000-2002', '1994-1996', '2006-2008',
        '1991-1993', ' < 1991', '2012-2014', '2009-2011', '2015-2016',
        'Unknown'], dtype=object),
 'Policy_Year': array([' 1-3 years', '10-12 years', '13-15 years', ' 4-6 years',
        ' 7-9 years', '15+ years'], dtype=object),
 'Marital_Status': array(['Single', 'Unknown', 'Married'], dtype=object),
 'Premium_Class': array(['Standard', 'Preferred', 'Substandard'], dtype=object),
 'Underwriting_Type': array(['Other', 'Unknown', 'Full underwriting'], dtype=object),
 'Co

#### 2. Checking for "Unknown" Values That Do Not Contribute Information
The dataset contains 627,190 rows and 13 columns. For actuarial analysis, it is essential that the variables 'Issue_Age_Bucket', 'Incurred_Age_Bucket', and 'Issue_Year_Bucket' contain valid information since they are crucial in evaluating claim incidences. Rows with 'Unknown' values in these variables do not contribute to the analysis and can potentially skew the results. Currently, there are 519 'Unknown' values in 'Issue_Age_Bucket', 481 in 'Incurred_Age_Bucket', and 431 in 'Issue_Year_Bucket'. Removing rows with 'Unknown' values in these three variables is a necessary step to ensure data quality and reliability of the analysis.


In [26]:
# Calculate the number of 'Unknown' values in the 'Issue_Age_Bucket' column
unknown_issue_age = (incidence_df['Issue_Age_Bucket'] == 'Unknown').sum()
print(f"Number of 'Unknown' values in 'Issue_Age_Bucket': {unknown_issue_age}")

Number of 'Unknown' values in 'Issue_Age_Bucket': 519


In [22]:
unknown_incurred_age = (incidence_df['Incurred_Age_Bucket'] == 'Unknown').sum()
print(f"Number of 'Unknown' values in 'Incurred_Age_Bucket': {unknown_incurred_age}")


Number of 'Unknown' values in 'Incurred_Age_Bucket': 481


In [23]:
unknown_issue_year = (incidence_df['Issue_Year_Bucket'] == 'Unknown').sum()
print(f"Number of 'Unknown' values in 'Issue_Year_Bucket': {unknown_issue_year}")

Number of 'Unknown' values in 'Issue_Year_Bucket': 431


#### 3. Eliminate rows where below three variables have 'Unknown' values
- Counts of 'Unknown' values in key variables:
- 'Issue_Age_Bucket': 519
- 'Incurred_Age_Bucket': 481
- 'Issue_Year_Bucket': 431


In [28]:
# Eliminate rows where these three variables have 'Unknown' values
filtered_incidence_df = incidence_df[
    (incidence_df['Issue_Age_Bucket'] != 'Unknown') & 
    (incidence_df['Incurred_Age_Bucket'] != 'Unknown') & 
    (incidence_df['Issue_Year_Bucket'] != 'Unknown')
]

# Display the shape of the dataframe after elimination
print(f"Dataframe shape after elimination: {filtered_incidence_df.shape}")

Dataframe shape after elimination: (2088757, 31)


In [29]:
filtered_incidence_df

Unnamed: 0,Group_Indicator,Gender,Issue_Age_Bucket,Incurred_Age_Bucket,Issue_Year_Bucket,Policy_Year,Marital_Status,Premium_Class,Underwriting_Type,Coverage_Type_Bucket,...,ALF_EP_Bucket,HHC_EP_Bucket,Region,Active_Exposure,Total_Exposure,Claim_Count,Count_NH,Count_ALF,Count_HHC,Count_Unk
0,Group,Female,55-59,55-59,2003-2005,1-3 years,Single,Standard,Other,Comprehensive,...,0,0,Unknown,37.583332,37.583332,0,0,0,0,0
1,Individual,Female,60-64,70-74,1997-1999,10-12 years,Single,Preferred,Other,Comprehensive,...,0,0,02: Northeast,14.500000,14.500000,0,0,0,0,0
2,Group,Female,50-54,60-64,2000-2002,10-12 years,Unknown,Standard,Unknown,Comprehensive,...,0,0,Unknown,2250.000000,2252.000000,0,0,0,0,0
3,Individual,Female,55-59,70-74,1994-1996,13-15 years,Married,Standard,Other,Comprehensive,...,0,0,01: Mid-West,19.000000,19.000000,0,0,0,0,0
4,Individual,Male,60-64,65-69,2003-2005,4-6 years,Married,Substandard,Full underwriting,Comprehensive,...,90,20,02: Northeast,2.000000,2.000000,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2089318,Individual,Male,55-59,60-64,2012-2014,1-3 years,Married,Substandard,Full underwriting,Comprehensive,...,0,0,01: Mid-West,7.083333,7.083333,0,0,0,0,0
2089319,Individual,Male,65-69,70-74,1997-1999,7-9 years,Married,Standard,Full underwriting,Comprehensive,...,180,180,02: Northeast,2.000000,2.000000,0,0,0,0,0
2089320,Individual,Female,65-69,75-79,1994-1996,7-9 years,Single,Preferred,Other,Comprehensive,...,0,0,01: Mid-West,6.000000,6.000000,0,0,0,0,0
2089321,Individual,Female,50-54,55-59,1994-1996,7-9 years,Single,Standard,Unknown,Comprehensive,...,0,0,Unknown,6.000000,6.000000,0,0,0,0,0


# Split both "Incidence" and "Termination" datasets into train/test

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Now we will split the 'incidence' DataFrame into training and validation sets
train_incidence_df, validation_incidence_df = train_test_split(incidence_df, test_size=0.2, random_state=42)

# train_df is now your training set, and validation_df is your validation set.

In [7]:
train_incidence_df

Unnamed: 0,Group_Indicator,Gender,Issue_Age_Bucket,Incurred_Age_Bucket,Issue_Year_Bucket,Policy_Year,Marital_Status,Premium_Class,Underwriting_Type,Coverage_Type_Bucket,...,ALF_EP_Bucket,HHC_EP_Bucket,Region,Active_Exposure,Total_Exposure,Claim_Count,Count_NH,Count_ALF,Count_HHC,Count_Unk
1169689,Individual,Female,65-69,65-69,2012-2014,1-3 years,Married,Standard,Full underwriting,Comprehensive,...,180,20,03: South,3.00,3.00,0,0,0,0,0
1108052,Individual,Female,70-74,75-79,2003-2005,7-9 years,Unknown,Standard,Unknown,Other,...,0,0,04: West,3.00,3.00,0,0,0,0,0
592482,Individual,Male,0-49,0-49,2000-2002,1-3 years,Married,Standard,Other,Other,...,0,0,01: Mid-West,3.00,3.00,0,0,0,0,0
2018207,Individual,Female,60-64,65-69,1997-1999,7-9 years,Single,Standard,Full underwriting,Comprehensive,...,0,0,Unknown,15.00,15.00,0,0,0,0,0
914787,Individual,Female,50-54,50-54,2000-2002,4-6 years,Single,Preferred,Full underwriting,Comprehensive,...,0,0,01: Mid-West,27.00,27.00,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259178,Individual,Male,55-59,60-64,2009-2011,1-3 years,Married,Standard,Full underwriting,Comprehensive,...,90,90,02: Northeast,9.00,9.00,0,0,0,0,0
1414414,Group,Female,70-74,70-74,2000-2002,1-3 years,Married,Standard,Other,Comprehensive,...,30,30,04: West,3.00,3.00,0,0,0,0,0
131932,Individual,Female,65-69,75-79,1997-1999,10-12 years,Single,Preferred,Full underwriting,Comprehensive,...,0,0,01: Mid-West,19.25,19.25,1,1,0,0,0
671155,Individual,Male,70-74,75-79,2003-2005,4-6 years,Married,Substandard,Full underwriting,Other,...,0,0,04: West,3.00,3.00,0,0,0,0,0


In [8]:
validation_incidence_df

Unnamed: 0,Group_Indicator,Gender,Issue_Age_Bucket,Incurred_Age_Bucket,Issue_Year_Bucket,Policy_Year,Marital_Status,Premium_Class,Underwriting_Type,Coverage_Type_Bucket,...,ALF_EP_Bucket,HHC_EP_Bucket,Region,Active_Exposure,Total_Exposure,Claim_Count,Count_NH,Count_ALF,Count_HHC,Count_Unk
1105512,Individual,Female,50-54,60-64,2006-2008,7-9 years,Married,Preferred,Other,Comprehensive,...,0,0,Unknown,5.916666,5.916666,0,0,0,0,0
318931,Group,Male,65-69,75-79,1994-1996,7-9 years,Married,Standard,Other,Comprehensive,...,20,20,03: South,4.000000,4.000000,0,0,0,0,0
1624883,Individual,Male,60-64,70-74,1991-1993,7-9 years,Married,Standard,Full underwriting,Other,...,0,0,Unknown,263.416630,263.416630,0,0,0,0,0
1227544,Individual,Male,80-84,85-89,1997-1999,7-9 years,Single,Standard,Full underwriting,Comprehensive,...,0,0,01: Mid-West,3.000000,3.000000,0,0,0,0,0
179423,Group,Female,55-59,55-59,2000-2002,4-6 years,Single,Preferred,Other,Comprehensive,...,0,0,01: Mid-West,3.000000,3.000000,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278571,Individual,Female,55-59,65-69,2003-2005,10-12 years,Unknown,Preferred,Unknown,Comprehensive,...,90,30,01: Mid-West,2.000000,2.000000,0,0,0,0,0
1505247,Individual,Male,50-54,60-64,2000-2002,7-9 years,Single,Standard,Full underwriting,Comprehensive,...,0,0,01: Mid-West,8.000000,8.000000,0,0,0,0,0
1660318,Individual,Female,70-74,75-79,1997-1999,7-9 years,Single,Preferred,Other,Comprehensive,...,20,20,03: South,15.000000,15.000000,1,0,0,1,0
1563687,Group,Female,60-64,60-64,1997-1999,1-3 years,Unknown,Standard,Full underwriting,Comprehensive,...,0,0,04: West,3.999998,3.999998,0,0,0,0,0


In [5]:
# Now we will split the 'incidence' DataFrame into training and validation sets

train_termination_df, validation_termination_df = train_test_split(termination_df, test_size=0.2, random_state=42)