In [1]:
#Import pandas and sqlite3 modules
import pandas as pd
import sqlite3

In [2]:
#Open a connection to the SQLite database file used by the queries below
sqlite_database = "database.sqlite"
connection = sqlite3.connect(sqlite_database)

In [3]:
#Load the distinct rows of the Rate table into a dataframe
#Only BusinessYear 2015 is read (per the author, loading more causes a memory error)
#Exclude rows whose Age is 'Family Option' or whose IndividualRate is 9999 or 9999.99
rate_query_2015 = """SELECT Distinct 
                            BusinessYear, 
                            StateCode, 
                            PlanId, 
                            RatingAreaId, 
                            Tobacco, 
                            Age, 
                            IndividualRate, 
                            IndividualTobaccoRate,
                            Couple, 
                            PrimarySubscriberAndOneDependent, 
                            PrimarySubscriberAndTwoDependents, 
                            PrimarySubscriberAndThreeOrMoreDependents,
                            CoupleAndOneDependent, 
                            CoupleAndTwoDependents, 
                            CoupleAndThreeOrMoreDependents
                            FROM Rate
                            WHERE BusinessYear='2015' and 
                                  Age!='Family Option' and 
                                  IndividualRate!=9999 and 
                                  IndividualRate!=9999.99;"""
rate_data_2015 = pd.read_sql(rate_query_2015, con=connection)

In [4]:
#Display the dimensionality (rows, columns) of the rate dataframe
rate_data_2015.shape

(4560319, 15)

In [5]:
#Check the rate dataframe for columns that contain null values
#info() prints the column dtypes and memory usage, but it leaves out the per-column
#non-null counts once a frame has more rows than pandas' display.max_info_rows option,
#so on a frame of this size it cannot answer the null-value question by itself.
rate_data_2015.info(verbose=True)
#Count the nulls per column explicitly; this is the cell's displayed result
rate_data_2015.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4560319 entries, 0 to 4560318
Data columns (total 15 columns):
BusinessYear                                 int64
StateCode                                    object
PlanId                                       object
RatingAreaId                                 object
Tobacco                                      object
Age                                          object
IndividualRate                               float64
IndividualTobaccoRate                        object
Couple                                       object
PrimarySubscriberAndOneDependent             object
PrimarySubscriberAndTwoDependents            object
PrimarySubscriberAndThreeOrMoreDependents    object
CoupleAndOneDependent                        object
CoupleAndTwoDependents                       object
CoupleAndThreeOrMoreDependents               object
dtypes: float64(1), int64(1), object(13)
memory usage: 521.9+ MB


In [6]:
#Generate descriptive statistics for the rate dataframe
#(called with the default arguments, so describe() chooses which columns to summarise)
rate_data_2015.describe()

Unnamed: 0,BusinessYear,IndividualRate
count,4560319.0,4560319.0
mean,2015.0,299.514
std,0.0,285.6261
min,2015.0,0.0
25%,2015.0,28.15
50%,2015.0,278.4
75%,2015.0,456.65
max,2015.0,2084.7


In [7]:
#Load the distinct rows of the PlanAttributes table into a dataframe
#Only BusinessYear 2015 is read
#Include the individual cost limit columns for EHB
plan_query_2015 = """SELECT Distinct 
                            PlanId,
                            MetalLevel,
                            TEHBInnTier1IndividualMOOP,
                            TEHBInnTier2IndividualMOOP,
                            TEHBOutOfNetIndividualMOOP,
                            TEHBDedInnTier1Individual,
                            TEHBDedInnTier2Individual,
                            TEHBDedOutOfNetIndividual
                            FROM PlanAttributes
                            WHERE BusinessYear='2015';"""
plan_data_2015 = pd.read_sql(plan_query_2015, con=connection)

In [8]:
#Display the dimensionality (rows, columns) of the plan dataframe
plan_data_2015.shape

(31253, 8)

In [9]:
#Browse the plan dataframe to see whether any column has null values
#NOTE(review): blank entries in this table look like empty strings rather than NULLs,
#so the non-null counts may not reveal missing data - confirm before relying on them
plan_data_2015.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31253 entries, 0 to 31252
Data columns (total 8 columns):
PlanId                        31253 non-null object
MetalLevel                    31253 non-null object
TEHBInnTier1IndividualMOOP    31253 non-null object
TEHBInnTier2IndividualMOOP    31253 non-null object
TEHBOutOfNetIndividualMOOP    31253 non-null object
TEHBDedInnTier1Individual     31253 non-null object
TEHBDedInnTier2Individual     31253 non-null object
TEHBDedOutOfNetIndividual     31253 non-null object
dtypes: object(8)
memory usage: 1.9+ MB


In [10]:
#Generate descriptive statistics for the plan dataframe
plan_data_2015.describe()

Unnamed: 0,PlanId,MetalLevel,TEHBInnTier1IndividualMOOP,TEHBInnTier2IndividualMOOP,TEHBOutOfNetIndividualMOOP,TEHBDedInnTier1Individual,TEHBDedInnTier2Individual,TEHBDedOutOfNetIndividual
count,31253,31253,31253.0,31253.0,31253,31253.0,31253.0,31253.0
unique,31253,7,153.0,79.0,148,142.0,74.0,119.0
top,35065IN0030007-02,Silver,,,Not Applicable,,,
freq,1,13021,4506.0,29185.0,12559,15757.0,29778.0,15757.0


In [11]:
#Display the first 10 rows of the plan dataframe
#Columns 2 to 7 hold numbers with a $ prefix; the $ needs to be removed and the values changed to integers
plan_data_2015.head(10)

Unnamed: 0,PlanId,MetalLevel,TEHBInnTier1IndividualMOOP,TEHBInnTier2IndividualMOOP,TEHBOutOfNetIndividualMOOP,TEHBDedInnTier1Individual,TEHBDedInnTier2Individual,TEHBDedOutOfNetIndividual
0,21989AK0030001-00,High,,,,,,
1,21989AK0030001-01,High,,,,,,
2,38344AK1020001-00,High,,,,,,
3,38344AK1020001-01,High,,,,,,
4,73836AK0750002-01,Gold,"$4,750",,"$9,500",$750,,"$1,500"
5,73836AK0710006-01,Silver,"$6,600",,"$13,200","$2,000",,"$4,000"
6,73836AK0710004-00,Silver,"$6,600",,"$13,200","$4,000",,"$8,000"
7,73836AK0750003-06,Silver,$500,,"$13,200",$100,,"$2,700"
8,73836AK0750004-00,Silver,"$6,600",,"$13,200","$1,350",,"$2,700"
9,73836AK0710004-01,Silver,"$6,600",,"$13,200","$4,000",,"$8,000"


In [12]:
#Remove the $ sign from column TEHBInnTier1IndividualMOOP
#.str.strip works on the whole column at once and passes missing values through
#as missing instead of raising, which calling x.strip('$') on each element would do
plan_data_2015['TEHBInnTier1IndividualMOOP'] = plan_data_2015['TEHBInnTier1IndividualMOOP'].str.strip('$')
#Show the first 10 values to confirm the $ sign is gone
plan_data_2015['TEHBInnTier1IndividualMOOP'].head(10)

0         
1         
2         
3         
4    4,750
5    6,600
6    6,600
7      500
8    6,600
9    6,600
Name: TEHBInnTier1IndividualMOOP, dtype: object

In [21]:
#Remove the , thousands separators from the numbers in column TEHBInnTier1IndividualMOOP
#.str.replace works on the whole column at once and passes missing values through
#as missing instead of raising, which calling x.replace(',', '') on each element would do
plan_data_2015['TEHBInnTier1IndividualMOOP'] = plan_data_2015['TEHBInnTier1IndividualMOOP'].str.replace(',', '')
#Show the first 10 values to confirm the separators are gone
plan_data_2015['TEHBInnTier1IndividualMOOP'].head(10)

0        
1        
2        
3        
4    4750
5    6600
6    6600
7     500
8    6600
9    6600
Name: TEHBInnTier1IndividualMOOP, dtype: object