# import libraries

In [None]:
import os
import joblib

import numpy as np
import pandas as pd
import seaborn as sns
# import sweetviz as sw
import missingno as msno
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor

# import shap
import optuna
import optuna.visualization as vis
# import category_encoders as ce

import xgboost as xgb
import lightgbm as lgb
import catboost as cat

from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings('ignore')

# get data

In [None]:
# !unzip -q '/content/data.zip'

In [None]:
# Locations of the competition files (train set, test set, sample submission).
TRAIN_PATH, TEST_PATH, SAMP_SUB_PATH = (
    '/content/Train_data.csv',
    '/content/Test_data.csv',
    '/content/Sample_submission.csv',
)

In [None]:
# Load the three CSVs in one pass: train, test, then the sample submission.
raw_train, raw_test, raw_sub = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_PATH, TEST_PATH, SAMP_SUB_PATH)
)

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line. Call it directly.
raw_train.info()
# Two random rows as a quick look at the raw values.
raw_train.sample(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52345 entries, 0 to 52344
Data columns (total 29 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   YearStart                  52345 non-null  int64  
 1   YearEnd                    52345 non-null  int64  
 2   LocationAbbr               52345 non-null  object 
 3   LocationDesc               52345 non-null  object 
 4   Datasource                 52345 non-null  object 
 5   Topic                      52345 non-null  object 
 6   Question                   52345 non-null  object 
 7   Data_Value_Type            52345 non-null  object 
 8   Data_Value                 52345 non-null  float64
 9   Low_Confidence_Limit       52345 non-null  float64
 10  High_Confidence_Limit      52345 non-null  float64
 11  Sample_Size                52345 non-null  float64
 12  Total                      1801 non-null   object 
 13  Age(years)                 10815 non-null  obj

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,Datasource,Topic,Question,Data_Value_Type,Data_Value,Low_Confidence_Limit,High_Confidence_Limit,Sample_Size,Total,Age(years),Education,Gender,Income,Race/Ethnicity,GeoLocation,ClassID,TopicID,QuestionID,DataValueTypeID,LocationID,StratificationCategory1,Stratification1,StratificationCategoryId1,StratificationID1,Class
23757,2012,2012,WV,North Carolina,Behavioral Risk Factor Surveillance System,Fruits and Vegetables - Behavior,Percent of adults who report consuming fruit l...,Value,37.874117,35.074117,40.774117,1483.265899,,,,,,,"(39.360700171000474, -111.58713063499971)",FV,FV1,Q018,VALUE,54.0,Income,College graduate,INC,EDUCOGRAD,Fruits and Vegetables
27503,2013,2013,CT,Connecticut,Behavioral Risk Factor Surveillance System,Fruits and Vegetables - Behavior,Percent of adults who report consuming fruit l...,Value,36.2,32.7,39.8,1320.0,,45 - 54,,,,,"(41.56266102000046, -72.64984095199964)",FV,FV1,Q018,VALUE,9.0,Age (years),45 - 54,AGEYR,AGEYR4554,Fruits and Vegetables


In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line. Call it directly.
raw_test.info()
# Two random rows as a quick look at the raw values.
raw_test.sample(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4636 entries, 0 to 4635
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   YearStart                  4636 non-null   int64  
 1   YearEnd                    4636 non-null   int64  
 2   LocationAbbr               4636 non-null   object 
 3   LocationDesc               4636 non-null   object 
 4   Datasource                 4636 non-null   object 
 5   Topic                      4636 non-null   object 
 6   Question                   4636 non-null   object 
 7   Data_Value_Type            4636 non-null   object 
 8   Low_Confidence_Limit       4636 non-null   float64
 9   High_Confidence_Limit      4636 non-null   float64
 10  Sample_Size                4636 non-null   float64
 11  Total                      193 non-null    object 
 12  Age(years)                 1088 non-null   object 
 13  Education                  729 non-null    objec

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,Datasource,Topic,Question,Data_Value_Type,Low_Confidence_Limit,High_Confidence_Limit,Sample_Size,Total,Age(years),Education,Gender,Income,Race/Ethnicity,GeoLocation,ClassID,TopicID,QuestionID,DataValueTypeID,LocationID,StratificationCategory1,Stratification1,StratificationCategoryId1,StratificationID1,Class
717,2016,2016,MO,Missouri,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,Value,35.5,42.9,1444.0,,,,,"$75,000 or greater",,"(38.635790776000476, -92.56630005299968)",OWS,OWS1,Q037,VALUE,29.0,Income,"$75,000 or greater",INC,INC75PLUS,Obesity / Weight Status
4051,2016,2016,WV,West Virginia,Behavioral Risk Factor Surveillance System,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,Value,31.0,37.2,1214.0,,45 - 54,,,,,"(38.66551020200046, -80.71264013499967)",PA,PA1,Q047,VALUE,54.0,Age (years),45 - 54,AGEYR,AGEYR4554,Physical Activity


# eda

In [None]:
# Work on copies so exploration never mutates the frames loaded from disk.
eda, edat = raw_train.copy(), raw_test.copy()

## sweetviz

In [None]:
# The top-level `import sweetviz as sw` is commented out, so `sw` would be
# undefined here on a fresh kernel. Import it locally and skip the report
# (instead of failing the whole run) when the package is not installed.
try:
    import sweetviz as sw
except ImportError:
    sw = None
    print('sweetviz is not installed; skipping the train/test comparison report')

if sw is not None:
    # Side-by-side profile of the train and test frames, keyed on the target column.
    basic_eda = sw.compare([eda, 'Train Data'], [edat, 'Test Data'], target_feat='Data_Value')
    basic_eda.show_notebook()

# data wrangling / data viz

In [None]:
# Columns removed from both exploration frames before further inspection.
useless_cols = ['YearStart', 'YearEnd', 'Datasource', 'Data_Value_Type', 'Total', 'DataValueTypeID']
eda = eda.drop(columns=useless_cols)
edat = edat.drop(columns=useless_cols)

In [None]:
# Row count per Topic value in the training exploration frame.
eda['Topic'].value_counts()

Fruits and Vegetables - Behavior    19464
Obesity / Weight Status             16957
Physical Activity - Behavior        15924
Name: Topic, dtype: int64

In [None]:
# Distinct questions asked under every topic except the fruit/vegetable one.
mask = eda['Topic'].ne('Fruits and Vegetables - Behavior')
eda.loc[mask, 'Question'].unique()

array(['Percent of adults aged 18 years and older who have obesity',
       'Percent of adults aged 18 years and older who have an overweight classification',
       'Percent of adults who engage in muscle-strengthening activities on 2 or more days a week',
       'Percent of adults who achieve at least 150 minutes a week of moderate-intensity aerobic physical activity or 75 minutes a week of vigorous-intensity aerobic activity (or an equivalent combination)',
       'Percent of adults who achieve at least 150 minutes a week of moderate-intensity aerobic physical activity or 75 minutes a week of vigorous-intensity aerobic physical activity and engage in muscle-strengthening activities on 2 or more days a week',
       'Percent of adults who achieve at least 300 minutes a week of moderate-intensity aerobic physical activity or 150 minutes a week of vigorous-intensity aerobic activity (or an equivalent combination)',
       'Percent of adults who engage in no leisure-time physical activi

# preprocess

## drop useless columns

In [None]:
# Start the modelling frames from the raw data minus the columns listed here.
useless_cols = ['YearStart', 'YearEnd', 'Datasource', 'Data_Value_Type', 'DataValueTypeID', 'Total']
train = raw_train.drop(columns=useless_cols)
test = raw_test.drop(columns=useless_cols)

## topic

In [None]:
# Keep only rows whose topic is not the fruit/vegetable one.
mask = train['Topic'] != 'Fruits and Vegetables - Behavior'
# .copy() detaches the filtered frame: later cells assign into `train`
# (Question relabelling, label encoding, new feature columns), and writing
# into an un-copied selection is what pandas flags as chained assignment.
train = train[mask].copy()

In [None]:
# Apply the same topic filter to the test frame.
# NOTE(review): this removes test rows outright; the later test.info() output
# still lists all 4636 rows, so presumably no test row has this topic - confirm,
# because a submission normally needs one prediction per test row.
mask = test['Topic'] != 'Fruits and Vegetables - Behavior'
# .copy() detaches the filtered frame so later column assignments on `test`
# write to an independent DataFrame rather than a selection of the original.
test = test[mask].copy()

In [None]:
# Class/topic/question identifier columns are removed from both frames.
high_corr_cols = ['Class', 'ClassID', 'TopicID', 'QuestionID']
train = train.drop(columns=high_corr_cols)
test = test.drop(columns=high_corr_cols)

In [None]:
# DataFrame.info() prints its own summary and returns None; print() around it
# would only add a stray "None" line.
train.info()
# One random row as a spot check.
train.sample()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32881 entries, 0 to 49785
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   LocationAbbr               32881 non-null  object 
 1   LocationDesc               32881 non-null  object 
 2   Topic                      32881 non-null  object 
 3   Question                   32881 non-null  object 
 4   Data_Value                 32881 non-null  float64
 5   Low_Confidence_Limit       32881 non-null  float64
 6   High_Confidence_Limit      32881 non-null  float64
 7   Sample_Size                32881 non-null  float64
 8   Age(years)                 7567 non-null   object 
 9   Education                  5019 non-null   object 
 10  Gender                     2460 non-null   object 
 11  Income                     8853 non-null   object 
 12  Race/Ethnicity             6872 non-null   object 
 13  GeoLocation                32206 non-null  obj

Unnamed: 0,LocationAbbr,LocationDesc,Topic,Question,Data_Value,Low_Confidence_Limit,High_Confidence_Limit,Sample_Size,Age(years),Education,Gender,Income,Race/Ethnicity,GeoLocation,LocationID,StratificationCategory1,Stratification1,StratificationCategoryId1,StratificationID1
28509,GA,Georgia,Physical Activity - Behavior,Percent of adults who achieve at least 150 min...,33.9,28.7,39.5,432.0,18 - 24,,,,,"(32.83968109300048, -83.62758034599966)",13.0,Age (years),18 - 24,AGEYR,AGEYR1824


## question

In [None]:
# Question texts kept verbatim; later cells relabel every other question
# as 'Other Questions'.
useful_questions = [
    'Percent of adults aged 18 years and older who have obesity',
    'Percent of adults aged 18 years and older who have an overweight classification',
    'Percent of adults who engage in no leisure-time physical activity',
]

In [None]:
# Keep the listed questions as-is and fold every other one into a single bucket.
mask = train['Question'].isin(useful_questions)
train['Question'] = train['Question'].where(mask, 'Other Questions')

In [None]:
# Same bucketing for the test frame: unlisted questions become 'Other Questions'.
mask = test['Question'].isin(useful_questions)
test['Question'] = test['Question'].where(mask, 'Other Questions')

In [None]:
# Mean and row count of the target per (Topic, Question) pair.
# Select 'Data_Value' BEFORE aggregating: aggregating the whole frame first
# also applies 'mean' to the text columns, which raises TypeError on
# pandas >= 2.0 and does needless work on older versions.
train.groupby(['Topic', 'Question'])['Data_Value'].agg(['mean', 'count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,count
Topic,Question,Unnamed: 2_level_1,Unnamed: 3_level_1
Obesity / Weight Status,Percent of adults aged 18 years and older who have an overweight classification,34.867394,8671
Obesity / Weight Status,Percent of adults aged 18 years and older who have obesity,28.581989,8286
Physical Activity - Behavior,Other Questions,32.723206,10562
Physical Activity - Behavior,Percent of adults who engage in no leisure-time physical activity,25.355781,5362


In [None]:
# Topic is dropped from both frames now that Question has been bucketed.
high_corr_cols = ['Topic']
train = train.drop(columns=high_corr_cols)
test = test.drop(columns=high_corr_cols)

In [None]:
# DataFrame.info() prints its own summary and returns None; print() around it
# would only add a stray "None" line.
train.info()
# One random row as a spot check.
train.sample()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32881 entries, 0 to 49785
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   LocationAbbr               32881 non-null  object 
 1   LocationDesc               32881 non-null  object 
 2   Question                   32881 non-null  object 
 3   Data_Value                 32881 non-null  float64
 4   Low_Confidence_Limit       32881 non-null  float64
 5   High_Confidence_Limit      32881 non-null  float64
 6   Sample_Size                32881 non-null  float64
 7   Age(years)                 7567 non-null   object 
 8   Education                  5019 non-null   object 
 9   Gender                     2460 non-null   object 
 10  Income                     8853 non-null   object 
 11  Race/Ethnicity             6872 non-null   object 
 12  GeoLocation                32206 non-null  object 
 13  LocationID                 32881 non-null  flo

Unnamed: 0,LocationAbbr,LocationDesc,Question,Data_Value,Low_Confidence_Limit,High_Confidence_Limit,Sample_Size,Age(years),Education,Gender,Income,Race/Ethnicity,GeoLocation,LocationID,StratificationCategory1,Stratification1,StratificationCategoryId1,StratificationID1
38348,DC,District of Columbia,Percent of adults aged 18 years and older who ...,23.202683,17.302683,30.285059,414.647507,,,,"$50,000 - $74,999",,"(38.89037138500049, -77.03196112699965)",10.911877,Income,"Less than $15,000",INC,INCLESS15


## locationAbbr

In [None]:
# Only LocationAbbr is kept as the location feature; the other three go.
high_corr_cols = ['LocationDesc', 'LocationID', 'GeoLocation']
train = train.drop(columns=high_corr_cols)
test = test.drop(columns=high_corr_cols)

In [None]:
# DataFrame.info() prints its own summary and returns None; print() around it
# would only add a stray "None" line.
train.info()
# One random row as a spot check.
train.sample()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32881 entries, 0 to 49785
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   LocationAbbr               32881 non-null  object 
 1   Question                   32881 non-null  object 
 2   Data_Value                 32881 non-null  float64
 3   Low_Confidence_Limit       32881 non-null  float64
 4   High_Confidence_Limit      32881 non-null  float64
 5   Sample_Size                32881 non-null  float64
 6   Age(years)                 7567 non-null   object 
 7   Education                  5019 non-null   object 
 8   Gender                     2460 non-null   object 
 9   Income                     8853 non-null   object 
 10  Race/Ethnicity             6872 non-null   object 
 11  StratificationCategory1    32881 non-null  object 
 12  Stratification1            32881 non-null  object 
 13  StratificationCategoryId1  32881 non-null  obj

Unnamed: 0,LocationAbbr,Question,Data_Value,Low_Confidence_Limit,High_Confidence_Limit,Sample_Size,Age(years),Education,Gender,Income,Race/Ethnicity,StratificationCategory1,Stratification1,StratificationCategoryId1,StratificationID1
12939,SC,Percent of adults aged 18 years and older who ...,35.64327,31.672503,39.885384,1287.567296,,,,"$25,000 - $34,999",,Income,"$25,000 - $34,999",INC,INC2535


## stratification1

In [None]:
# Drop the stratification id/category columns and the per-demographic columns,
# leaving Stratification1 as the only stratification feature.
high_corr_cols = [
    'StratificationCategoryId1',
    'StratificationID1',
    'Gender',
    'Education',
    'Age(years)',
    'Income',
    'Race/Ethnicity',
    'StratificationCategory1',
]
train = train.drop(columns=high_corr_cols)
test = test.drop(columns=high_corr_cols)

In [None]:
# DataFrame.info() prints its own summary and returns None; print() around it
# would only add a stray "None" line.
train.info()
# One random row as a spot check.
train.sample()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32881 entries, 0 to 49785
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   LocationAbbr            32881 non-null  object 
 1   Question                32881 non-null  object 
 2   Data_Value              32881 non-null  float64
 3   Low_Confidence_Limit    32881 non-null  float64
 4   High_Confidence_Limit   32881 non-null  float64
 5   Sample_Size             32881 non-null  float64
 6   Stratification1         32881 non-null  object 
dtypes: float64(4), object(3)
memory usage: 3.3+ MB
None


Unnamed: 0,LocationAbbr,Question,Data_Value,Low_Confidence_Limit,High_Confidence_Limit,Sample_Size,Stratification1
39116,MD,Percent of adults aged 18 years and older who ...,22.794558,17.580952,29.013606,345.809522,Asian


In [None]:
# DataFrame.info() prints its own summary and returns None; print() around it
# would only add a stray "None" line.
test.info()
# One random row as a spot check.
test.sample()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4636 entries, 0 to 4635
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   LocationAbbr            4636 non-null   object 
 1   Question                4636 non-null   object 
 2   Low_Confidence_Limit    4636 non-null   float64
 3   High_Confidence_Limit   4636 non-null   float64
 4   Sample_Size             4636 non-null   float64
 5   Stratification1         4636 non-null   object 
dtypes: float64(3), object(3)
memory usage: 253.5+ KB
None


Unnamed: 0,LocationAbbr,Question,Low_Confidence_Limit,High_Confidence_Limit,Sample_Size,Stratification1
1900,KY,Percent of adults aged 18 years and older who ...,32.3,40.4,1058.0,"Less than $15,000"


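# playground (scratch experiments; the outputs below were produced while `train` still had its demographic columns)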

In [None]:
# Draw 5 random rows in which all five demographic columns are missing.
# NOTE(review): the 'stratification1' section earlier in this file drops these
# five columns from `train`, so this cell presumably ran before that drop;
# confirm the intended cell order.
sample = train[train[['Gender', 'Education', 'Age(years)', 'Income', 'Race/Ethnicity']].isna().all(axis=1)].sample(5)
sample

Unnamed: 0,LocationAbbr,Question,Data_Value,Low_Confidence_Limit,High_Confidence_Limit,Sample_Size,Age(years),Education,Gender,Income,Race/Ethnicity,StratificationCategory1,Stratification1
12778,IN,Percent of adults aged 18 years and older who ...,35.636105,33.487861,37.784348,5577.051132,,,,,,Education,Female
39231,AK,Percent of adults aged 18 years and older who ...,38.430377,35.482259,41.40887,2412.467777,,,,,,Education,Non-Hispanic White
22444,GA,Percent of adults aged 18 years and older who ...,38.132385,34.526988,41.888682,1108.508996,,,,,,Income,35 - 44
35230,PR,Percent of adults aged 18 years and older who ...,38.7,37.0,40.3,5762.0,,,,,,Total,Total
21928,NE,Percent of adults aged 18 years and older who ...,30.134008,28.335585,32.013062,5446.774771,,,,,,Education,High school graduate


In [None]:
# train['Stratification1'].value_counts()

Total                               1367
Some college or technical school    1367
Female                              1337
High school graduate                1333
Data not reported                   1316
55 - 64                             1309
Less than $15,000                   1299
Male                                1295
Non-Hispanic White                  1292
45 - 54                             1288
65 or older                         1287
Less than high school               1286
35 - 44                             1285
$35,000 - $49,999                   1285
18 - 24                             1284
$50,000 - $74,999                   1280
$25,000 - $34,999                   1278
$15,000 - $24,999                   1278
Hispanic                            1261
25 - 34                             1256
$75,000 or greater                  1211
College graduate                    1211
2 or more races                     1191
Non-Hispanic Black                  1044
American Indian/

In [None]:
# For each demographic column, list the distinct non-null labels it holds.
mapp = dict(
    (col, train[col].dropna().unique().tolist())
    for col in ('Gender', 'Education', 'Age(years)', 'Income', 'Race/Ethnicity')
)

In [None]:
# Show the column -> labels mapping built above.
mapp

{'Age(years)': ['18 - 24',
  '25 - 34',
  '35 - 44',
  '45 - 54',
  '55 - 64',
  '65 or older'],
 'Education': ['Less than high school',
  'High school graduate',
  'Some college or technical school',
  'College graduate'],
 'Gender': ['Male', 'Female'],
 'Income': ['Less than $15,000',
  '$15,000 - $24,999',
  '$25,000 - $34,999',
  '$35,000 - $49,999',
  '$50,000 - $74,999',
  '$75,000 or greater',
  'Data not reported'],
 'Race/Ethnicity': ['Non-Hispanic White',
  'Non-Hispanic Black',
  'Hispanic',
  'American Indian/Alaska Native',
  '2 or more races',
  'Other',
  'Asian',
  'Hawaiian/Pacific Islander'],
 'Total': ['Total']}

In [None]:
# Invert `mapp`: each label points back to the demographic column it belongs to.
# If a label appears under several columns, the last column visited wins.
new_mapp = {
    label: column
    for column, labels in mapp.items()
    for label in labels
}
new_mapp

{'$15,000 - $24,999': 'Income',
 '$25,000 - $34,999': 'Income',
 '$35,000 - $49,999': 'Income',
 '$50,000 - $74,999': 'Income',
 '$75,000 or greater': 'Income',
 '18 - 24': 'Age(years)',
 '2 or more races': 'Race/Ethnicity',
 '25 - 34': 'Age(years)',
 '35 - 44': 'Age(years)',
 '45 - 54': 'Age(years)',
 '55 - 64': 'Age(years)',
 '65 or older': 'Age(years)',
 'American Indian/Alaska Native': 'Race/Ethnicity',
 'Asian': 'Race/Ethnicity',
 'College graduate': 'Education',
 'Data not reported': 'Income',
 'Female': 'Gender',
 'Hawaiian/Pacific Islander': 'Race/Ethnicity',
 'High school graduate': 'Education',
 'Hispanic': 'Race/Ethnicity',
 'Less than $15,000': 'Income',
 'Less than high school': 'Education',
 'Male': 'Gender',
 'Non-Hispanic Black': 'Race/Ethnicity',
 'Non-Hispanic White': 'Race/Ethnicity',
 'Other': 'Race/Ethnicity',
 'Some college or technical school': 'Education',
 'Total': 'Total'}

In [None]:
# Copy every Stratification1 label into the demographic column it belongs to
# (looked up through `new_mapp`). Done column-wise instead of with
# iterrows() + `train.loc[i] = row`, which rewrote a full row per iteration
# and depended on index labels being unique.
strat_values = train['Stratification1']
target_cols = strat_values.map(new_mapp)

# 'Total' rows carry no demographic label and are left untouched.
needs_fill = strat_values != 'Total'

# Keep the original failure mode: a label without a mapping is an error,
# but report all offending labels at once and before anything is written.
unmapped = needs_fill & target_cols.isna()
if unmapped.any():
    raise KeyError(
        f"Stratification1 values missing from new_mapp: {strat_values[unmapped].unique().tolist()}"
    )

for col_name in target_cols[needs_fill].unique():
    rows = needs_fill & (target_cols == col_name)
    # Only the matching column is overwritten; other columns keep their values.
    train.loc[rows, col_name] = strat_values[rows]

  0%|          | 0/32881 [00:00<?, ?it/s]

In [None]:
# Replace every remaining missing value in `train` with the literal 'no info'.
# NOTE(review): only `train` is filled here; no matching fill for `test` appears in this file.
train = train.fillna('no info')

In [None]:
# Column summary after the fill, to check the non-null counts.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32881 entries, 0 to 49785
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   LocationAbbr             32881 non-null  object 
 1   Question                 32881 non-null  object 
 2   Data_Value               32881 non-null  float64
 3   Low_Confidence_Limit     32881 non-null  float64
 4   High_Confidence_Limit    32881 non-null  float64
 5   Sample_Size              32881 non-null  float64
 6   Age(years)               32881 non-null  object 
 7   Education                32881 non-null  object 
 8   Gender                   32881 non-null  object 
 9   Income                   32881 non-null  object 
 10  Race/Ethnicity           32881 non-null  object 
 11  StratificationCategory1  32881 non-null  object 
 12  Stratification1          32881 non-null  object 
dtypes: float64(4), object(9)
memory usage: 4.8+ MB


In [None]:
# Drop both stratification columns from train and test.
# NOTE(review): 'StratificationCategory1' is already dropped in the
# 'stratification1' section, and the later feature table still shows
# 'Stratification1', so this cell looks like an out-of-order experiment;
# confirm before relying on it.
high_corr_cols = ['StratificationCategory1', 'Stratification1']
train = train.drop(high_corr_cols, axis=1)
test = test.drop(high_corr_cols, axis=1)

Unnamed: 0,LocationAbbr,Question,Data_Value,Low_Confidence_Limit,High_Confidence_Limit,Sample_Size,Age(years),Education,Gender,Income,Race/Ethnicity,StratificationCategory1,Stratification1
12778,IN,Percent of adults aged 18 years and older who ...,35.636105,33.487861,37.784348,5577.051132,,,Female,,,Education,Female
39231,AK,Percent of adults aged 18 years and older who ...,38.430377,35.482259,41.40887,2412.467777,,,,,Non-Hispanic White,Education,Non-Hispanic White
22444,GA,Percent of adults aged 18 years and older who ...,38.132385,34.526988,41.888682,1108.508996,35 - 44,,,,,Income,35 - 44
35230,PR,Percent of adults aged 18 years and older who ...,38.7,37.0,40.3,5762.0,,,,,,Total,Total
21928,NE,Percent of adults aged 18 years and older who ...,30.134008,28.335585,32.013062,5446.774771,,High school graduate,,,,Education,High school graduate


In [None]:
# Which demographic columns the mapping covers.
mapp.keys()

dict_keys(['Gender', 'Education', 'Age(years)', 'Income', 'Race/Ethnicity', 'Total'])

In [None]:
# One line per demographic column: its name, its labels, and how many there are.
for column, labels in mapp.items():
    n_labels = len(labels)
    print(column, labels, n_labels)

Gender ['Male', 'Female'] 2
Education ['Less than high school', 'High school graduate', 'Some college or technical school', 'College graduate'] 4
Age(years) ['18 - 24', '25 - 34', '35 - 44', '45 - 54', '55 - 64', '65 or older'] 6
Income ['Less than $15,000', '$15,000 - $24,999', '$25,000 - $34,999', '$35,000 - $49,999', '$50,000 - $74,999', '$75,000 or greater', 'Data not reported'] 7
Race/Ethnicity ['Non-Hispanic White', 'Non-Hispanic Black', 'Hispanic', 'American Indian/Alaska Native', '2 or more races', 'Other', 'Asian', 'Hawaiian/Pacific Islander'] 8
Total ['Total'] 1


In [None]:
# Gender value of the first row (by position) of `train`.
train.iloc[0]['Gender']

nan

In [None]:
# Column summary of `train` (dtypes and non-null counts).
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32881 entries, 0 to 49785
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   LocationAbbr             32881 non-null  object 
 1   Question                 32881 non-null  object 
 2   Data_Value               32881 non-null  float64
 3   Low_Confidence_Limit     32881 non-null  float64
 4   High_Confidence_Limit    32881 non-null  float64
 5   Sample_Size              32881 non-null  float64
 6   Age(years)               7567 non-null   object 
 7   Education                5019 non-null   object 
 8   Gender                   2460 non-null   object 
 9   Income                   8853 non-null   object 
 10  Race/Ethnicity           6872 non-null   object 
 11  StratificationCategory1  32881 non-null  object 
 12  Stratification1          32881 non-null  object 
dtypes: float64(4), object(9)
memory usage: 4.8+ MB


In [None]:
# Row count per Stratification1 label.
train.Stratification1.value_counts()

Total                               1367
Some college or technical school    1367
Female                              1337
High school graduate                1333
Data not reported                   1316
55 - 64                             1309
Less than $15,000                   1299
Male                                1295
Non-Hispanic White                  1292
45 - 54                             1288
65 or older                         1287
Less than high school               1286
35 - 44                             1285
$35,000 - $49,999                   1285
18 - 24                             1284
$50,000 - $74,999                   1280
$25,000 - $34,999                   1278
$15,000 - $24,999                   1278
Hispanic                            1261
25 - 34                             1256
$75,000 or greater                  1211
College graduate                    1211
2 or more races                     1191
Non-Hispanic Black                  1044
American Indian/

In [None]:
# Row count per StratificationCategory1 value.
train['StratificationCategory1'].value_counts()

Income            9008
Age (years)       7772
Race/Ethnicity    6962
Education         5204
Gender            2643
Total             1292
Name: StratificationCategory1, dtype: int64

# transformations

In [None]:
# Integer-encode every text column of `train`, applying the same codes to `test`.
cat_cols = train.select_dtypes(include='object').columns.tolist()

# Fitted encoders are kept per column so codes can be mapped back to labels
# later (le.inverse_transform) instead of being discarded with the loop variable.
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    # Fit on the labels of both frames: transform() raises ValueError on a
    # label it has not seen, so fitting on train alone breaks as soon as test
    # holds a category absent from train. When test labels are a subset of
    # train labels the resulting codes are identical to a train-only fit.
    le.fit(pd.concat([train[col], test[col]], ignore_index=True))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])
    label_encoders[col] = le

In [None]:
# Natural-log transform of Sample_Size in both frames.
# Not idempotent: re-running this cell applies the log a second time.
for frame in (train, test):
    frame['Sample_Size'] = np.log(frame['Sample_Size'])

In [None]:
# Width of the confidence interval (high limit minus low limit) for both frames.
# The high-limit column name is written with a trailing space, exactly as in
# the original cell - presumably matching the CSV header; verify before "fixing".
for frame in (train, test):
    upper = frame['High_Confidence_Limit ']
    lower = frame['Low_Confidence_Limit']
    frame['range'] = upper - lower

In [None]:
# Midpoint of the confidence interval for both frames.
# The high-limit column name is written with a trailing space, exactly as in
# the original cell - presumably matching the CSV header; verify before "fixing".
for frame in (train, test):
    limits_sum = frame['High_Confidence_Limit '] + frame['Low_Confidence_Limit']
    frame['mid'] = limits_sum / 2

In [None]:
# One random row to eyeball the encoded and engineered features.
train.sample()

Unnamed: 0,LocationAbbr,Question,Data_Value,Low_Confidence_Limit,High_Confidence_Limit,Sample_Size,Stratification1,range,mid
9613,42,0,30.9,26.0,36.3,6.33328,5,10.3,31.15


# xgboost

In [None]:
# Target is Data_Value; every other column of `train` is a feature.
y = train['Data_Value']
X = train.drop(columns=['Data_Value'])

In [None]:
# Hold out 20% of the rows for validation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)

In [None]:
# ?xgb.XGBRegressor

In [None]:
# Baseline XGBoost settings. Only these six are set explicitly; every other
# XGBRegressor argument (gamma, min_child_weight, subsample, colsample_*,
# reg_alpha, reg_lambda, random_state, ...) is left at the library default.
xgb_params = dict(
    objective='reg:squarederror',
    n_estimators=1000,
    max_depth=6,
    learning_rate=0.2,
    booster='gbtree',
    n_jobs=-1,
)

In [None]:
# Baseline regressor built from the explicit settings in `xgb_params`.
xgb_model = xgb.XGBRegressor(**xgb_params)

In [None]:
# Fit on the training part of the hold-out split only.
xgb_model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='dart', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.2, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=-1, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [None]:
# Predictions for the held-out validation rows.
preds = xgb_model.predict(X_test)

In [None]:
# Root-mean-squared error on the hold-out rows (square root of the MSE).
np.sqrt(mean_squared_error(y_test, preds))

0.07971002670886919

In [None]:
# 10-fold cross-validation over all training rows; the scorer returns the
# NEGATIVE mean squared error, so every value in `scores` is <= 0.
scores = cross_val_score(xgb_model, X, y, cv=10, scoring='neg_mean_squared_error')

In [None]:
# Raw per-fold scores, then the average of the per-fold RMSE values
# (sign removed before taking the square root).
print(scores)
fold_rmse = np.sqrt(np.abs(scores))
print(fold_rmse.mean())

[-0.00674489 -0.00706129 -0.02470153 -0.00374974 -0.00341912 -0.00450549
 -0.00385324 -0.00482566 -0.0032765  -0.00592596]
0.07759195678681594


In [None]:
# Refit on every training row before predicting the competition test set.
xgb_model.fit(X, y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.2, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=-1, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [None]:
# Predict the competition test frame.
# NOTE(review): assumes `test` has the same feature columns as X - confirm.
preds = xgb_model.predict(test)

In [None]:
# Check the number of predictions, then display the array.
print(preds.shape)
preds

(4636,)


array([35.200512, 41.07685 , 29.292925, ..., 29.998875, 44.08765 ,
       32.634926], dtype=float32)

In [None]:
# Submission frame: a single 'prediction' column, one row per prediction.
sub = pd.DataFrame(dict(prediction=preds))
print(sub.shape)
sub.head()

(4636, 1)


Unnamed: 0,prediction
0,35.200512
1,41.076851
2,29.292925
3,34.620934
4,34.229858


In [None]:
# Write the baseline submission to disk without the DataFrame index column.
SUB_FILEPATH = 'xgb1.csv'
sub.to_csv(SUB_FILEPATH, index=False)

In [None]:
# Fixed settings shared by every tuned model.
BEST_PARAMS = {
    'objective': 'reg:squarederror',
    'booster': 'gbtree',
    'n_jobs': -1
}
# Tuned values as recorded in this cell's saved output. They are used when the
# Optuna study object is not in the kernel: no cell in this notebook creates
# `xgb_study`, so referencing it unconditionally fails on Restart & Run All.
RECORDED_BEST_PARAMS = {
    'learning_rate': 0.24983078735166536,
    'max_depth': 4,
    'n_estimators': 3308,
}
try:
    # Prefer a live study when one exists in the session.
    BEST_PARAMS.update(**xgb_study.best_params)
except NameError:
    BEST_PARAMS.update(RECORDED_BEST_PARAMS)
BEST_PARAMS

{'booster': 'gbtree',
 'learning_rate': 0.24983078735166536,
 'max_depth': 4,
 'n_estimators': 3308,
 'n_jobs': -1,
 'objective': 'reg:squarederror'}

In [None]:
# Regressor rebuilt with the tuned settings; replaces the baseline model object.
xgb_model = xgb.XGBRegressor(**BEST_PARAMS)

In [None]:
# 10-fold cross-validation of the tuned model; the scorer returns the
# NEGATIVE mean squared error, so every value in `scores` is <= 0.
scores = cross_val_score(xgb_model, X, y, cv=10, scoring='neg_mean_squared_error')

In [None]:
# Raw per-fold scores, then the average of the per-fold RMSE values
# (sign removed before taking the square root).
print(scores)
fold_rmse = np.sqrt(np.abs(scores))
print(fold_rmse.mean())

[-0.00430918 -0.00305281 -0.01130288 -0.00265899 -0.00227472 -0.00291936
 -0.00267637 -0.00315468 -0.00239237 -0.00251734]
0.05874873288053556


In [None]:
# Fit the tuned model on every training row.
xgb_model.fit(X, y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.24983078735166536, max_delta_step=0, max_depth=4,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=3308, n_jobs=-1, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [None]:
# Predict the competition test frame with the tuned model.
# NOTE(review): assumes `test` has the same feature columns as X - confirm.
preds = xgb_model.predict(test)

In [None]:
# Check the number of predictions, then display the array.
print(preds.shape)
preds

(4636,)


array([35.204628, 41.09936 , 29.284523, ..., 29.973993, 44.131493,
       32.592484], dtype=float32)

In [None]:
# Submission frame: a single 'prediction' column, one row per prediction.
sub = pd.DataFrame(dict(prediction=preds))
print(sub.shape)
sub.head()

(4636, 1)


Unnamed: 0,prediction
0,35.204628
1,41.099361
2,29.284523
3,34.648819
4,34.233879


In [None]:
# Write the tuned-model submission to disk without the DataFrame index column.
SUB_FILEPATH = 'xgb2.csv'
sub.to_csv(SUB_FILEPATH, index=False)