In [1]:
##------------------Import Libraries --------------------------##
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
##------------------Read Data----------------------------------##
file_path = 'SIPP_analysis_data.xlsx'
data = pd.read_excel(file_path)

In [3]:
data.head()

Unnamed: 0,unique_id,ssuid,repwgt0,repwgt1,repwgt2,repwgt3,repwgt4,repwgt5,repwgt6,repwgt7,...,unsecured_debt,unable_pay,mortgage_payment,asset_values,bachelors,years_since_movein,unemployed,out_lf,born_abroad,likely_retired
0,1,11428546521,7426.116541,3272.646846,2984.807011,4144.434128,3603.779767,12749.500402,3534.087316,3677.640142,...,0,0,1390,430510,0,-140.416672,0,0,1,0
1,2,11428577022,9260.630659,17036.969529,13129.978967,13180.83476,15801.09074,4445.420139,12091.404756,12781.70913,...,40300,1,1300,41732,0,-163.166672,0,0,0,0
2,3,11481674622,4837.305089,8031.548547,8130.96396,7984.299255,7644.212702,2273.861069,7101.412235,7670.058779,...,20000,0,500,169050,1,-130.666672,0,1,0,1
3,4,12889229422,9638.013075,16219.67039,13163.080201,12818.263387,12173.747482,13922.769296,5538.784662,14840.845358,...,0,0,0,6880,0,-165.333328,0,0,0,0
4,5,13309398822,5190.990802,8106.156293,7452.235473,9004.767596,2426.520981,2555.204602,7252.986835,9276.875285,...,15000,0,1200,400,0,,0,1,0,0


In [4]:
data.isnull().sum()

unique_id                0
ssuid                    0
repwgt0                  0
repwgt1                  0
repwgt2                  0
                      ... 
years_since_movein    2747
unemployed               0
out_lf                   0
born_abroad              0
likely_retired           0
Length: 324, dtype: int64

In [5]:
data = data.drop('years_since_movein', axis=1)

In [7]:
###----------------------split numerical and categorical data--------------------------------------###
num_column = data.select_dtypes(include=['int64', 'float64'])
cat_column = data.select_dtypes(include='object')

#### Data Preparation

Worked on a better method to filter out the replicate weight to properly align rows

In [8]:
num_column.head()

Unnamed: 0,unique_id,ssuid,repwgt0,repwgt1,repwgt2,repwgt3,repwgt4,repwgt5,repwgt6,repwgt7,...,secured_debt,unsecured_debt,unable_pay,mortgage_payment,asset_values,bachelors,unemployed,out_lf,born_abroad,likely_retired
0,1,11428546521,7426.116541,3272.646846,2984.807011,4144.434128,3603.779767,12749.500402,3534.087316,3677.640142,...,409000,0,0,1390,430510,0,0,0,1,0
1,2,11428577022,9260.630659,17036.969529,13129.978967,13180.83476,15801.09074,4445.420139,12091.404756,12781.70913,...,0,40300,1,1300,41732,0,0,0,0,0
2,3,11481674622,4837.305089,8031.548547,8130.96396,7984.299255,7644.212702,2273.861069,7101.412235,7670.058779,...,0,20000,0,500,169050,1,0,1,0,1
3,4,12889229422,9638.013075,16219.67039,13163.080201,12818.263387,12173.747482,13922.769296,5538.784662,14840.845358,...,0,0,0,0,6880,0,0,0,0,0
4,5,13309398822,5190.990802,8106.156293,7452.235473,9004.767596,2426.520981,2555.204602,7252.986835,9276.875285,...,0,15000,0,1200,400,0,0,1,0,0


In [9]:
num_column.isnull().sum()

unique_id         0
ssuid             0
repwgt0           0
repwgt1           0
repwgt2           0
                 ..
bachelors         0
unemployed        0
out_lf            0
born_abroad       0
likely_retired    0
Length: 317, dtype: int64

In [10]:
###--------------------find features-----------------------------###
# Assuming that our target variable is 'asset_values'
# this is only to showcase how to model the replicate weights
target = 'asset_values'
repl_weights = [col for col in num_column.columns if 'repwgt' in col]
features =  [col for col in num_column.columns if col not in repl_weights and col != target and 'area_state' not in col]

In [11]:
###-----------------Fit data on Primary Weight-----------------###
###-------------------Primary Weights---------------------------###
primary_weight = 'wpfinwgt'
X = num_column[features]
y = num_column[target]
sample_weight = num_column[primary_weight]

###------------------Split Data-------------------------------###
X_train, X_test, y_train, y_test, weights_train, weights_test = train_test_split(X, y, sample_weight, test_size=0.3, random_state=42)

model = LinearRegression()
"Extra Coding process needed if doing cross-validation, or other hyperparametization"
"Might have to make process changes based on a different model."
model.fit(X_train, y_train, sample_weight=weights_train)

# Evaluate the model
# predict on the test set
baseline_predictions = model.predict(X_test)
baseline_mse = mean_squared_error(y_test, baseline_predictions)

print(f'Baseline Model MSE: {baseline_mse}')
print(f'Model Coefficients: {model.coef_}')
print(f'Intercept: {model.intercept_}')

Baseline Model MSE: 1.3619245732091876e-17
Model Coefficients: [ 8.36045858e-15  8.27180613e-25  5.24943158e-14  1.00000000e+00
  0.00000000e+00 -8.62624972e-16  5.07694077e-13 -5.17836841e-12
  3.22429557e-12 -1.26186026e-11  1.29309160e-10  4.98543644e-16
  3.25362982e-16  2.37011237e-16  1.41619216e-15  1.00000000e+00
  1.00000000e+00  4.59098527e-12 -3.15066900e-17  2.11580761e-11
  7.79448002e-12 -1.37981622e-11 -2.91155665e-11 -2.45205092e-11]
Intercept: 2.3283064365386963e-10


#### Extract Replicate Weights AND Values
This method extract the weights and values to optimally help us train the weights and X and Y values simultaneously.

In [12]:
repl_weights_columns = [f"repwgt{i}" for i in range(1, 241)]
replicate_weights = num_column[repl_weights_columns]

In [13]:
###-----------------------Check Shapes---------------------####
### Check Shapes of X and repl_weights
print(f"X shape: {X.shape}")
print(f"replicate_weights: {(replicate_weights).shape}")

X shape: (17438, 24)
replicate_weights: (17438, 240)


In [14]:
###--------------------------Train and Test Replicate Weights alongside the X and Y Values--------------------------------------###
X_train, X_test, y_train, y_test, weights_train, weights_test = train_test_split(X, y, replicate_weights, test_size=0.3, random_state=42)

In [22]:
###----------------------Due to Multiple replicate weights per sample (10/sample to be precise) we could average them to prevent-----------###
###----------------------1D or Scalar Error---------------------------###
weights_train = weights_train.mean(axis=1)

In [18]:
weights_train.shape


(12206, 240)

In [21]:
X_train.shape

(12206, 24)

In [23]:
###-----------------Fit on Replicate Weights-----------------------###
repl_prod = []
model.fit(X_train, y_train, sample_weight=weights_train)
predictions = model.predict(X_test)
repl_prod.append(predictions)

In [24]:
###------------------Evaluate Model-----------------------------###
mse = mean_squared_error(y_test, predictions)
print(f"mean_squared_error: {mse}")

mean_squared_error: 1.1049669288079574e-17


In [28]:
# calculate variance across replicate predictions
prediction_variance = np.var(repl_prod)
# Average predictions across replicate models for final estimate.
final_predictions = np.mean(repl_prod)

print(f'Average Predictions for replicate weights: \n{final_predictions}')

Average Predictions for replicate weights: 
860479.2824923544
