In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

In [2]:
# Location of the Excel workbook and the worksheet to read from it.
file_path = './Model.xlsx'
sheet_name = 'Trimmed_Data'

# Read that single worksheet into a DataFrame.
data = pd.read_excel(io=file_path, sheet_name=sheet_name)

In [3]:
# Show the column labels of the loaded sheet.
print(data.columns)

Index(['TOTEX', 'TOINC', 'URB', 'FSIZE', 'emp_status', 'RPROV'], dtype='object')


In [4]:
# Name the column groups once, then slice the loaded frame with them.
categorical_cols = ["URB", "emp_status"]
numeric_cols = ["TOINC", "FSIZE"]

X_Categorical = data[categorical_cols]  # columns passed to get_dummies later
X_Num = data[numeric_cols]              # columns used as-is
y = data["TOTEX"]                       # regression target

In [5]:
# Dummy-encode the categorical columns, dropping the first level of each.
# NOTE(review): with no `columns=` argument, get_dummies only converts
# object/string/category dtypes. If URB and emp_status are stored as numeric
# codes this call returns them unchanged — the later selection of plain
# 'URB' and 'emp_status' columns suggests that is the case. Confirm whether
# they should be cast to `category` first.
X_Categorical_encoded = pd.get_dummies(X_Categorical, drop_first=True)

In [6]:
# Place the numeric columns and the encoded categorical columns side by side.
X_nopoly = pd.concat(objs=[X_Num, X_Categorical_encoded], axis="columns")

In [7]:
# Expand the design matrix with all degree-2 terms (squares and pairwise
# products); include_bias=False omits the constant column.
poly = PolynomialFeatures(degree=2, include_bias=False)  # Adjust degree as needed
X_poly = poly.fit_transform(X_nopoly)

# Wrap the transformed array in a labelled frame. Carry over X_nopoly's index
# so the rows keep the same labels as `y`; without it the frame silently gets
# a fresh 0..n-1 index, which misaligns with `y` whenever `data` is not
# indexed 0..n-1.
X_all = pd.DataFrame(
    X_poly,
    columns=poly.get_feature_names_out(X_nopoly.columns),
    index=X_nopoly.index,
)

In [8]:
# Only the four first-order columns are kept for the regression; the squared
# and interaction columns generated above are not selected.
# NOTE(review): the saved outputs further down list an extra RPROV row that is
# not in this list, so they appear to come from an earlier run — re-run to
# refresh them.
selected_features = ['TOINC', 'FSIZE', 'URB', 'emp_status']

X = X_all.loc[:, selected_features]

In [9]:
# Pairwise correlations between the selected feature columns.
corr_matrix = X.corr()

# Print the heading and the matrix on separate lines.
print("Correlation Matrix:", corr_matrix, sep="\n")

Correlation Matrix:
               TOINC     FSIZE       URB  emp_status     RPROV
TOINC       1.000000  0.230243 -0.291632    0.196482  0.050848
FSIZE       0.230243  1.000000  0.015359    0.202455  0.002473
URB        -0.291632  0.015359  1.000000   -0.100913 -0.153588
emp_status  0.196482  0.202455 -0.100913    1.000000 -0.008849
RPROV       0.050848  0.002473 -0.153588   -0.008849  1.000000


In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Ordinary least squares with scikit-learn defaults, fitted on the training rows.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [12]:
# Predict the target for the held-out rows.
# NOTE(review): `predictions` is not read again in the visible cells; a later
# cell recomputes the same call as `y_pred`.
predictions = model.predict(X_test)

In [13]:
# One row per fitted feature: its column name next to its slope.
coefficients = pd.DataFrame(
    {
        'Feature': X.columns,
        'Coefficient': model.coef_,
    }
)
print("Model Coefficients:", coefficients, sep="\n")

# LinearRegression.score returns R^2, here evaluated on the held-out rows.
r2_score = model.score(X_test, y_test)
print(f"R^2 Score on Test Set: {r2_score}")


Model Coefficients:
      Feature   Coefficient
0       TOINC      0.637144
1       FSIZE   4659.959251
2         URB -12939.840326
3  emp_status  -7721.229610
4       RPROV      1.261501
R^2 Score on Test Set: 0.7838658846854991


In [14]:
# Pull the fitted parameters off the model.
# NOTE: this rebinds `coefficients` from the DataFrame built in the previous
# cell to the raw coefficient array.
intercept = model.intercept_
coefficients = model.coef_

# Example: Combining into a single output
print("Full Model:")
print(f"Constant: {intercept}")
# Label each coefficient with the column it was fitted on (same source as the
# coefficient table above) instead of a hand-maintained list, so the labels
# cannot drift or be silently truncated by zip if the feature set changes.
for feature, coef in zip(X.columns, coefficients):
    print(f"{feature}: {coef}")

Full Model:
Constant: 39339.5356323272
TOINC: 0.6371444178278649
FSIZE: 4659.959250591819
URB: -12939.840326317624
emp_status: -7721.2296099513205


In [15]:
# Predict on the held-out rows and line the predictions up with the truth.
y_pred = model.predict(X_test)

# .values drops y_test's index so both columns share a fresh 0..n-1 index.
comparison_df = pd.DataFrame(
    {
        "Actual": y_test.values,
        "Predicted (Scikit-learn)": y_pred,
    }
)

# Residual = actual minus predicted.
comparison_df["Residuals (Scikit-learn)"] = comparison_df["Actual"].sub(
    comparison_df["Predicted (Scikit-learn)"]
)

print(comparison_df)

         Actual  Predicted (Scikit-learn)  Residuals (Scikit-learn)
0      191125.0             187054.401666               4070.598334
1      360385.0             440080.258958             -79695.258958
2      379591.0             458279.474699             -78688.474699
3      347928.0             379121.275491             -31193.275491
4       79297.0              98123.428208             -18826.428208
...         ...                       ...                       ...
29490   68817.0              84958.184891             -16141.184891
29491  214037.0             156133.315855              57903.684145
29492  195583.0             184872.844648              10710.155352
29493  183618.0             185336.742308              -1718.742308
29494  260921.0             257410.587241               3510.412759

[29495 rows x 3 columns]


In [16]:
# Write the comparison table to disk without the row index column.
comparison_df.to_csv(path_or_buf='predictions.csv', index=False)