In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [7]:
data = pd.read_csv('data/wage.csv')
data

Unnamed: 0,year,age,sex,maritl,race,education,region,jobclass,health,health_ins,logwage,wage
0,2006,18,1. Male,1. Never Married,1. White,1. < HS Grad,2. Middle Atlantic,1. Industrial,1. <=Good,2. No,4.318063,75.043154
1,2004,24,1. Male,1. Never Married,1. White,4. College Grad,2. Middle Atlantic,2. Information,2. >=Very Good,2. No,4.255273,70.476020
2,2003,45,1. Male,2. Married,1. White,3. Some College,2. Middle Atlantic,1. Industrial,1. <=Good,1. Yes,4.875061,130.982177
3,2003,43,1. Male,2. Married,3. Asian,4. College Grad,2. Middle Atlantic,2. Information,2. >=Very Good,1. Yes,5.041393,154.685293
4,2005,50,1. Male,4. Divorced,1. White,2. HS Grad,2. Middle Atlantic,2. Information,1. <=Good,1. Yes,4.318063,75.043154
...,...,...,...,...,...,...,...,...,...,...,...,...
2995,2008,44,1. Male,2. Married,1. White,3. Some College,2. Middle Atlantic,1. Industrial,2. >=Very Good,1. Yes,5.041393,154.685293
2996,2007,30,1. Male,2. Married,1. White,2. HS Grad,2. Middle Atlantic,1. Industrial,2. >=Very Good,2. No,4.602060,99.689464
2997,2005,27,1. Male,2. Married,2. Black,1. < HS Grad,2. Middle Atlantic,1. Industrial,1. <=Good,2. No,4.193125,66.229408
2998,2005,27,1. Male,1. Never Married,1. White,3. Some College,2. Middle Atlantic,1. Industrial,2. >=Very Good,1. Yes,4.477121,87.981033


In [10]:
y = data['wage']
X = data.drop(columns=['wage'])

In [12]:
y

0        75.043154
1        70.476020
2       130.982177
3       154.685293
4        75.043154
           ...    
2995    154.685293
2996     99.689464
2997     66.229408
2998     87.981033
2999     90.481913
Name: wage, Length: 3000, dtype: float64

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [39]:
scale = StandardScaler()
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

X_train_num = scale.fit_transform(X_train.select_dtypes(include=[np.number]))
X_train_cat = ohe.fit_transform(X_train.select_dtypes(exclude=[np.number]))
X_test_num = scale.transform(X_test.select_dtypes(include=[np.number]))
X_test_cat = ohe.transform(X_test.select_dtypes(exclude=[np.number]))

X_train_processed = np.hstack((X_train_num, X_train_cat))
X_test_processed = np.hstack((X_test_num, X_test_cat))

In [40]:
X_train_processed

array([[ 0.61086637,  0.7549311 ,  2.76783685, ...,  0.        ,
         1.        ,  0.        ],
       [-1.36898412, -0.02606951,  0.46389006, ...,  1.        ,
         1.        ,  0.        ],
       [ 1.60079161, -1.41451505, -0.30873468, ...,  1.        ,
         0.        ,  1.        ],
       ...,
       [ 0.61086637, -1.24095936, -0.49649973, ...,  1.        ,
         0.        ,  1.        ],
       [-0.37905887,  0.7549311 ,  0.12571745, ...,  1.        ,
         1.        ,  0.        ],
       [ 1.60079161,  1.18882033, -0.76830237, ...,  0.        ,
         0.        ,  1.        ]])

In [30]:
X_train_processed = pd.DataFrame(X_train_processed, columns=np.concatenate((X_train.select_dtypes(include=[np.number]).columns, ohe.get_feature_names_out())))
X_test_processed = pd.DataFrame(X_test_processed, columns=np.concatenate((X_test.select_dtypes(include=[np.number]).columns, ohe.get_feature_names_out())))

In [31]:
X_train_processed

Unnamed: 0,year,age,logwage,sex_1. Male,maritl_1. Never Married,maritl_2. Married,maritl_3. Widowed,maritl_4. Divorced,maritl_5. Separated,race_1. White,...,education_3. Some College,education_4. College Grad,education_5. Advanced Degree,region_2. Middle Atlantic,jobclass_1. Industrial,jobclass_2. Information,health_1. <=Good,health_2. >=Very Good,health_ins_1. Yes,health_ins_2. No
0,0.610866,0.754931,2.767837,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
1,-1.368984,-0.026070,0.463890,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
2,1.600792,-1.414515,-0.308735,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
3,0.115904,1.622710,0.970014,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
4,-1.368984,0.321042,-0.308735,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,1.600792,0.928487,0.125717,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
2396,1.600792,1.622710,0.426408,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
2397,0.610866,-1.240959,-0.496500,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
2398,-0.379059,0.754931,0.125717,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0


In [44]:
lr = LinearRegression()

lr.fit(X_train_processed, y_train)

# Evaluate the model
y_pred = lr.predict(X_test_processed)

# Since this is a regression problem, we will not use accuracy score or confusion matrix
print("Regression model coefficients:")
print(lr.coef_)
print("Regression model intercept:")
print(lr.intercept_)

# Find the mean squared error, R-squared, and other metrics
from sklearn.metrics import mean_squared_error, r2_score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")



Regression model coefficients:
[-0.41485746 -0.23587421 40.22064628  0.          1.80686184  0.31041305
 -2.43500791  0.79896295 -0.48122993  0.20043705 -0.14344559 -0.71732197
  0.66033051 -0.25710476 -1.71229597 -2.4475499  -0.52049023  4.93744086
  0.         -0.38927601  0.38927601  0.0736729  -0.0736729  -2.31859603
  2.31859603]
Regression model intercept:
112.60679794175027
Mean Squared Error: 109.62247845964252
R-squared: 0.9296849165939628


In [None]:
from sklearn.linear_model import PolynomialFeatures
