In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
data_path = 'RJsKXWqDBZc3m0GG.csv'
data = pd.read_csv(data_path)

# One-hot encode the categorical variables.
# drop='first' omits one level per column so the dummy columns are not
# perfectly collinear with the model intercept; sparse_output=False makes the
# encoder return a dense array (this keyword replaced `sparse` in scikit-learn 1.2).
encoder = OneHotEncoder(drop='first', sparse_output=False)
# NOTE(review): 'Eduacation' presumably mirrors the spelling of the CSV header —
# verify against the file before "correcting" it.
categorical_columns = ['Eduacation', 'Race', 'Hisp', 'MaritalStatus']
encoded_data = encoder.fit_transform(data[categorical_columns])

# Wrap the encoded array in a DataFrame. The index is copied from `data`
# because pd.concat(axis=1) aligns on index labels: with a fresh RangeIndex,
# any non-default index on `data` would silently misalign rows and add NaNs.
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=data.index,
)

# Replace the raw categorical columns with their encoded counterparts.
# The 1974 and 1975 earnings columns are also excluded from the feature set.
columns_to_drop = categorical_columns + ['Earnings_1974', 'Earnings_1975']
data_preprocessed = pd.concat([data.drop(columns=columns_to_drop), encoded_df], axis=1)

# Separate the response column from the predictors
target_column = 'Earnings_1978'
y = data_preprocessed[target_column]
X = data_preprocessed.drop(columns=[target_column])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a linear regression on the training portion
model = LinearRegression().fit(X_train, y_train)

# Generate earnings predictions for the held-out rows
y_pred = model.predict(X_test)

# Calculate RMSE and R-squared metrics on the held-out set.
# The root is taken explicitly instead of passing squared=False: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, where it raises a
# TypeError. This form works on every release.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
r2 = r2_score(y_test, y_pred)

print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared: {r2}")


Root Mean Squared Error (RMSE): 9185.571360383461
R-squared: 0.09200104293622746


