In [1]:
# import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error

In [2]:
# load the dataset from a CSV file into a pandas DataFrame
df = pd.read_csv('hatecrime.csv')

# print the first five data entries
print(df.head())

        state  median_household_income  share_unemployed_seasonal  \
0     Alabama                    42278                      0.060   
1      Alaska                    67629                      0.064   
2     Arizona                    49254                      0.063   
3    Arkansas                    44922                      0.052   
4  California                    60487                      0.059   

   share_population_in_metro_areas  share_population_with_high_school_degree  \
0                             0.64                                     0.821   
1                             0.63                                     0.914   
2                             0.90                                     0.842   
3                             0.69                                     0.824   
4                             0.97                                     0.806   

   share_non_citizen  share_white_poverty  gini_index  share_non_white  \
0               0.02          

In [3]:
# clean the dataseet of missing or NaN values
df = df.dropna()

# define the independent (feature) variables
X = df[['gini_index',
        'share_unemployed_seasonal', 
        'share_voters_voted_trump', 'share_population_in_metro_areas',
        'share_non_white']]

# define the dependent (target) variable
y = df['hate_crimes_per_100k_splc']

In [4]:
# add a constant term to the independent variables 'X'
X = sm.add_constant(X)

# initialize and fit the multiple linear regression model using the Ordinary Least Squares (OLS) method
model = sm.OLS(y, X).fit()

# print the summary statistics of the fitted model
print(model.summary())

# print the coefficients of the fitted model
print(model.params)

                                OLS Regression Results                               
Dep. Variable:     hate_crimes_per_100k_splc   R-squared:                       0.550
Model:                                   OLS   Adj. R-squared:                  0.492
Method:                        Least Squares   F-statistic:                     9.537
Date:                       Sat, 15 Feb 2025   Prob (F-statistic):           5.29e-06
Time:                               18:35:04   Log-Likelihood:                 16.728
No. Observations:                         45   AIC:                            -21.46
Df Residuals:                             39   BIC:                            -10.62
Df Model:                                  5                                         
Covariance Type:                   nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------

In [5]:
# # generate a meshgrid for the 3D plot with 100 evenly spaced points between the features
# X_axis, Y_axis = np.meshgrid(
#     np.linspace(X['median_household_income'].min(), X['median_household_income'].max(), 100), 
#     np.linspace(X['gini_index'].min(), X['gini_index'].max(), 100)  
# )

# # calculate the Z-axis values using the model's parameters
# Z_axis = (
#     model.params.iloc[0]  
#     + model.params.iloc[1] * X_axis  
#     + model.params.iloc[2] * Y_axis  
# )

# # initialize a 3D plot
# fig = plt.figure(figsize=(12, 8))
# ax = fig.add_subplot(111, projection='3d')

# # plot the surface based on the meshgrid
# ax.plot_surface(X_axis, Y_axis, Z_axis, cmap='coolwarm', alpha=0.5, linewidth=0, antialiased=False)

# # add a scatter plot of the actual data points
# ax.scatter(df['median_household_income'], df['gini_index'], df['hate_crimes_per_100k_splc'], color='red')

# # label the axes to indicate what each axis represents
# ax.set_xlabel('Median Household Income')
# ax.set_ylabel('Gini Index')
# ax.set_zlabel('Hate Crimes per 100,000 Population')

# # add a title to the plot
# ax.set_title('3D Plot of Median Household Income and Gini Index vs. Hate Crimes per 100,000 Population')

# plt.show()