In [81]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import statsmodels.api as sm
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression

import warnings
warnings.filterwarnings("ignore")

# Seed NumPy's global random number generator so that anything drawing from it
# is repeatable when the notebook is run top to bottom.
np.random.seed(1947)

# Load the dataset from a CSV file in the current working directory and
# preview the first few rows.
data = pd.read_csv("communities.csv")
data.head()

Unnamed: 0,state,county,community,communityname,population,householdsize,racepctblack,racePctWhite,racePctAsian,racePctHisp,...,LandArea,PopDens,PctUsePubTrans,PolicCars,PolicOperBudg,LemasPctPolicOnPatr,LemasGangUnitDeploy,LemasPctOfficDrugUn,PolicBudgPerPop,ViolentCrimesPerPop
0,8,,,Lakewoodcity,0.19,0.33,0.02,0.9,0.12,0.17,...,0.12,0.26,0.2,0.06,0.04,0.9,0.5,0.32,0.14,0.2
1,53,,,Tukwilacity,0.0,0.16,0.12,0.74,0.45,0.07,...,0.02,0.12,0.45,,,,,0.0,,0.67
2,24,,,Aberdeentown,0.0,0.42,0.49,0.56,0.17,0.04,...,0.01,0.21,0.02,,,,,0.0,,0.43
3,34,5.0,81440.0,Willingborotownship,0.04,0.77,1.0,0.08,0.12,0.1,...,0.02,0.39,0.28,,,,,0.0,,0.12
4,42,95.0,6096.0,Bethlehemtownship,0.01,0.55,0.02,0.95,0.09,0.05,...,0.04,0.09,0.02,,,,,0.0,,0.03


In [82]:
# (rows, columns) of the frame as loaded, before any columns are removed.
data.shape

(1994, 127)

In [83]:
# Remove the three identifier-style columns in a single call; the remaining
# columns are kept in their original order.
data = data.drop(columns=['county', 'community', 'communityname'])
data.shape

(1994, 124)

In [84]:
# Replace every missing value in the frame with 0 and rebind `data` to the result.
# NOTE(review): the preview above shows several police-related columns that are
# mostly missing; filling with 0 presumably makes them indistinguishable from a
# genuine 0 -- confirm this imputation is intended.
data = data.fillna(value=0)
data

Unnamed: 0,state,population,householdsize,racepctblack,racePctWhite,racePctAsian,racePctHisp,agePct12t21,agePct12t29,agePct16t24,...,LandArea,PopDens,PctUsePubTrans,PolicCars,PolicOperBudg,LemasPctPolicOnPatr,LemasGangUnitDeploy,LemasPctOfficDrugUn,PolicBudgPerPop,ViolentCrimesPerPop
0,8,0.19,0.33,0.02,0.90,0.12,0.17,0.34,0.47,0.29,...,0.12,0.26,0.20,0.06,0.04,0.90,0.5,0.32,0.14,0.20
1,53,0.00,0.16,0.12,0.74,0.45,0.07,0.26,0.59,0.35,...,0.02,0.12,0.45,0.00,0.00,0.00,0.0,0.00,0.00,0.67
2,24,0.00,0.42,0.49,0.56,0.17,0.04,0.39,0.47,0.28,...,0.01,0.21,0.02,0.00,0.00,0.00,0.0,0.00,0.00,0.43
3,34,0.04,0.77,1.00,0.08,0.12,0.10,0.51,0.50,0.34,...,0.02,0.39,0.28,0.00,0.00,0.00,0.0,0.00,0.00,0.12
4,42,0.01,0.55,0.02,0.95,0.09,0.05,0.38,0.38,0.23,...,0.04,0.09,0.02,0.00,0.00,0.00,0.0,0.00,0.00,0.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1989,12,0.01,0.40,0.10,0.87,0.12,0.16,0.43,0.51,0.35,...,0.01,0.28,0.05,0.00,0.00,0.00,0.0,0.00,0.00,0.09
1990,6,0.05,0.96,0.46,0.28,0.83,0.32,0.69,0.86,0.73,...,0.02,0.37,0.20,0.00,0.00,0.00,0.0,0.00,0.00,0.45
1991,9,0.16,0.37,0.25,0.69,0.04,0.25,0.35,0.50,0.31,...,0.08,0.32,0.18,0.08,0.06,0.78,0.0,0.91,0.28,0.23
1992,25,0.08,0.51,0.06,0.87,0.22,0.10,0.58,0.74,0.63,...,0.03,0.38,0.33,0.02,0.02,0.79,0.0,0.22,0.18,0.19


In [85]:
# One-hot encode any non-numeric columns (numeric columns pass through unchanged).
encoded = pd.get_dummies(data)

# Pull the regression target out as a standalone NumPy array, then keep
# everything else as the feature matrix.
labels = encoded['ViolentCrimesPerPop'].to_numpy(copy=True)
data = encoded.drop(columns='ViolentCrimesPerPop')
data.shape

(1994, 123)

In [86]:
# Baseline model: ordinary least squares on the unscaled features.
# 75% of the rows are used for fitting and 25% held out; random_state pins the split.
train_x, test_x, train_y, test_y = train_test_split(data, labels, test_size=0.25, random_state=18)
ols_reg = LinearRegression()
ols_reg.fit(train_x, train_y)
# Predict from the DataFrame itself (not `.values`) so the input type matches
# what the model was fitted on; mixing a DataFrame at fit time with a bare
# array at predict time makes scikit-learn emit a feature-name warning.
pred = ols_reg.predict(test_x)

# Hold-out error metrics; MSE is computed once and reused for its square root.
mse = metrics.mean_squared_error(test_y, pred)
print("MAE:", metrics.mean_absolute_error(test_y, pred))
print("MSE:", mse)
print("sqrt(MSE):", np.sqrt(mse))

MAE: 0.09900506953171738
MSE: 0.019846428922350147
sqrt(MSE): 0.1408773541856538


In [87]:

# Split for the PCA pipeline. The arguments (including random_state) match the
# baseline split, so the same rows land in train and test.
train, test, y_train, y_test = train_test_split(data, labels, test_size=0.25, random_state=18)

# Standardise features using statistics learned from the training rows only,
# then apply that same transformation to both partitions. The scaler works on
# the split made in this cell rather than on variables left over from an
# earlier cell, so the cell no longer depends on hidden kernel state.
scaler = StandardScaler()
scaler.fit(train)
train = scaler.transform(train)
test = scaler.transform(test)

In [88]:
# Principal-component regression: project the standardised features onto the
# first 20 principal components (fitted on the training rows only).
# NOTE(review): no random_state is passed to PCA; if a randomized SVD solver is
# selected, the result depends on the global NumPy seed -- confirm.
pca = PCA(n_components=20)
pca.fit(train)

train = pca.transform(train)
test = pca.transform(test)

# statsmodels OLS does not add an intercept on its own. has_constant='add'
# forces the constant column to be prepended to both matrices; the default
# ('skip') silently omits it when a column already happens to be constant,
# which could leave train and test with different widths.
train = sm.add_constant(train, has_constant='add')
test = sm.add_constant(test, has_constant='add')

reg = sm.OLS(y_train, train).fit()
pred = reg.predict(test)

print(reg.summary())

# Hold-out error metrics; MSE is computed once and reused for its square root.
mse = metrics.mean_squared_error(y_test, pred)
print("MAE:", metrics.mean_absolute_error(y_test, pred))
print("MSE:", mse)
print("sqrt(MSE):", np.sqrt(mse))

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.644
Model:                            OLS   Adj. R-squared:                  0.640
Method:                 Least Squares   F-statistic:                     133.6
Date:                Thu, 04 Feb 2021   Prob (F-statistic):          4.03e-313
Time:                        15:28:43   Log-Likelihood:                 828.89
No. Observations:                1495   AIC:                            -1616.
Df Residuals:                    1474   BIC:                            -1504.
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.2373      0.004     65.557      0.0

In [89]:
# Loadings of the fitted PCA: one row per retained component, one column per
# input feature.
print(pca.components_)

[[-0.00913072  0.10144312 -0.00593623 ...  0.07983601  0.10005911
   0.08494691]
 [-0.07120768  0.11148244  0.04482915 ...  0.08258116  0.09652937
   0.1013575 ]
 [ 0.0450346   0.09619813 -0.18448019 ...  0.07996361  0.09659766
   0.09921582]
 ...
 [ 0.00148792 -0.03240044 -0.03266525 ...  0.01501008 -0.0423157
   0.04749608]
 [-0.01223085 -0.02312958 -0.03989852 ...  0.0184685   0.00718758
   0.00973887]
 [-0.09580449 -0.06712874 -0.01697719 ... -0.04653411 -0.08820194
   0.03110979]]


In [90]:
# Loadings of the first principal component only (one weight per input feature).
print(pca.components_[0])

[-0.00913072  0.10144312 -0.00593623  0.1053797  -0.12297854 -0.01479396
  0.07838346  0.04809225  0.07053231  0.05792637  0.01491892  0.09686512
 -0.01254489 -0.14085327 -0.08254515 -0.01591097 -0.14599712  0.0354381
  0.14114019 -0.02245231 -0.14114392 -0.1258704  -0.1013857  -0.09173674
 -0.03601268 -0.06540466 -0.05896102 -0.09810677  0.12301047  0.14605094
  0.11878544  0.13049775 -0.09870405  0.13411202 -0.0952424   0.00220268
 -0.01288839  0.09648987 -0.10884161  0.114852    0.07805549  0.12092135
  0.12059765  0.03118286 -0.15745035 -0.16074563 -0.15139931 -0.14301894
  0.00119055 -0.02699744  0.11677639  0.14615996  0.08524747  0.05599336
  0.06644419  0.06935088  0.07694613  0.04970893  0.05210422  0.05051501
  0.05224233 -0.06008012  0.07839167  0.08735194  0.06717829 -0.01723685
 -0.02808393  0.05280435 -0.1373835   0.10425547  0.12558462 -0.10068225
  0.10451502 -0.06758569 -0.12816381  0.1055612   0.01960699 -0.03538385
  0.12896119  0.10009249 -0.09222569 -0.08902261 -0.