In [1]:
#import dependencies
import pandas as pd
from pathlib import Path
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import accuracy_score
import psycopg2

In [2]:
# #read in preprocessed data
# df = pd.read_csv(Path('Resources/homeless_edu.csv'))
# df.head()

In [3]:
# Create Connection
conn = psycopg2.connect(database="postgres", user="postgres", password="admin", host="127.0.0.1", port="5432")
print("Database opened successfully")

Database opened successfully


In [4]:
# Create a cursor object
cur = conn.cursor()

In [5]:
# A sample query of all data from the "homeless_edu" table in the "posgres" database 
cur.execute("""SELECT * FROM homeless_edu""")
homeless_edu = cur.fetchall()
print(homeless_edu)

[('2007_AL             ', 7069040, 7196459, 216941, 742919, '2007', 'AL', 8075, 3822, 11897), ('2007_AK             ', 1800616, 1938755, 42049, 131029, '2007', 'AK', 2995, 567, 3562), ('2007_AZ             ', 8724434, 8709531, 316376, 1087447, '2007', 'AZ', 17886, 14210, 32096), ('2007_AR             ', 4415981, 4779308, 138921, 479016, '2007', 'AR', 4979, 3545, 8524), ('2007_CA             ', 72516936, 73225422, 2011865, 6343471, '2007', 'CA', 102972, 215341, 318313), ('2007_CO             ', 7689014, 8039146, 236141, 801867, '2007', 'CO', 14346, 16154, 30500), ('2007_CT             ', 8913149, 8685421, 176592, 570626, '2007', 'CT', 7938, 2049, 9987), ('2007_DE             ', 1694929, 1705204, 37555, 122574, '2007', 'DE', 1784, 488, 2272), ('2007_DC             ', 1148358, 1140506, 20242, 78422, '2007', 'DC', 11399, 1001, 12400), ('2007_FL             ', 28898362, 29849892, 810952, 2666811, '2007', 'FL', 42900, 60701, 103601), ('2007_GA             ', 17755989, 17389530, 471012, 16495

In [6]:
# Create DataFrame
df = pd.DataFrame(homeless_edu, columns=["State_Year", "TOTAL_REVENUE", "TOTAL_EXPENDITURE", "GRADES_9_12_G", "GRADES_ALL_G", "Year", "State", "Sheltered_Cnt", "Unsheltered_Cnt", "Other_Cnt"])
df.head()

Unnamed: 0,State_Year,TOTAL_REVENUE,TOTAL_EXPENDITURE,GRADES_9_12_G,GRADES_ALL_G,Year,State,Sheltered_Cnt,Unsheltered_Cnt,Other_Cnt
0,2007_AL,7069040,7196459,216941,742919,2007,AL,8075,3822,11897
1,2007_AK,1800616,1938755,42049,131029,2007,AK,2995,567,3562
2,2007_AZ,8724434,8709531,316376,1087447,2007,AZ,17886,14210,32096
3,2007_AR,4415981,4779308,138921,479016,2007,AR,4979,3545,8524
4,2007_CA,72516936,73225422,2011865,6343471,2007,CA,102972,215341,318313


In [7]:
# Drop 'State_Year' Column
df = df.drop(columns=["State_Year"])
df.head()

Unnamed: 0,TOTAL_REVENUE,TOTAL_EXPENDITURE,GRADES_9_12_G,GRADES_ALL_G,Year,State,Sheltered_Cnt,Unsheltered_Cnt,Other_Cnt
0,7069040,7196459,216941,742919,2007,AL,8075,3822,11897
1,1800616,1938755,42049,131029,2007,AK,2995,567,3562
2,8724434,8709531,316376,1087447,2007,AZ,17886,14210,32096
3,4415981,4779308,138921,479016,2007,AR,4979,3545,8524
4,72516936,73225422,2011865,6343471,2007,CA,102972,215341,318313


In [8]:
# convert 'Year' to Integer
df["Year"].astype('int64')

0      2007
1      2007
2      2007
3      2007
4      2007
       ... 
505    2016
506    2016
507    2016
508    2016
509    2016
Name: Year, Length: 510, dtype: int64

In [9]:
#encode State
merged_encoded_df = pd.get_dummies(df, columns=["State"])

In [10]:
#create features 
X = merged_encoded_df.drop(columns=['Sheltered_Cnt','Unsheltered_Cnt','Other_Cnt'])
y = merged_encoded_df[['Sheltered_Cnt','Unsheltered_Cnt','Other_Cnt']]

In [11]:
#split in training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [12]:
#create encoder instance
scaler = StandardScaler()

In [13]:
#fit the scaler
X_scaler = scaler.fit(X_train)

In [14]:
#transform data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [15]:
#instantiate model
model = linear_model.LinearRegression()

In [16]:
#fit model
model.fit(X_train_scaled,y_train)

LinearRegression()

In [17]:
#generate predictions
y_pred = model.predict(X_test_scaled)

In [18]:
#get r2 value to determine if model is good fit
metrics.r2_score(y_test, y_pred)

0.9759796173355365

In [19]:
print(y_pred)

[[ 2.55465158e+03  1.02867775e+03  3.58332933e+03]
 [ 1.74298681e+04  1.46651037e+04  3.20949718e+04]
 [ 2.65827965e+04  3.25005177e+03  2.98328482e+04]
 [ 7.33183052e+03  3.20512405e+03  1.05369546e+04]
 [ 9.65230401e+04  2.05979407e+05  3.02502448e+05]
 [ 4.82815557e+03 -7.30786457e+02  4.09736911e+03]
 [ 3.64877767e+03 -3.99190571e+02  3.24958710e+03]
 [ 1.07009757e+04  8.92742718e+03  1.96284029e+04]
 [ 4.79395247e+04  6.78110310e+04  1.15750556e+05]
 [ 4.94208810e+04  3.93209025e+04  8.87417835e+04]
 [ 3.58461651e+03  9.35482475e+02  4.52009899e+03]
 [ 1.74213218e+04  1.04367967e+04  2.78581185e+04]
 [ 2.26192822e+03  2.11094555e+03  4.37287377e+03]
 [ 1.52356861e+04  6.36419179e+03  2.15998779e+04]
 [ 3.40571080e+03  1.12653884e+03  4.53224964e+03]
 [ 1.45849374e+04  2.40781707e+04  3.86631080e+04]
 [ 5.41122903e+03 -1.62618100e+02  5.24861093e+03]
 [ 2.04751522e+03  1.14399428e+03  3.19150950e+03]
 [ 1.56172673e+04  7.55249021e+03  2.31697575e+04]
 [ 8.29267440e+03  1.22739011e+