In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib import colors
import pandas as pd
# import scipy as needed

In [3]:
#Import  and Election data

solar_raw = pd.read_csv("./deepsolar_tract.csv", encoding = "ISO-8859-1")

In [4]:
solar_grouped_temp = solar_raw.groupby(['county', 'state'], as_index=False)
solar_grouped_sumed = solar_grouped_temp.sum().sort_values(['county', 'state'])
solar_grouped_avged = solar_grouped_temp.mean().sort_values(['county', 'state'])
ratio_panels_to_housing_units = solar_grouped_sumed['tile_count'] / solar_grouped_sumed['housing_unit_count']
solar_grouped_sums = solar_grouped_sumed[['county', 'state', 'tile_count', 'solar_system_count', 'total_panel_area', 'land_area', 'total_area', 'population', 'education_high_school_graduate', 'education_bachelor', 'employed']].sort_values(['county', 'state'])
solar_grouped_avgs = solar_grouped_avged[['county', 'state', 'population_density', 'average_household_income', 'race_asian_rate', 'race_black_africa_rate', 'race_indian_alaska_rate', 'race_islander_rate', 'race_white_rate', 'race_other_rate', 'race_two_more_rate', 'diversity','voting_2016_dem_percentage', 'voting_2016_gop_percentage', 'avg_electricity_retail_rate', 'gini_index']].sort_values(['county', 'state'])

solar_grouped_all = pd.merge(solar_grouped_sums, solar_grouped_avgs, on=['county', 'state']).sort_values(['county', 'state'])
solar_grouped_all['ratio_panels_to_housing_units'] = ratio_panels_to_housing_units
solar_grouped_all['education_highschool_percentage'] = solar_grouped_all['education_high_school_graduate'] / solar_grouped_all['population']
solar_grouped_all['education_bachelor_percentage'] = solar_grouped_all['education_bachelor'] / solar_grouped_all['population']
solar_grouped_all['employed_percentage'] = solar_grouped_all['employed'] / solar_grouped_all['population']
solar_grouped_all = solar_grouped_all.drop(columns=['education_high_school_graduate', 'education_bachelor', 'employed'])
solar_grouped_all = solar_grouped_all.sort_values(['state', 'county'])

solar_grouped_all = solar_grouped_all.drop([2090])

solar_grouped_all.head()

Unnamed: 0,county,state,tile_count,solar_system_count,total_panel_area,land_area,total_area,population,population_density,average_household_income,...,race_two_more_rate,diversity,voting_2016_dem_percentage,voting_2016_gop_percentage,avg_electricity_retail_rate,gini_index,ratio_panels_to_housing_units,education_highschool_percentage,education_bachelor_percentage,employed_percentage
107,Autauga County,al,28.0,23.0,2567.54249,594.436015,604.388097,55221,810.895779,64064.29948,...,0.017455,0.342924,0.239569,0.734358,9.27,0.424858,0.00124,0.220604,0.08868,0.434364
116,Baldwin County,al,254.0,201.0,18351.453746,1589.783989,2027.31119,195121,430.466918,66657.723781,...,0.016964,0.213399,0.195653,0.773515,9.27,0.439061,0.002387,0.202085,0.135239,0.440511
128,Barbour County,al,8.0,6.0,1010.677426,884.876361,904.515217,26932,133.333457,44485.459066,...,0.011599,0.527576,0.466603,0.522714,9.27,0.457533,0.000677,0.240977,0.052428,0.319211
197,Bibb County,al,2.0,1.0,225.353902,622.5823,626.16879,22604,46.52071,52656.067494,...,0.01735,0.285977,0.21422,0.769662,9.27,0.42875,0.000223,0.290833,0.041851,0.366926
217,Blount County,al,18.0,6.0,960.822726,644.77589,650.62821,57710,93.144891,53897.00915,...,0.016832,0.09034,0.084699,0.898519,9.27,0.397744,0.000754,0.228366,0.058846,0.384491


In [5]:
solar_grouped_all.to_csv(r'./deepsolar_counties_cleaned.csv')

## Linear Regression
Linear regression to predict ratio of panels to housing units; using coefficient to determine what contributes most to the predictions.

In [27]:
from sklearn.model_selection import train_test_split
import sklearn.linear_model as lm


linear_regression_data = solar_grouped_all[['voting_2016_gop_percentage', 'race_indian_alaska_rate', 'race_asian_rate', 'race_black_africa_rate', 'race_islander_rate', 'race_two_more_rate', 'race_other_rate', 'diversity', 'gini_index', 'education_highschool_percentage', 'education_bachelor_percentage', 'employed_percentage', 'average_household_income', 'ratio_panels_to_housing_units']]

train, test = train_test_split(linear_regression_data, test_size=0.2)
train_y = train['ratio_panels_to_housing_units']
test_y = test['ratio_panels_to_housing_units']
train_x = train.drop(columns='ratio_panels_to_housing_units')
test_x = test.drop(columns='ratio_panels_to_housing_units')

In [29]:
model = lm.LinearRegression(fit_intercept=True)
model.fit(train_x, train_y)

fitted_y = model.predict(train_x)
predicted_y = model.predict(test_x)

In [30]:
def rmse(actual, predicted):
    return np.sqrt(np.mean((actual-predicted)**2))

In [31]:
training_error = rmse(train_y, fitted_y)
val_error = rmse(test_y, predicted_y)
(training_error, val_error)

(0.011159104202457219, 0.012539791369006117)

Our error is consistent across the training and testing data, so our model has no significant bias and any error is due to random chance.

In [32]:
d = {'features':train_x.columns, 'coefficients':model.coef_}
coefficients = pd.DataFrame(data=d)
coefficients

Unnamed: 0,features,coefficients
0,voting_2016_gop_percentage,-0.01190074
1,race_indian_alaska_rate,-0.01156294
2,race_asian_rate,0.1500907
3,race_black_africa_rate,-0.007132113
4,race_islander_rate,0.4039289
5,race_two_more_rate,0.01858654
6,race_other_rate,0.0664155
7,diversity,0.00280926
8,gini_index,-0.02120639
9,education_highschool_percentage,-0.007757804
