In [22]:
import pandas as pd
import numpy as np
from sklearn import linear_model as lm

# Select the single "most important" predictor of gross sales by picking the
# numeric column with the largest standard deviation, then fit a one-variable
# linear regression of the target on that column and record its R^2 score.

# Load the raw inputs.
taxData = pd.read_csv("sales-and-use-tax.csv")
# header=1: take the column names from the second line of the file.
popData = pd.read_csv("popVermontTowns.csv", header=1)

# Clean population data: the 'CTC' column is not used by this analysis.
popData = popData.drop(columns='CTC')

# Data engineering steps -- prepare for merging.
# Reshape the population table from wide (one column per year) to long form
# with three columns: NAME, Calendar Year, Population.
popData2 = popData.melt(
    id_vars=['NAME'],
    var_name="Calendar Year",
    value_name="Population",
)

# Cast the melted columns to integer types so that 'Calendar Year' can be used
# as a merge key against the tax data.
popData2 = popData2.astype({'Calendar Year': 'int64', 'Population': 'int32'})

# Inner-join tax and population records on (town, year). Because the town key
# is named differently on each side, both 'Town' and 'NAME' remain as string
# columns in the merged frame.
mergedData = taxData.merge(
    popData2,
    left_on=["Town", "Calendar Year"],
    right_on=["NAME", "Calendar Year"],
    how='inner',
)

# Feature we want to predict.
targetName = 'Gross'
DataFeatures = mergedData.drop(columns=targetName)

# Data mining step: rank candidate features by standard deviation.
# numeric_only=True is required because DataFeatures still contains string
# columns ('Town', 'NAME'); without it pandas >= 2.0 raises a TypeError, while
# older versions silently skipped those columns.
featureSpread = DataFeatures.std(numeric_only=True)
mostImportantFeature = featureSpread.sort_values(ascending=False).head(1)

# Name of the feature chosen by the ranking above.
featureName = mostImportantFeature.index[0]

# Test the choice: scikit-learn expects 2-D inputs, so reshape both the
# feature and the target into single-column arrays before fitting.
xb = mergedData[featureName].to_numpy().reshape(-1, 1)
yb = mergedData[targetName].to_numpy().reshape(-1, 1)
bestModel = lm.LinearRegression().fit(xb, yb)
# R^2 on the training data itself (no held-out set is used here).
bestModelScore = bestModel.score(xb, yb)

# NOTE: a worst-case comparison (fitting the same model on 'Calendar Year'
# against 'Gross') was previously sketched here and is currently disabled.

# NOTE: no score threshold is actually checked; the selected column is used
# regardless of bestModelScore.
BestDataModel = DataFeatures[featureName]

print(BestDataModel)

0      4.890067e+05
1      1.857137e+06
2      1.431322e+07
3      1.672038e+07
4      9.368863e+07
           ...     
780    3.834357e+08
781    3.381616e+07
782    5.526718e+06
783    1.867886e+06
784    2.302616e+07
Name: Retail, Length: 785, dtype: float64


Retail has the largest standard deviation (5.0e+07).

Calendar Year has the smallest standard deviation (1.13).