Import csvs and Create model for new case counts

In [185]:
# Import the modules
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


In [186]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
cancer_df = pd.read_csv(
    "State_Complete_Info.csv")

#Create list of column names for iteration
col_list =  list(cancer_df)

#Iterate through columns to delete NA rows
for i in col_list:
    ny_cancer_df = ny_cancer_df.drop(ny_cancer_df[ny_cancer_df[i] == 'no data/suppressed'].index)
    ny_cancer_df = ny_cancer_df.drop(ny_cancer_df[ny_cancer_df[i] == 'Data not presented'].index)

cancer_df = cancer_df.drop(columns=["Unnamed: 0"] )
cancer_df = cancer_df.drop(columns=["FIPS"] )


# Review the DataFrame
cancer_df.head()

Unnamed: 0,County,Income,Uninsured %,White Poverty %,Black Poverty %,Asian Poverty %,Foreign Poverty %,Native Poverty %,Hispanic Poverty %,Non Hispanic Poverty %,Area,Age-Adjusted New Case Rate,Case Count,Population,Age-Adjusted Death Rate,Death Count
1,Aleutians West,90708,3.8,24.5,3.5,47.5,45.3,10.5,9.7,90.3,Alaska,286.2,51,28538,114.2,19
2,Anchorage,88871,5.4,60.3,5.4,12.6,11.0,7.2,9.5,90.5,Alaska,444.0,6278,1460452,146.1,1878
3,Bethel,57460,18.1,9.6,0.9,1.6,2.3,84.5,2.1,97.9,Alaska,367.5,242,91317,195.0,104
6,Dillingham,62115,12.6,16.2,1.3,0.3,2.5,73.4,3.3,96.7,Alaska,418.0,92,24670,201.4,41
7,Fairbanks North Star,78321,6.3,73.0,4.6,3.9,5.1,7.6,8.3,91.7,Alaska,402.7,1782,492276,127.3,505


In [187]:
#View end of dataframe due to size
cancer_df.tail()

Unnamed: 0,County,Income,Uninsured %,White Poverty %,Black Poverty %,Asian Poverty %,Foreign Poverty %,Native Poverty %,Hispanic Poverty %,Non Hispanic Poverty %,Area,Age-Adjusted New Case Rate,Case Count,Population,Age-Adjusted Death Rate,Death Count
433,Newton,38116,10.4,74.2,20.5,0.7,0.8,0.5,4.1,95.9,Texas,424.3,428,68942,166.4,169
434,Foard,37679,10.1,74.8,0.1,0.8,3.6,0.7,21.6,78.4,Texas,541.5,45,5867,189.0,17
436,Falls,35317,8.1,65.4,22.6,1.7,4.9,0.4,24.3,75.7,Texas,413.8,481,86403,171.5,204
440,Hudspeth,32404,7.5,46.7,1.4,0.8,42.6,1.0,77.7,22.3,Texas,300.9,75,23437,112.1,28
443,Dimmit,25000,5.9,60.4,0.2,1.6,10.2,0.2,87.8,12.2,Texas,348.8,198,51162,128.9,78


In [188]:
#Ensure columns are correct type for model training and testing
cancer_df["Black Poverty %"] = cancer_df["Black Poverty %"].astype(float)
cancer_df["Asian Poverty %"] = cancer_df["Asian Poverty %"].astype(float)
cancer_df["Foreign Poverty %"] = cancer_df["Foreign Poverty %"].astype(float)
cancer_df["Population"] = cancer_df["Population"].astype(float)
cancer_df["Case Count"] =cancer_df["Case Count"].astype(float)

In [189]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = cancer_df["Case Count"]

# Separate the X variable, the features
X = cancer_df.drop(columns=["Case Count","County","Age-Adjusted New Case Rate","Age-Adjusted Death Rate","Death Count","Area"])

In [190]:
#Display X
X.head()

Unnamed: 0,Income,Uninsured %,White Poverty %,Black Poverty %,Asian Poverty %,Foreign Poverty %,Native Poverty %,Hispanic Poverty %,Non Hispanic Poverty %,Population
1,90708,3.8,24.5,3.5,47.5,45.3,10.5,9.7,90.3,28538.0
2,88871,5.4,60.3,5.4,12.6,11.0,7.2,9.5,90.5,1460452.0
3,57460,18.1,9.6,0.9,1.6,2.3,84.5,2.1,97.9,91317.0
6,62115,12.6,16.2,1.3,0.3,2.5,73.4,3.3,96.7,24670.0
7,78321,6.3,73.0,4.6,3.9,5.1,7.6,8.3,91.7,492276.0


In [191]:
#Split data into testing and training
x_train, x_test, y_train, y_test= train_test_split(X, y, random_state=100)

In [192]:
#Create and fit regression model
mlr= LinearRegression()  
mlr.fit(x_train, y_train)

In [193]:
#Print the model coefficients
print(mlr.intercept_)
# pair the feature names with the coefficients
list(zip(X, mlr.coef_))

-67594.03099687841


[('Income', 0.014729263572046583),
 ('Uninsured %', 4.12937945958157),
 ('White Poverty %', 31.003487497992683),
 ('Black Poverty %', 33.229053354793756),
 ('Asian Poverty %', -119.93973159667388),
 ('Foreign Poverty %', 172.96988119937174),
 ('Native Poverty %', 4.58105535756219),
 ('Hispanic Poverty %', 596.6671619199188),
 ('Non Hispanic Poverty %', 650.1686079652899),
 ('Population', 0.004061120146197084)]

In [194]:
#Predicting the Test and Train set result 
y_pred_mlr= mlr.predict(x_test)  
x_pred_mlr= mlr.predict(x_train)  

In [196]:
#Actual value and the predicted value
mlr_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred_mlr})
mlr_diff

Unnamed: 0,Actual value,Predicted value
175,4801.0,5714.121308
193,14541.0,18294.804232
146,14646.0,11965.960534
111,849.0,1822.559359
230,144.0,272.169070
...,...,...
39,1510.0,2363.661521
307,6445.0,5933.245820
55,1002.0,1950.475021
59,286.0,-698.726190


In [197]:
#View R squared
print('R squared value of the model: {:.2f}'.format(mlr.score(X,y)*100))


R squared value of the model: 97.57


In [198]:
#Copy cancer df for adding prediction values
cancer_df_pred = cancer_df.copy()
cancer_df_pred.head()

Unnamed: 0,County,Income,Uninsured %,White Poverty %,Black Poverty %,Asian Poverty %,Foreign Poverty %,Native Poverty %,Hispanic Poverty %,Non Hispanic Poverty %,Area,Age-Adjusted New Case Rate,Case Count,Population,Age-Adjusted Death Rate,Death Count
1,Aleutians West,90708,3.8,24.5,3.5,47.5,45.3,10.5,9.7,90.3,Alaska,286.2,51.0,28538.0,114.2,19
2,Anchorage,88871,5.4,60.3,5.4,12.6,11.0,7.2,9.5,90.5,Alaska,444.0,6278.0,1460452.0,146.1,1878
3,Bethel,57460,18.1,9.6,0.9,1.6,2.3,84.5,2.1,97.9,Alaska,367.5,242.0,91317.0,195.0,104
6,Dillingham,62115,12.6,16.2,1.3,0.3,2.5,73.4,3.3,96.7,Alaska,418.0,92.0,24670.0,201.4,41
7,Fairbanks North Star,78321,6.3,73.0,4.6,3.9,5.1,7.6,8.3,91.7,Alaska,402.7,1782.0,492276.0,127.3,505


In [199]:
#Ensure columns are correct type
cancer_df_pred["Income"] = cancer_df_pred["Income"].astype(float)
cancer_df_pred["Uninsured %"] = cancer_df_pred["Uninsured %"].astype(float)
cancer_df_pred["Hispanic Poverty %"] = cancer_df_pred["Hispanic Poverty %"].astype(float)
cancer_df_pred["Native Poverty %"] = cancer_df_pred["Hispanic Poverty %"].astype(float)

In [200]:
#Apply linear equation to df
cancer_df_pred["Case Count Prediction"] = (-67594.03099687841
+(cancer_df_pred["Income"]*0.014729263572046583)
+(cancer_df_pred["Uninsured %"]* 4.12937945958157)
+(cancer_df_pred["White Poverty %"]*31.003487497992683)
+(cancer_df_pred["Black Poverty %"]*33.229053354793756)
+(cancer_df_pred["Asian Poverty %"]*-119.93973159667388)
+(cancer_df_pred["Foreign Poverty %"]*172.96988119937174)
+(cancer_df_pred["Native Poverty %"]*4.58105535756219)
+(cancer_df_pred["Hispanic Poverty %"]*596.6671619199188)
+(cancer_df_pred["Non Hispanic Poverty %"]*650.1686079652899)
+(cancer_df_pred["Population"]*0.004061120146197084))

#Display new values
cancer_df_pred.head()

Unnamed: 0,County,Income,Uninsured %,White Poverty %,Black Poverty %,Asian Poverty %,Foreign Poverty %,Native Poverty %,Hispanic Poverty %,Non Hispanic Poverty %,Area,Age-Adjusted New Case Rate,Case Count,Population,Age-Adjusted Death Rate,Death Count,Case Count Prediction
1,Aleutians West,90708.0,3.8,24.5,3.5,47.5,45.3,9.7,9.7,90.3,Alaska,286.2,51.0,28538.0,114.2,19,1430.237437
2,Anchorage,88871.0,5.4,60.3,5.4,12.6,11.0,9.5,9.5,90.5,Alaska,444.0,6278.0,1460452.0,146.1,1878,6660.835419
3,Bethel,57460.0,18.1,9.6,0.9,1.6,2.3,2.1,2.1,97.9,Alaska,367.5,242.0,91317.0,195.0,104,-854.501675
6,Dillingham,62115.0,12.6,16.2,1.3,0.3,2.5,3.3,3.3,96.7,Alaska,418.0,92.0,24670.0,201.4,41,-729.584217
7,Fairbanks North Star,78321.0,6.3,73.0,4.6,3.9,5.1,8.3,8.3,91.7,Alaska,402.7,1782.0,492276.0,127.3,505,3026.097955


Create model for death count and apply predictions

In [201]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = cancer_df["Death Count"]

# Separate the X variable, the features
X = cancer_df.drop(columns=["Case Count","County","Age-Adjusted New Case Rate","Age-Adjusted Death Rate","Death Count","Area"])

In [202]:
#Split into testing and training data
x_train, x_test, y_train, y_test= train_test_split(X, y, random_state=100)

In [203]:
#Create and fit linear regression
mlr= LinearRegression()  
mlr.fit(x_train, y_train)

In [204]:
#Printing the model coefficients
print(mlr.intercept_)
# pair the feature names with the coefficients
list(zip(X, mlr.coef_))

-21790.48840896343


[('Income', -0.0028118617785045565),
 ('Uninsured %', -4.604909994401377),
 ('White Poverty %', 3.7391084949703277),
 ('Black Poverty %', -0.8597759711297859),
 ('Asian Poverty %', -45.2315947850284),
 ('Foreign Poverty %', 44.79531105916161),
 ('Native Poverty %', -3.7969228486537507),
 ('Hispanic Poverty %', 202.52004205254227),
 ('Non Hispanic Poverty %', 221.59543017564212),
 ('Population', 0.0014333004319890819)]

In [205]:
#Predicting the Test and Train set result 
y_pred_mlr= mlr.predict(x_test)  
x_pred_mlr= mlr.predict(x_train)

In [207]:
#Actual value and the predicted value
mlr_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred_mlr})
mlr_diff

Unnamed: 0,Actual value,Predicted value
175,1883,1923.779742
193,4032,5759.853965
146,4888,3919.245106
111,336,665.331782
230,50,-37.306291
...,...,...
39,546,867.063150
307,2316,2147.374571
55,291,721.914743
59,110,-119.474546


In [208]:
#Print R squared
print('R squared value of the model: {:.2f}'.format(mlr.score(X,y)*100))


R squared value of the model: 97.69


In [209]:
#Apply linear regression to df
cancer_df_pred["Death Count Prediction"] = (-21790.48840896343
                                            +(cancer_df_pred["Income"]*-0.0028118617785045565)
                                            +(cancer_df_pred["Uninsured %"]*-4.604909994401377)
                                            +(cancer_df_pred["White Poverty %"]*3.7391084949703277)
                                            +(cancer_df_pred["Black Poverty %"]*-0.8597759711297859)
                                            +(cancer_df_pred["Asian Poverty %"]*-45.2315947850284)
                                            +(cancer_df_pred["Foreign Poverty %"]*44.79531105916161)
                                            +(cancer_df_pred["Native Poverty %"]*-3.7969228486537507)
                                            +(cancer_df_pred["Hispanic Poverty %"]*202.52004205254227)
                                            +(cancer_df_pred["Non Hispanic Poverty %"]*221.59543017564212)
                                            +(cancer_df_pred["Population"]*0.0014333004319890819))
cancer_df_pred.head()

Unnamed: 0,County,Income,Uninsured %,White Poverty %,Black Poverty %,Asian Poverty %,Foreign Poverty %,Native Poverty %,Hispanic Poverty %,Non Hispanic Poverty %,Area,Age-Adjusted New Case Rate,Case Count,Population,Age-Adjusted Death Rate,Death Count,Case Count Prediction,Death Count Prediction
1,Aleutians West,90708.0,3.8,24.5,3.5,47.5,45.3,9.7,9.7,90.3,Alaska,286.2,51.0,28538.0,114.2,19,1430.237437,-115.134515
2,Anchorage,88871.0,5.4,60.3,5.4,12.6,11.0,9.5,9.5,90.5,Alaska,444.0,6278.0,1460452.0,146.1,1878,6660.835419,2113.930434
3,Bethel,57460.0,18.1,9.6,0.9,1.6,2.3,2.1,2.1,97.9,Alaska,367.5,242.0,91317.0,195.0,104,-854.501675,272.769309
6,Dillingham,62115.0,12.6,16.2,1.3,0.3,2.5,3.3,3.3,96.7,Alaska,418.0,92.0,24670.0,201.4,41,-729.584217,254.129492
7,Fairbanks North Star,78321.0,6.3,73.0,4.6,3.9,5.1,8.3,8.3,91.7,Alaska,402.7,1782.0,492276.0,127.3,505,3026.097955,956.607889


In [210]:
#Export cancer predictions to csv
cancer_df_pred.to_csv('Cancer_Prediction.csv', encoding='utf-8')


Testing optimization

In [228]:
# Reread the CSV file from the Resources folder into a Pandas DataFrame
cancer_df = pd.read_csv(
    "State_Complete_Info.csv")

#Create list of column names for iteration
col_list =  list(cancer_df)

#Iterate through columns to delete NA rows
for i in col_list:
    cancer_df = cancer_df.drop(cancer_df[cancer_df[i] == 'no data/suppressed'].index)
    cancer_df = cancer_df.drop(cancer_df[cancer_df[i] == 'Data not presented'].index)

# Drop unneeded columns
cancer_df = cancer_df.drop(columns=["Unnamed: 0"] )
cancer_df = cancer_df.drop(columns=["FIPS"] )

#Drop columns with low coefficients
cancer_df = cancer_df.drop(columns=["Income"] )
cancer_df = cancer_df.drop(columns=["Native Poverty %"] )
cancer_df = cancer_df.drop(columns=["Population"] )

# Review the DataFrame
cancer_df.head()

Unnamed: 0,County,Uninsured %,White Poverty %,Black Poverty %,Asian Poverty %,Foreign Poverty %,Hispanic Poverty %,Non Hispanic Poverty %,Area,Age-Adjusted New Case Rate,Case Count,Age-Adjusted Death Rate,Death Count
1,Aleutians West,3.8,24.5,3.5,47.5,45.3,9.7,90.3,Alaska,286.2,51,114.2,19
2,Anchorage,5.4,60.3,5.4,12.6,11.0,9.5,90.5,Alaska,444.0,6278,146.1,1878
3,Bethel,18.1,9.6,0.9,1.6,2.3,2.1,97.9,Alaska,367.5,242,195.0,104
6,Dillingham,12.6,16.2,1.3,0.3,2.5,3.3,96.7,Alaska,418.0,92,201.4,41
7,Fairbanks North Star,6.3,73.0,4.6,3.9,5.1,8.3,91.7,Alaska,402.7,1782,127.3,505


In [229]:
cancer_df["Black Poverty %"] = cancer_df["Black Poverty %"].astype(float)
cancer_df["Case Count"] =cancer_df["Case Count"].astype(float)

In [230]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = cancer_df["Case Count"]

# Separate the X variable, the features
X = cancer_df.drop(columns=["Case Count","County","Age-Adjusted New Case Rate","Age-Adjusted Death Rate","Death Count","Area"])

In [231]:
#Display X
X.head()

Unnamed: 0,Uninsured %,White Poverty %,Black Poverty %,Asian Poverty %,Foreign Poverty %,Hispanic Poverty %,Non Hispanic Poverty %
1,3.8,24.5,3.5,47.5,45.3,9.7,90.3
2,5.4,60.3,5.4,12.6,11.0,9.5,90.5
3,18.1,9.6,0.9,1.6,2.3,2.1,97.9
6,12.6,16.2,1.3,0.3,2.5,3.3,96.7
7,6.3,73.0,4.6,3.9,5.1,8.3,91.7


In [232]:
#Split into training and testing data
x_train, x_test, y_train, y_test= train_test_split(X, y, random_state=100)

In [233]:
#Create and fit linear regression
mlr= LinearRegression()  
mlr.fit(x_train, y_train)

In [234]:
#Printing the model coefficients
print(mlr.intercept_)
# pair the feature names with the coefficients
list(zip(X, mlr.coef_))

93880.78908419352


[('Uninsured %', 318.5682815064017),
 ('White Poverty %', 19.471431267856943),
 ('Black Poverty %', 262.3300856703778),
 ('Asian Poverty %', 225.12324699733125),
 ('Foreign Poverty %', 1011.7789207550861),
 ('Hispanic Poverty %', -1111.2086699960553),
 ('Non Hispanic Poverty %', -1013.8992646557439)]

In [235]:
#Predicting the Test and Train set result 
y_pred_mlr= mlr.predict(x_test)  
x_pred_mlr= mlr.predict(x_train)  

In [237]:
#Actual value and the predicted value
mlr_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred_mlr})
mlr_diff

Unnamed: 0,Actual value,Predicted value
175,4801.0,19794.185576
193,14541.0,32364.082390
146,14646.0,10669.023908
111,849.0,2983.762554
230,144.0,4972.207062
...,...,...
39,1510.0,-812.333244
307,6445.0,622.846379
55,1002.0,-2454.871190
59,286.0,-776.787236


In [238]:
#PRint R squared
print('R squared value of the model: {:.2f}'.format(mlr.score(X,y)*100))


R squared value of the model: 28.77


Removing the low end coeffieicnts brought down the R squared value significantly

Import New York CSVs for predictions on new data

In [239]:
# Read in New York CSV file from the Resources folder into a Pandas DataFrame
ny_cancer_df = pd.read_csv(
    "New_york_Complete.csv")

#Create list of column names for iteration
col_list =  list(cancer_df)

#Iterate through columns to delete NA rows
for i in col_list:
    ny_cancer_df = ny_cancer_df.drop(ny_cancer_df[ny_cancer_df[i] == 'no data/suppressed'].index)
    ny_cancer_df = ny_cancer_df.drop(ny_cancer_df[ny_cancer_df[i] == 'Data not presented'].index)


ny_cancer_df = ny_cancer_df.drop(columns=["Unnamed: 0"] )
ny_cancer_df = ny_cancer_df.drop(columns=["FIPS"] )


# Review the DataFrame
ny_cancer_df.head()

Unnamed: 0,County,Income,Uninsured %,White Poverty %,Black Poverty %,Asian Poverty %,Foreign Poverty %,Native Poverty %,Hispanic Poverty %,Non Hispanic Poverty %,Area,Age-Adjusted New Case Rate,Case Count,Population,Age-Adjusted Death Rate,Death Count
0,Nassau,126576.0,4.5,64.6,11.5,10.5,22.4,0.3,17.2,82.8,New York,499.4,44670.0,6778157.0,124.2,11820.0
1,Suffolk,111660.0,4.8,76.3,7.5,4.1,15.5,0.3,19.9,80.1,New York,522.0,49985.0,7405434.0,139.2,13708.0
2,Putnam,111617.0,4.8,83.2,3.5,2.1,13.9,0.2,16.2,83.8,New York,479.1,3184.0,493568.0,140.3,928.0
3,Westchester,105387.0,6.1,59.9,14.6,6.1,25.3,0.4,25.2,74.8,New York,463.3,28837.0,4843691.0,120.1,7960.0
4,Rockland,99707.0,6.0,69.4,11.5,6.0,21.0,0.2,18.1,81.9,New York,468.1,8833.0,1626064.0,120.0,2380.0


In [223]:
#Ensure columns are correct type
ny_cancer_df["Black Poverty %"] = ny_cancer_df["Black Poverty %"].astype(float)
ny_cancer_df["Asian Poverty %"] = ny_cancer_df["Asian Poverty %"].astype(float)
ny_cancer_df["Foreign Poverty %"] = ny_cancer_df["Foreign Poverty %"].astype(float)
ny_cancer_df["Population"] = ny_cancer_df["Population"].astype(float)
ny_cancer_df["Case Count"] =ny_cancer_df["Case Count"].astype(float)

In [224]:
#Copy ny df for predictions
ny_cancer_df_pred = ny_cancer_df.copy()
ny_cancer_df_pred.head()

Unnamed: 0,County,Income,Uninsured %,White Poverty %,Black Poverty %,Asian Poverty %,Foreign Poverty %,Native Poverty %,Hispanic Poverty %,Non Hispanic Poverty %,Area,Age-Adjusted New Case Rate,Case Count,Population,Age-Adjusted Death Rate,Death Count
0,Nassau,126576.0,4.5,64.6,11.5,10.5,22.4,0.3,17.2,82.8,New York,499.4,44670.0,6778157.0,124.2,11820.0
1,Suffolk,111660.0,4.8,76.3,7.5,4.1,15.5,0.3,19.9,80.1,New York,522.0,49985.0,7405434.0,139.2,13708.0
2,Putnam,111617.0,4.8,83.2,3.5,2.1,13.9,0.2,16.2,83.8,New York,479.1,3184.0,493568.0,140.3,928.0
3,Westchester,105387.0,6.1,59.9,14.6,6.1,25.3,0.4,25.2,74.8,New York,463.3,28837.0,4843691.0,120.1,7960.0
4,Rockland,99707.0,6.0,69.4,11.5,6.0,21.0,0.2,18.1,81.9,New York,468.1,8833.0,1626064.0,120.0,2380.0


In [225]:
#Apply linear equation to df
ny_cancer_df_pred["Case Count Prediction"] = (-67594.03099687841
+(ny_cancer_df_pred["Income"]*0.014729263572046583)
+(ny_cancer_df_pred["Uninsured %"]* 4.12937945958157)
+(ny_cancer_df_pred["White Poverty %"]*31.003487497992683)
+(ny_cancer_df_pred["Black Poverty %"]*33.229053354793756)
+(ny_cancer_df_pred["Asian Poverty %"]*-119.93973159667388)
+(ny_cancer_df_pred["Foreign Poverty %"]*172.96988119937174)
+(ny_cancer_df_pred["Native Poverty %"]*4.58105535756219)
+(ny_cancer_df_pred["Hispanic Poverty %"]*596.6671619199188)
+(ny_cancer_df_pred["Non Hispanic Poverty %"]*650.1686079652899)
+(ny_cancer_df_pred["Population"]*0.004061120146197084))

ny_cancer_df_pred.head()

Unnamed: 0,County,Income,Uninsured %,White Poverty %,Black Poverty %,Asian Poverty %,Foreign Poverty %,Native Poverty %,Hispanic Poverty %,Non Hispanic Poverty %,Area,Age-Adjusted New Case Rate,Case Count,Population,Age-Adjusted Death Rate,Death Count,Case Count Prediction
0,Nassau,126576.0,4.5,64.6,11.5,10.5,22.4,0.3,17.2,82.8,New York,499.4,44670.0,6778157.0,124.2,11820.0,30913.960228
1,Suffolk,111660.0,4.8,76.3,7.5,4.1,15.5,0.3,19.9,80.1,New York,522.0,49985.0,7405434.0,139.2,13708.0,32902.437396
2,Putnam,111617.0,4.8,83.2,3.5,2.1,13.9,0.2,16.2,83.8,New York,479.1,3184.0,493568.0,140.3,928.0,5073.518526
3,Westchester,105387.0,6.1,59.9,14.6,6.1,25.3,0.4,25.2,74.8,New York,463.3,28837.0,4843691.0,120.1,7960.0,23311.45771
4,Rockland,99707.0,6.0,69.4,11.5,6.0,21.0,0.2,18.1,81.9,New York,468.1,8833.0,1626064.0,120.0,2380.0,9998.903328


In [226]:
#Apply linear equation to df

ny_cancer_df_pred["Death Count Prediction"] = (-21790.48840896343
                                            +(ny_cancer_df_pred["Income"]*-0.0028118617785045565)
                                            +(ny_cancer_df_pred["Uninsured %"]*-4.604909994401377)
                                            +(ny_cancer_df_pred["White Poverty %"]*3.7391084949703277)
                                            +(ny_cancer_df_pred["Black Poverty %"]*-0.8597759711297859)
                                            +(ny_cancer_df_pred["Asian Poverty %"]*-45.2315947850284)
                                            +(ny_cancer_df_pred["Foreign Poverty %"]*44.79531105916161)
                                            +(ny_cancer_df_pred["Native Poverty %"]*-3.7969228486537507)
                                            +(ny_cancer_df_pred["Hispanic Poverty %"]*202.52004205254227)
                                            +(ny_cancer_df_pred["Non Hispanic Poverty %"]*221.59543017564212)
                                            +(ny_cancer_df_pred["Population"]*0.0014333004319890819))
ny_cancer_df_pred.head()

Unnamed: 0,County,Income,Uninsured %,White Poverty %,Black Poverty %,Asian Poverty %,Foreign Poverty %,Native Poverty %,Hispanic Poverty %,Non Hispanic Poverty %,Area,Age-Adjusted New Case Rate,Case Count,Population,Age-Adjusted Death Rate,Death Count,Case Count Prediction,Death Count Prediction
0,Nassau,126576.0,4.5,64.6,11.5,10.5,22.4,0.3,17.2,82.8,New York,499.4,44670.0,6778157.0,124.2,11820.0,30913.960228,10138.460108
1,Suffolk,111660.0,4.8,76.3,7.5,4.1,15.5,0.3,19.9,80.1,New York,522.0,49985.0,7405434.0,139.2,13708.0,32902.437396,11054.174446
2,Putnam,111617.0,4.8,83.2,3.5,2.1,13.9,0.2,16.2,83.8,New York,479.1,3184.0,493568.0,140.3,928.0,5073.518526,1266.503106
3,Westchester,105387.0,6.1,59.9,14.6,6.1,25.3,0.4,25.2,74.8,New York,463.3,28837.0,4843691.0,120.1,7960.0,23311.45771,7573.705345
4,Rockland,99707.0,6.0,69.4,11.5,6.0,21.0,0.2,18.1,81.9,New York,468.1,8833.0,1626064.0,120.0,2380.0,9998.903328,2964.59584


In [227]:
#Export NY df predictions to csv
ny_cancer_df_pred.to_csv('NY_Cancer_Prediction.csv', encoding='utf-8')
