In [100]:
# Import all the stuff we need
import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics
import sklearn.model_selection
import sklearn.svm

In [101]:
# Define a few constants:

# Threshold for determining outbreak: it is subtracted from each year-over-year
# change in the pertussis rate before the sign is taken, so only a rise larger
# than this value is labelled +1.
# NOTE(review): expressed in the same units as the RateYYYY columns -- confirm
# what those units are against the data source.
thrs = 3

In [102]:
# Function to help with visualization
def show_more_cols_head(data, num_cols=100):
    """Print ``data.head()`` with a temporarily raised column display limit.

    Parameters
    ----------
    data : object exposing ``.head()`` (e.g. a pandas DataFrame)
        The table whose first rows should be printed.
    num_cols : int, optional
        Value used for pandas' ``display.max_columns`` option while printing.

    Returns
    -------
    None
        The function only prints; the previous ``display.max_columns`` value
        is back in effect once it returns or raises.
    """
    # option_context restores the previous setting on exit, including when
    # head() or the print itself raises, so the global option cannot be left
    # stuck at num_cols.
    with pd.option_context('display.max_columns', num_cols):
        # Parenthesised single-argument form prints identically on Python 2 and 3
        print(data.head())

In [103]:
# Read the four source CSVs into data frames; the order of the paths below
# matches the order of the names on the left-hand side.
student_df, pertusis_df, infant_df, geo_df = map(pd.read_csv, [
    "../Project/california-kindergarten-immunization-rates/StudentData.csv",
    "../Project/california-kindergarten-immunization-rates/pertusisRates2010_2015.csv",
    "../Project/california-kindergarten-immunization-rates/InfantData.csv",
    "../Project/california-kindergarten-immunization-rates/geoData.csv",
])

In [104]:
# Distinct survey years and county names that appear in the student table
years = student_df['year'].unique()
counties = student_df['COUNTY'].unique()

In [105]:
# For each year 2011-2014 add two columns to pertusis_df:
#   '<year>_diff' -- change in the rate relative to the previous year
#   '<year>_sgn'  -- sign of (that change minus thrs): +1 when the rise exceeds
#                    the threshold, -1 when it falls short, 0 when exactly equal
for cur_year in range(2011, 2015):
    rate_change = pertusis_df['Rate%d' % cur_year] - pertusis_df['Rate%d' % (cur_year - 1)]
    pertusis_df['%d_diff' % cur_year] = rate_change
    pertusis_df['%d_sgn' % cur_year] = np.sign(rate_change - thrs)


In [106]:
# Build the combined table in one chain:
#   1. join pertussis rows to infant rows on the county name
#   2. drop the duplicate 'COUNTY' key column brought in from the infant table
#   3. remove the 'CALIFORNIA' row, which is not a county
concat = (
    pertusis_df
    .merge(infant_df, left_on='county', right_on='COUNTY')
    .drop(['COUNTY'], axis=1)
    .loc[lambda frame: frame['county'] != 'CALIFORNIA']
)

In [107]:
# Sanity check: print the first rows of the merged frame, using the helper so
# that up to 100 columns (its default) are shown instead of being elided
show_more_cols_head(concat)

      county  Cases2010  Rate2010  Cases2011  Rate2011  Cases2012  Rate2012  \
0    ALAMEDA        423     30.21        206     14.59         62      4.35   
1     ALPINE          0      0.00          0      0.00          0      0.00   
2     AMADOR          4     10.56         11     29.48          1      2.74   
3      BUTTE         32     14.55         16      7.26          3      1.36   
4  CALAVERAS          9     19.80          5     11.08          0      0.00   

   Cases2013  Rate2013  Cases2014  Rate2014  2011_diff  2011_sgn  2012_diff  \
0        124      8.55        354     24.41     -15.62      -1.0     -10.24   
1          0      0.00          0      0.00       0.00      -1.0       0.00   
2          2      5.52          1      2.76      18.92       1.0     -26.74   
3         33     14.87         30     13.52      -7.29      -1.0      -5.90   
4          2      4.39         17     37.35      -8.72      -1.0     -11.08   

   2012_sgn  2013_diff  2013_sgn  2014_diff  2014_

In [108]:
# Gather X and y data
# X holds the raw 2012 and 2013 case counts and rates for each row of concat
# (not year-over-year differences). .values yields the underlying NumPy array.
X = concat[['Cases2012', 'Rate2012', 'Cases2013', 'Rate2013']].values
# y is the 2014 outbreak label: the '2014_sgn' column as a 1-D array
y = concat['2014_sgn'].values

# Create SVM model
svm_model = sklearn.svm.SVC(kernel='rbf')

# Choose number of folds for k-fold cross-validation.
# shuffle=False keeps the folds contiguous and deterministic.
n_fld = 3
kf = sklearn.model_selection.KFold(n_fld, shuffle=False)

# Confusion matrix accumulated over all folds.
# Rows are true labels, columns are predicted labels, both ordered [-1, +1].
cf = np.zeros([2, 2])
for train_index, test_index in kf.split(X):
    # For each fold, split X and y into training and testing sets
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Fit the SVM model on the training part of the fold
    svm_model.fit(X_train, y_train)
    # Predict using the test data, and compare to the actual values
    y_pred = svm_model.predict(X_test)
    # Pin the label set so every fold returns a 2x2 matrix. Without it, a fold
    # whose true and predicted labels are all one class returns a 1x1 matrix
    # that NumPy would silently broadcast into all four cells of cf.
    cf = cf + sklearn.metrics.confusion_matrix(y_test, y_pred, labels=[-1, 1])

print(cf)

# Compute accuracy: correctly classified samples (diagonal) over all test samples
# NOTE(review): the recorded output has an all-zero first column, i.e. every
# sample was predicted +1, so this accuracy equals the share of +1 labels.
acc = cf.trace()/cf.sum()
print(acc)

[[  0.  12.]
 [  0.  46.]]
0.793103448276
