## Content
- **[Import Necessary Packages](#packages)**
- **[Baseline Feature Engineering](#bfeatures)**
- **[Baseline Modeling](#bmodeling)**
- **[Feature Engineering No.1](#features1)**
- **[Modeling No.1](#modeling1)**
- **[Feature Engineering No.2](#features2)**
- **[Modeling No.2](#modeling2)**
- **[Modeling No.3](#modeling3)**

### Import Necessary Packages <a id = 'packages'></a>

In [1]:
import numpy as np
import pandas as pd
from io import BytesIO
from zipfile import ZipFile
import urllib

from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.model_selection as m_sel
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, roc_auc_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestRegressor

#### Obtain the cleaned dataset

In [2]:
## Unzip and read file
url = urllib.request.urlopen('https://github.com/jonahwinninghoff/Springboard_Capstone_Project/raw/main/Assets/neat_data.zip')
file = ZipFile(BytesIO(url.read()))
cleaned_json = file.open("neat_data")
clean_df = pd.read_json(cleaned_json, encoding='cp1252')
file.close()

In [3]:
display(clean_df.dtypes)

Provider Name                                object
Provider Address                             object
Provider City                                object
Provider State                               object
Provider Zip Code                             int64
Measure Code                                  int64
Measure Description                          object
Resident type                                object
Q1 Measure Score                            float64
Q2 Measure Score                            float64
Q3 Measure Score                            float64
Q4 Measure Score                            float64
Four Quarter Average Score                  float64
Used in Quality Measure Five Star Rating     object
Location                                     object
dtype: object

### Baseline Feature Engineering <a id = 'bfeatures'></a>

In [4]:
## Vectorize Measure Description
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words = 'english', max_df = 0.25, min_df = 5)
vectorized_array = vectorizer.fit_transform(clean_df['Measure Description']).toarray()
names_list = vectorizer.get_feature_names()

In [5]:
## Add new right columns
df = pd.DataFrame(vectorized_array.tolist()) # Coerce array into dataframe

columns_renamed = {}                         # Create renamed column dictionary
for i in enumerate(names_list):
    columns_renamed[i[0]] = i[1]
cleanup = df.rename(columns = columns_renamed) # 0, 1, ..., 51 -> ability, activities, ..., worsened

cleaned_df = clean_df.merge(cleanup,         # Inner Join Tfidf with cleaned_df on index
                              left_index=True, 
                              right_index=True, 
                              how = 'inner')

In [6]:
cleaned_df['Binary Score'] = np.where(cleaned_df['Four Quarter Average Score'] <= 50, 0, 1)

In [7]:
## Select key columns for this model
names_list.extend(['Binary Score'])
dataset = cleaned_df[names_list]

In [8]:
print(len(dataset.columns))

45


### Baseline Modeling <a id = 'bmodeling'></a>

In [9]:
# Split testing, validating, and training
dataset_X, dataset_y = dataset.iloc[:,:-1], dataset.iloc[:,-1]
X_train, X_test, y_train, y_test = m_sel.train_test_split(dataset_X, dataset_y, 
                                                    test_size=0.33, 
                                                    random_state=666)

In [10]:
# Create 10 folds of cross validation and use logit regression
model = LogisticRegression()
cv = m_sel.KFold(n_splits=10, random_state=444, shuffle=True)
print(model.fit(X_train, y_train))

LogisticRegression()


In [11]:
# Create the 10 fold average metrics
thenames = ['Accuracy','Precision','Mean Absolute Error','Brier Score']
scorings = ['accuracy', 'average_precision','neg_mean_absolute_error','neg_mean_squared_error']
scores = {}

for i in enumerate(thenames):
    scores[thenames[i[0]]] = m_sel.cross_val_score(model, 
                                                   X_train, y_train,
                                                   cv=cv, scoring=scorings[i[0]])
    scores[thenames[i[0]]] = np.absolute(round(np.mean(scores[thenames[i[0]]]),3))

In [12]:
print(scores)

{'Accuracy': 0.679, 'Precision': 0.319, 'Mean Absolute Error': 0.321, 'Brier Score': 0.321}


## Feature Engineering No.1 <a id ='features1'></a>

In [13]:
## Remove max_df and min_df
vectorizer = TfidfVectorizer(stop_words = 'english')
vectorized_array = vectorizer.fit_transform(clean_df['Measure Description']).toarray()
names_list = vectorizer.get_feature_names()

In [14]:
## Add new right columns
df = pd.DataFrame(vectorized_array.tolist()) # Coerce array into dataframe

columns_renamed = {}                         # Create renamed column dictionary
for i in enumerate(names_list):
    columns_renamed[i[0]] = i[1]
cleanup = df.rename(columns = columns_renamed) # 0, 1, ..., 51 -> ability, activities, ..., worsened

cleaned_df1 = clean_df.merge(cleanup,         # Inner Join Tfidf with cleaned_df on index
                              left_index=True, 
                              right_index=True, 
                              how = 'inner')

In [15]:
cleaned_df1['Binary Score'] = np.where(cleaned_df1['Four Quarter Average Score'] <= 50, 0, 1)

In [16]:
## Select key columns for this model
names_list.extend(['Binary Score'])
dataset1 = cleaned_df1[names_list]

In [17]:
print(len(dataset1.columns))

53


## Modeling No.1 <a id = 'modeling1'></a>

In [18]:
# Split testing, validating, and training
dataset1_X, dataset1_y = dataset1.iloc[:,:-1], dataset1.iloc[:,-1]
X_train1, X_test1, y_train1, y_test1 = m_sel.train_test_split(dataset1_X, dataset1_y, 
                                                    test_size=0.33, 
                                                    random_state=666)

In [19]:
# Create 10 folds of cross validation and use logit regression
model1 = LogisticRegression()
cv = m_sel.KFold(n_splits=10, random_state=444, shuffle=True)
print(model1.fit(X_train1, y_train1))

LogisticRegression()


In [20]:
# Create the 10 fold average metrics
thenames1 = ['Accuracy','Precision','Mean Absolute Error','Brier Score']
scorings1 = ['accuracy', 'average_precision','neg_mean_absolute_error','neg_mean_squared_error']
scores1 = {}

for i in enumerate(thenames):
    scores1[thenames1[i[0]]] = m_sel.cross_val_score(model1, 
                                                   X_train1, y_train1,
                                                   cv=cv, scoring=scorings1[i[0]])
    scores1[thenames1[i[0]]] = np.absolute(round(np.mean(scores1[thenames1[i[0]]]),3))

In [21]:
print(scores1)

{'Accuracy': 0.679, 'Precision': 0.319, 'Mean Absolute Error': 0.321, 'Brier Score': 0.321}


## Feature Engineering No.2 <a id='features2'></a>

In [22]:
## Vectorize Measure Description
vectorizer = TfidfVectorizer(stop_words = 'english', max_df = 0.25, min_df = 5)
vectorized_array = vectorizer.fit_transform(clean_df['Measure Description']).toarray()
names_list = vectorizer.get_feature_names()

In [23]:
## Add new right columns
df = pd.DataFrame(vectorized_array.tolist()) # Coerce array into dataframe

columns_renamed = {}                         # Create renamed column dictionary
for i in enumerate(names_list):
    columns_renamed[i[0]] = i[1]
cleanup = df.rename(columns = columns_renamed) # 0, 1, ..., 51 -> ability, activities, ..., worsened

cleaned_df2 = clean_df.merge(cleanup,         # Inner Join Tfidf with cleaned_df on index
                              left_index=True, 
                              right_index=True, 
                              how = 'inner')

In [24]:
## Move from 100-0 to 1-0
cleaned_df2['Continuous Score'] = cleaned_df2['Four Quarter Average Score'].copy()/100

In [25]:
## Select key columns for this model
names_list.extend(['Continuous Score'])
dataset2 = cleaned_df2[names_list]

## Modeling No.2 <a id = 'modeling2'></a>

In [26]:
# Split testing, validating, and training
dataset2_X, dataset2_y = dataset2.iloc[:,:-1], dataset2.iloc[:,-1]
X_train2, X_test2, y_train2, y_test2 = m_sel.train_test_split(dataset2_X, dataset2_y, 
                                                    test_size=0.33, 
                                                    random_state=666)

In [27]:
# Create 10 folds of cross validation and use logit regression
model2 = LinearRegression()
cv = m_sel.KFold(n_splits=10, random_state=444, shuffle=True)
print(model2.fit(X_train2, y_train2))

LinearRegression()


In [28]:
# Create the 10 fold average metrics
thenames2 = ['Accuracy','Mean Absolute Error','Brier Score']
scorings2 = ['r2','neg_mean_absolute_error','neg_mean_squared_error']
scores2 = {}

for i in enumerate(thenames2):
    scores2[thenames2[i[0]]] = m_sel.cross_val_score(model2, 
                                                   X_train2, y_train2,
                                                   cv=cv, scoring=scorings2[i[0]])
    scores2[thenames2[i[0]]] = np.absolute(round(np.mean(scores2[thenames2[i[0]]]),3))

In [29]:
print(scores2)

{'Accuracy': 0.0, 'Mean Absolute Error': 0.327, 'Brier Score': 0.135}


## Modeling No.3 <a id='modeling3'></a>

In [30]:
# Split testing, validating, and training
dataset3_X, dataset3_y = dataset2.iloc[:,:-1], dataset2.iloc[:,-1]
X_train3, X_test3, y_train3, y_test3 = m_sel.train_test_split(dataset3_X, dataset3_y, 
                                                    test_size=0.33, 
                                                    random_state=666)

In [31]:
# Create 10 folds of cross validation and use logit regression
model3 = RandomForestRegressor()
cv = m_sel.KFold(n_splits=10, random_state=444, shuffle=True)
print(model3.fit(X_train3, y_train3))

RandomForestRegressor()


In [32]:
# Create the 10 fold average metrics
thenames3 = ['Accuracy','Mean Absolute Error','Brier Score']
scorings3 = ['r2','neg_mean_absolute_error','neg_mean_squared_error']
scores3 = {}

for i in enumerate(thenames2):
    scores3[thenames2[i[0]]] = m_sel.cross_val_score(model3, 
                                                   X_train3, y_train3,
                                                   cv=cv, scoring=scorings3[i[0]])
    scores3[thenames2[i[0]]] = np.absolute(round(np.mean(scores3[thenames3[i[0]]]),3))

In [33]:
print(scores3)

{'Accuracy': 0.0, 'Mean Absolute Error': 0.327, 'Brier Score': 0.135}
