## Update Nearest Neighbors Model to Incorporate New Features
1. Load newly merged data
2. Scale numeric data
3. Apply nearest neighbors model to scaled dataframe
4. Create nearest/recommendations column
5. Clean
6. Test
7. Load data to database

### 1. Load newly merged data

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import NearestNeighbors

# Remove pandas' display truncation so every column, row, and cell value is
# rendered in full.
for display_option in ('display.max_columns', 'display.max_colwidth', 'display.max_rows'):
    pd.set_option(display_option, None)

In [None]:
# Load the merged dataset (step 1). The path is relative to the notebook's
# working directory.
df = pd.read_csv('csv/walk_and_school_score_df.csv')

In [None]:
# Sanity check: print (rows, columns), then preview the first rows.
print(df.shape)
df.head()

### 2. Scale numeric data

In [None]:
# Keep only the numeric columns; these become the features that are scaled
# and fed to the neighbors model.
numeric = df.select_dtypes(include='number')

In [None]:
# Exclude the coordinate columns from the feature set.
numeric = numeric.drop(['lat', 'lon'], axis='columns')

In [None]:
# Sanity check on the feature frame: print (rows, columns), then preview it.
print(numeric.shape)
numeric.head()

### Apply Standard Scaler

In [None]:
# Instantiate the standard scaler (no encoder is created in this cell).
scaler = StandardScaler()

In [None]:
 standard_df = scaler.fit_transform(numeric)

In [None]:
# Wrap the scaled values in a DataFrame again, labelling the columns with the
# names of the numeric feature frame.
standard_df = pd.DataFrame(standard_df, columns=numeric.columns)

In [None]:
# Sanity check on the scaled frame: print (rows, columns), then preview it.
print(standard_df.shape)
standard_df.head()

### 3. Apply nearest neighbors model to scaled dataframe

In [None]:
# Request 6 neighbors per query: nearest() removes one result (the query row's
# own match), which leaves 5 recommendations per row.
nn = NearestNeighbors(n_neighbors=6, algorithm='kd_tree', n_jobs=8)
# Fit on the scaled feature frame; as the last expression of the cell this
# also displays the fitted estimator.
nn.fit(standard_df)

In [None]:
def nearest(idx):
    """Return the recommendation string for the row at position ``idx``.

    Looks up the nearest neighbors of row ``idx`` of ``standard_df`` in the
    fitted ``nn`` model and returns their row positions, closest first, as a
    comma-separated string (for example ``'54,123,135,336,93'``). The row
    itself is never part of the result, so the string holds one entry fewer
    than the number of neighbors the model returns.

    Args:
        idx: Integer position of the query row in ``standard_df``.

    Returns:
        str: Comma-separated neighbor row positions.
    """
    # Query with a one-row DataFrame (note the double brackets) so the column
    # names match what the model saw in fit(); a list holding a Series loses
    # them and triggers a feature-name warning on every call.
    query = standard_df.iloc[[idx]]
    neighbor_positions = nn.kneighbors(query, return_distance=False)[0].tolist()

    # Remove the query row by value, not by position: when another row has
    # identical features, the query row is not guaranteed to come back first.
    others = [pos for pos in neighbor_positions if pos != idx]

    # Keep the result length constant even if the query row was not among the
    # returned neighbors at all.
    wanted = len(neighbor_positions) - 1
    return ','.join(map(str, others[:wanted]))

### 4. Create nearest/recommendations column

In [None]:
# Expose the row index as a regular column so a function can be applied to it
# row by row.
df['Index'] = df.index

In [None]:
# Build the recommendations column by running nearest() on every row index.
df['Nearest'] = df['Index'].map(nearest)

### 5. Clean

In [None]:
# Changing to percent for consistency: every listed column is multiplied by 100.
# NOTE(review): presumably these columns are stored as 0-1 fractions; confirm.
# Re-running this cell multiplies them again.
change_to_percent = [
    'Diversity Index',
    '% Private',
    '% Public',
    '% Charter',
    '% Performing Above Average or Better',
]
df[change_to_percent] = df[change_to_percent].mul(100)

In [None]:
# Columns kept in the final output, in output order. Anything not listed here
# (for example the helper 'Index' column) is dropped.
final_columns = [
    'City', 'State', 'lat', 'lon', 'TotalPop', 'Men', 'Women',
    'Hispanic', 'White', 'Black', 'Native', 'Asian', 'Pacific', 'Diversity Index',
    'Income', 'IncomeErr', 'IncomePerCap', 'IncomePerCapErr',
    'Poverty', 'ChildPoverty',
    'Employed', 'Unemployment', 'PrivateWork', 'PublicWork', 'SelfEmployed', 'FamilyWork',
    'Professional', 'Service', 'Office', 'Construction', 'Production',
    'Drive', 'Carpool', 'Transit', 'Walk', 'OtherTransp', 'WorkAtHome', 'MeanCommute',
    'Rent', 'Year',
    'Population', 'Violent crime', 'Murder and nonnegligent manslaughter', 'Rape',
    'Robbery', 'Aggravated assault', 'Property crime', 'Burglary',
    'Larceny- theft', 'Motor vehicle theft', 'Arson', 'Crime Rate per 1000', 'Crime Rating',
    'Days with AQI', 'Good Days', 'Moderate Days',
    'Unhealthy for Sensitive Groups Days', 'Unhealthy Days',
    'Very Unhealthy Days', 'Hazardous Days', 'Max AQI',
    '90th Percentile AQI', 'Median AQI', 'Days CO', 'Days NO2',
    'Days Ozone', 'Days SO2', 'Days PM2.5', 'Days PM10', 'Air Quality Index',
    'Walk Score', 'Transit Score', 'Bike Score', 'Walkability',
    'Public Transportation', 'Bikeability',
    'Total_Schools', 'Private', 'Public district', 'Public charter',
    '% Private', '% Public', '% Charter', 'Above Average or Better',
    '% Performing Above Average or Better', 'Nearest',
]
df = df[final_columns]

In [None]:
# Spell out the '%' sign in the four percentage column names.
# NOTE(review): presumably done because '%' is awkward in database column
# names; confirm.
percent_renames = {
    '% Private': 'Percent Private',
    '% Public': 'Percent Public',
    '% Charter': 'Percent Charter',
    '% Performing Above Average or Better': 'Percent Performing Above Average or Better',
}
df = df.rename(percent_renames, axis='columns')

In [None]:
# Sanity check after cleaning: print (rows, columns), then preview the frame.
print(df.shape)
df.head()

### 6. Test

### Akron, OH
Nearest-neighbor row indices: 54,123,135,336,93

In [None]:
# Row 0: presumably the Akron, OH record itself; confirm.
df[0:1]

In [None]:
# Row 54: first index listed under the "Akron, OH" heading.
df[54:55]

In [None]:
# Row 123: second index listed under the "Akron, OH" heading.
df[123:124]

In [None]:
# Row 135: third index listed under the "Akron, OH" heading.
df[135:136]

In [None]:
# Row 336: fourth index listed under the "Akron, OH" heading.
df[336:337]

In [None]:
# Row 93: fifth index listed under the "Akron, OH" heading.
df[93:94]

### New York, NY
Nearest-neighbor row indices: 67,151,195,292,295

In [None]:
# Row 233: presumably the New York, NY record itself; confirm.
df[233:234]

In [None]:
# Row 67: first index listed under the "New York, NY" heading.
df[67:68]

In [None]:
# Row 151: second index listed under the "New York, NY" heading.
df[151:152]

In [None]:
# Row 195: third index listed under the "New York, NY" heading.
df[195:196]

In [None]:
# Row 292: fourth index listed under the "New York, NY" heading.
df[292:293]

In [None]:
# Row 295: fifth index listed under the "New York, NY" heading.
df[295:296]

### Save

In [None]:
# Persist the cleaned frame; index=False keeps the row index out of the CSV.
df.to_csv('csv/final.csv', index=False)

### 7. Load data to database

In [None]:
!pip install sqlalchemy psycopg2-binary

In [None]:
import os

# Read the connection string from the environment instead of committing it to
# the notebook. It must be a SQLAlchemy URL using the same scheme as before:
#   postgresql://USER:PASSWORD@HOST:5432/DBNAME
# NOTE(review): the credential that used to be hard-coded on this line has
# been exposed and should be rotated.
try:
    DATABASE_URL = os.environ['DATABASE_URL']
except KeyError:
    # Fail here with a clear message rather than later inside create_engine().
    raise RuntimeError(
        'Set the DATABASE_URL environment variable to the PostgreSQL '
        'connection string before running this cell.'
    ) from None


In [None]:
import sqlalchemy

# Create the engine that the later cells pass as `con=`.
engine = sqlalchemy.create_engine(DATABASE_URL)
# Open a connection right away.
# NOTE(review): `connection` is not used by any cell visible here (they all
# pass `con=engine`) and is never closed; confirm whether it is still needed.
connection = engine.connect()

In [None]:
# Re-read the CSV written in the "Save" step.
# NOTE(review): presumably so the upload matches the saved file exactly; confirm.
df = pd.read_csv('csv/final.csv')

In [None]:
# Write the frame to table 'mytable', replacing any existing table of that
# name. method='multi' passes multiple rows per INSERT statement.
# NOTE(review): `index` is left at its default, so the row index is written as
# a column too. Presumably intentional, since 'Nearest' refers to rows by
# index; confirm.
df.to_sql('mytable', con=engine, method='multi', if_exists='replace')

### Check

In [None]:
# Read the table back from the database to confirm the upload.
pd.read_sql('mytable', con=engine)