# K-means Clustering

In [117]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import folium 
import calendar
import sklearn
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

from collections import Counter

import yellowbrick
from yellowbrick.cluster import KElbowVisualizer
# Turn off pandas' chained-assignment (SettingWithCopy) check for the whole session.
pd.options.mode.chained_assignment = None

In [118]:
# Species table: keep every column whose name does not contain "unnamed"
# (case-insensitive). NOTE(review): these look like index columns written by an
# earlier to_csv call -- confirm.
species = pd.read_csv('all_species_112222.csv')
species = species.loc[:, ~species.columns.str.contains('unnamed', case=False)]

# Park locations table, with the same "unnamed" column clean-up.
locations = pd.read_csv('locations_112222.csv')
locations = locations.loc[:, ~locations.columns.str.contains('unnamed', case=False)]

# Per-park weather records, loaded as-is.
weather = pd.read_csv('NPS_weather_trends_112222.csv')

In [119]:
# Park activities table.
# NOTE(review): `activities` is not referenced in the cells visible here -- confirm it is used elsewhere.
activities = pd.read_csv('activities.csv')

In [120]:
def regioncolors(counter):
    '''Return the marker colour name for the cluster stored under counter['k_cluster'].

    Labels 0, 1, 2 and 3 each have their own colour; every other value gets
    'darkpurple'.
    '''
    label = counter['k_cluster']
    palette = (
        (0, 'darkblue'),
        (1, 'darkred'),
        (2, 'blue'),
        (3, 'darkgreen'),
    )
    for known_label, colour in palette:
        if label == known_label:
            return colour
    return 'darkpurple'

In [121]:
class Cluster:
    '''Cluster parks on one month's weather plus species-composition similarity.

    Construction builds the feature matrix; `kmeans()` fits the model, draws the
    map and returns the recommendation.

    Parameters
    ----------
    park : str
        Park name, compared against the 'ParkName' values of the module-level
        `weather` table.
    month : str
        Month abbreviation as produced by `calendar.month_abbr` (e.g. 'Jan').
    sim_or_diff : str
        'Similar' to get parks in the same cluster as `park`, 'Different' to
        get parks in every other cluster.

    Reads the module-level DataFrames `weather`, `locations` and `species`.
    '''

    def __init__(self, park, month, sim_or_diff):
        self.park = park
        self.month = month
        self.sim_or_diff = sim_or_diff

        self.weather_sp = []
        self.KM = KMeans(n_clusters = 5, random_state = 42)
        self.transform_weather()
        self.get_species_df()
        self.transform_species()
        self.combine_weather_sp()

    def transform_weather(self):
        '''Clean and scale weather data.

        Averages temperature and precipitation per (park, month) over all years
        present in `weather`, keeps the rows for `self.month`, and standardises
        the two columns. Sets `self.avg_10_yr` (the filtered frame) and `self.X`
        (the scaled temp/prcp array, one row per park).
        '''
        park_weather = weather[['ParkName', 'Month', 'Year', 'Temp_Avg_Fahrenheit', 'Prcp_Avg_Inches']]
        avg_10_yr = (
            park_weather.groupby(['ParkName', 'Month'])
            .agg('mean')
            .reset_index()
            .drop(columns = 'Year')
            .rename(columns = {'ParkName': 'park', 'Month': 'month',
                               'Temp_Avg_Fahrenheit': 'temp', 'Prcp_Avg_Inches': 'prcp'})
        )
        avg_10_yr['month_name'] = avg_10_yr['month'].apply(lambda x: calendar.month_abbr[x])

        # Explicit copy: kmeans() later adds a column to this frame, which must
        # not depend on the global chained-assignment setting.
        avg_10_yr = avg_10_yr[avg_10_yr['month_name'] == self.month].copy()

        self.X = StandardScaler().fit_transform(avg_10_yr[['temp', 'prcp']])
        self.avg_10_yr = avg_10_yr

    def get_species_df(self):
        '''Build `self.park_sp_df`: one row per park code with its list of present species.

        Parks are ordered to follow the row order of `self.avg_10_yr` so that the
        species vectors can later be paired row-by-row with the weather vectors.
        Park codes that do not appear in the weather/location merge get a NaN
        rank and are sorted last.
        '''
        # sort park names for future merging
        temp_merged = self.avg_10_yr.merge(locations, how = 'left', left_on = 'park', right_on = 'Park Name')
        sort_parks = temp_merged['Park Code'].tolist()
        sort_i = {code: rank for rank, code in enumerate(sort_parks)}

        # new df with only species that are present
        present_sp = species[species['Occurrence'] == 'Present']
        similarity_df = present_sp[['Park Name', 'Scientific Name', 'Park Code']].copy()
        similarity_df['park_code_ranked'] = similarity_df['Park Code'].map(sort_i)
        similarity_df = (
            similarity_df.sort_values(by = ['park_code_ranked'])
            .drop(columns = 'park_code_ranked')
        )

        # list of all park codes, in sorted order of first appearance
        park_codes = list(similarity_df['Park Code'].unique())
        # list of all species in a park
        code_sp_list = [
            list(similarity_df[similarity_df['Park Code'] == code]['Scientific Name'])
            for code in park_codes
        ]

        # new park-species dataframe
        park_sp_df = pd.DataFrame()
        park_sp_df['park_code'] = park_codes
        park_sp_df['species_list'] = code_sp_list
        self.park_sp_df = park_sp_df

    def transform_species(self):
        '''Turn the per-park species lists into low-dimensional similarity vectors.

        Species lists are one-hot encoded (1 if present, 0 otherwise), the
        park-by-park cosine similarity matrix is computed, and PCA keeps enough
        components to explain 85% of the variance. Sets `self.transform`, with
        one row per row of `self.park_sp_df`.
        '''
        mlb = MultiLabelBinarizer()
        vec = mlb.fit_transform(self.park_sp_df['species_list'])
        vecs = pd.DataFrame(vec, columns=mlb.classes_)

        # apply cosine_similarity fxn on df
        df_cosine = pd.DataFrame(cosine_similarity(vecs, dense_output=True))

        # pca for dimensionality reduction
        pca = PCA(n_components = 0.85)
        self.transform = pca.fit_transform(df_cosine)

    def combine_weather_sp(self):
        '''Combine weather and species vecs into one feature row per park.

        Sets `self.weather_sp_arr` (2-D array) and `self.weather_sp` (the same
        rows as a list). The number of rows is taken from the weather matrix,
        because kmeans() writes one label per weather row; extra species rows,
        which get_species_df sorts last, are dropped.

        Raises
        ------
        ValueError
            If there are fewer species rows than weather rows.
        '''
        weather_vecs = np.asarray(self.X)
        n_parks = len(weather_vecs)
        species_vecs = np.asarray(self.transform)[:n_parks]
        if len(species_vecs) != n_parks:
            raise ValueError(
                f'species data covers {len(species_vecs)} parks but weather data '
                f'covers {n_parks}; cannot pair the feature rows'
            )
        # NOTE(review): pairing is purely positional and relies on the ordering
        # done in get_species_df; a weather park with no species rows would
        # shift the alignment -- confirm the datasets cover the same parks.
        self.weather_sp_arr = np.hstack((weather_vecs, species_vecs))
        # Rebuilt rather than appended to, so calling this twice cannot duplicate rows.
        self.weather_sp = list(self.weather_sp_arr)

    def kmeans(self):
        '''Perform K-means, return cluster of the park, list of parks and silhouette score.

        Also renders a folium map with one marker per park, coloured by cluster.

        Returns
        -------
        tuple
            (user_cluster, user_parks, sil_score): the cluster label of
            `self.park`, the park names in the same cluster ('Similar') or in
            all other clusters ('Different'), and the silhouette score of the
            clustering.

        Raises
        ------
        ValueError
            If `self.sim_or_diff` is not 'Similar' or 'Different', or if
            `self.park` does not match exactly one row for `self.month`.
        '''
        if self.sim_or_diff not in ('Similar', 'Different'):
            raise ValueError(
                f"sim_or_diff must be 'Similar' or 'Different', got {self.sim_or_diff!r}"
            )

        temp_labels = self.KM.fit_predict(self.weather_sp_arr)

        self.avg_10_yr['k_cluster'] = temp_labels
        temp_merged = self.avg_10_yr.merge(locations, how = 'left', left_on = 'park', right_on = 'Park Name')
        temp_merged['color'] = temp_merged.apply(regioncolors, axis = 1)

        park_rows = temp_merged[temp_merged['park'] == self.park]['k_cluster']
        if len(park_rows) != 1:
            raise ValueError(
                f'expected exactly one row for park {self.park!r} in month '
                f'{self.month!r}, found {len(park_rows)}'
            )
        user_cluster = park_rows.item()

        us_map = folium.Map(tiles='CartoDB positron', zoom_start=14)
        location_list = temp_merged[['Latitude', 'Longitude']].values.tolist()
        park_names = temp_merged['Park Name'].tolist()
        colors = temp_merged['color'].tolist()
        for location, park_name, color in zip(location_list, park_names, colors):
            folium.Marker(
                location,
                popup=park_name,
                icon=folium.Icon(color=color, icon_color='white', icon='star', angle=0, prefix='fa'),
            ).add_to(us_map)
        # NOTE(review): `display` is not imported in this file; presumably it is
        # the IPython/Jupyter builtin -- confirm before running outside a notebook.
        display(us_map)

        # Reuse the labels from the fit above instead of fitting the model again.
        # Author's rule of thumb: a good silhouette score should be > 0.5.
        sil_score = silhouette_score(self.weather_sp_arr, temp_labels)

        if self.sim_or_diff == 'Similar':
            user_parks = temp_merged[temp_merged['k_cluster'] == user_cluster]['park'].tolist()
        else:
            user_parks = temp_merged[temp_merged['k_cluster'] != user_cluster]['park'].tolist()
        return user_cluster, user_parks, sil_score
    

In [122]:
# Example run: cluster on January data and list the parks that fall outside
# Joshua Tree's cluster. Returns (cluster label, park names, silhouette score).
Cluster('Joshua Tree National Park', 'Jan', 'Different').kmeans()

(1,
 ['Acadia National Park',
  'Biscayne National Park',
  'Channel Islands National Park',
  'Congaree National Park',
  'Crater Lake National Park',
  'Denali National Park',
  'Dry Tortugas National Park',
  'Everglades National Park',
  'Gates Of The Arctic National Park',
  'Glacier Bay National Park',
  'Glacier National Park',
  'Grand Teton National Park',
  'Great Smoky Mountains National Park',
  'Haleakala National Park',
  'Hawaii Volcanoes National Park',
  'Hot Springs National Park',
  'Isle Royale National Park',
  'Katmai National Park',
  'Kenai Fjords National Park',
  'Kings Canyon National Park',
  'Kobuk Valley National Park',
  'Lake Clark National Park',
  'Lassen Volcanic National Park',
  'Mammoth Cave National Park',
  'Mount Rainier National Park',
  'National Park of American Samoa',
  'New River Gorge National Park and Preserve',
  'North Cascades National Park',
  'Olympic National Park',
  'Pinnacles National Park',
  'Redwood National Park',
  'Sequoia

In [124]:
#months = []
#for i in range(1,13):
#    months.append(calendar.month_abbr[i])

#scores = {}
#for month in months:
#    score = Cluster('Acadia National Park', month, 'Similar').kmeans()[2]
#    scores[month] = score