# K Means Clustering Project

In [2]:
# For this project we will attempt to use KMeans Clustering to cluster
# universities into two groups, Private and Public.

# Note, we actually have the labels for this data set, 
# but we will NOT use them for the KMeans clustering algorithm, 
# since that is an unsupervised learning algorithm.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the data set from the local file 'College_Data', using the first
# column of the file as the DataFrame index (rows are later looked up by
# label, e.g. 'Cazenovia College').
college_data = pd.read_csv('College_Data',index_col=0)

In [None]:
# Preview the first few rows to check that the file loaded as expected.
college_data.head()

In [None]:
# Show DataFrame information: column names, dtypes and non-null counts.
college_data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each
# numeric column.
college_data.describe()

## Exploratory Data Analysis

In [None]:
# Scatterplot with Room.Board on the x-axis and Grad.Rate on the y-axis;
# each point is coloured by its value in the Private column.
sns.scatterplot(x='Room.Board', y='Grad.Rate', hue='Private', data=college_data)

In [None]:
# Scatterplot with F.Undergrad on the x-axis and Outstate on the y-axis;
# each point is coloured by its value in the Private column.
sns.scatterplot(x='F.Undergrad', y='Outstate', hue='Private', data=college_data)

In [None]:
# Overlaid histograms of out-of-state tuition, one per Private status.
# sns.distplot is deprecated, so a single sns.histplot call with `hue` is used
# instead: both groups share the same bins and a legend is added automatically.
# hue_order keeps 'Yes' as the first colour and 'No' as the second, matching
# the order in which the two groups were originally drawn.
sns.set_style('darkgrid')
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=college_data, x='Outstate', hue='Private',
             hue_order=['Yes', 'No'], ax=ax)
ax.set(title='Out-of-state tuition by Private status',
       xlabel='Outstate', ylabel='Count')
plt.show()

In [None]:
# Overlaid histograms of graduation rate, one per Private status.
# sns.distplot is deprecated, so a single sns.histplot call with `hue` is used
# instead: both groups share the same bins and a legend is added automatically.
# hue_order keeps 'Yes' as the first colour and 'No' as the second, matching
# the order in which the two groups were originally drawn.
sns.set_style('darkgrid')
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=college_data, x='Grad.Rate', hue='Private',
             hue_order=['Yes', 'No'], ax=ax)
ax.set(title='Graduation rate by Private status',
       xlabel='Grad.Rate', ylabel='Count')
plt.show()

In [None]:
# Noted that there is apparently a school with a graduation rate > 100%.
# idxmax() returns the index label of the row with the largest Grad.Rate.
college_data['Grad.Rate'].idxmax()

In [None]:
# Inspect the full record for 'Cazenovia College' (presumably the label
# returned by idxmax() above -- confirm against that cell's output).
college_data.loc['Cazenovia College']

In [None]:
# Set grad rate to 100 for data integrity

In [None]:
# Cap Grad.Rate at 100 for any school reporting more than 100%.
# The assignment is restricted to the Grad.Rate column on purpose: a
# frame-wide DataFrame.replace of the offending value would also overwrite
# that same number wherever it happens to occur in any other column.
# The mask-based form is also idempotent, so re-running the cell is harmless.
college_data.loc[college_data['Grad.Rate'] > 100, 'Grad.Rate'] = 100


In [None]:
# Re-display the record to confirm that Grad.Rate now reads 100.
college_data.loc['Cazenovia College']

## K Means Cluster Creation

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Create an instance of a K Means model with 2 clusters.
# random_state pins the centroid initialisation so that a fresh
# Restart & Run All reproduces the same clusters; n_init is set explicitly so
# the number of restarts does not depend on the installed scikit-learn
# version's default.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)

In [None]:
# Fit the model to all the data except for the labels.
# 'Private' is the raw label; 'Cluster' is its numeric encoding, which a later
# cell adds to college_data. Dropping both (errors='ignore' tolerates 'Cluster'
# not existing yet) keeps the label out of the features even when this cell is
# re-run after the evaluation section has been executed.
kmeans.fit(college_data.drop(columns=['Private', 'Cluster'], errors='ignore'))

In [None]:
# Find the cluster center vectors: one row per cluster, one column per
# feature the model was fitted on.
kmeans.cluster_centers_

## Evaluation

In [None]:
# This is not a real-world situation: we happen to have the true labels,
# so we can use them to evaluate our clusters.
# That would be very unlikely in a real unsupervised-learning problem.

def converter(private):
    """Encode a Private flag as an integer.

    Returns 1 when ``private`` equals the string 'Yes' and 0 for any
    other value.
    """
    return 1 if private == 'Yes' else 0

In [None]:
# Encode the known labels numerically with converter() (1 where Private is
# 'Yes', 0 otherwise).
# NOTE: despite its name, the 'Cluster' column holds these ground-truth
# labels, not the cluster ids assigned by K-Means (those are kmeans.labels_).
college_data['Cluster'] = college_data['Private'].apply(converter)

In [None]:
# Preview the frame without the text label column.
# DataFrame.drop returns a new frame and the result is not assigned, so
# college_data itself still contains 'Private' (the plotting and fitting cells
# above rely on that column when re-run). .head() keeps the output short
# instead of rendering the whole table.
college_data.drop(columns='Private').head()

In [None]:
# Create a confusion matrix and classification report to see how
# well the KMeans clustering worked without being given
# any labels.
from sklearn.metrics import confusion_matrix,classification_report

true_labels = college_data['Cluster'].to_numpy()
cluster_ids = kmeans.labels_

# K-Means numbers its clusters arbitrarily, so cluster 0 may just as well be
# the "private" group. With two clusters, flip the ids when they agree with
# the true labels on fewer than half of the rows, so that the report measures
# the quality of the grouping rather than the accident of the numbering.
# A new array is built; kmeans.labels_ itself is left untouched.
if np.mean(cluster_ids == true_labels) < 0.5:
    cluster_ids = 1 - cluster_ids

print(confusion_matrix(true_labels, cluster_ids))
print(classification_report(true_labels, cluster_ids))

In [None]:
# Cluster id assigned by K-Means to each row the model was fitted on.
kmeans.labels_

In [None]:
# Numeric ground-truth labels (1 where Private is 'Yes', 0 otherwise), shown
# for comparison with the K-Means cluster ids.
college_data['Cluster']