In [1]:
%matplotlib notebook
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from scipy import stats
import hdbscan
import seaborn as sns
# Switch seaborn to its built-in "dark" plot style.
sns.set_style("dark")



In [2]:
# Let long text columns (e.g. institution names) display in full.  The full
# option key is used because abbreviated keys are resolved by pattern matching
# and can become ambiguous if pandas adds a similarly named option.
pd.set_option('display.max_colwidth', 1000)

# Load Data

Here, I'm reading in the imputed data set Julia put together, and pulling in some metadata from the non-imputed data.

In [3]:
# `data` is the reduced, non-imputed table; `imputed` is the imputed version.
data = pd.read_csv('data/scorecard_reduced_features.csv')
imputed = pd.read_csv('data/scorecard_imputed.csv')

# UNITID is copied from `data` onto `imputed`.  Both frames carry the default
# 0..n-1 index from read_csv, so the copy lines up row-by-row; that is only
# meaningful when the two files have the same number of rows.  Fail loudly
# instead of silently producing missing or truncated IDs.
assert len(imputed) == len(data), (
    "scorecard_imputed.csv and scorecard_reduced_features.csv have different row counts"
)
imputed['UNITID'] = data.UNITID

Set the index to the UNITID, and inspect the dataframes:

In [4]:
# Index the raw table by UNITID.  Reassigning (rather than inplace=True) keeps
# the step an ordinary expression whose result is explicitly bound.
data = data.set_index('UNITID')
data.head()

Unnamed: 0_level_0,INSTNM,ZIP,HCM2,CONTROL,LOCALE,CCBASIC,CCUGPROF,CCSIZSET,HBCU,PBI,...,LO_INC_RPY_3YR_RT_SUPP,MD_INC_RPY_3YR_RT_SUPP,HI_INC_RPY_3YR_RT_SUPP,NONCOM_RPY_3YR_RT_SUPP,FIRSTGEN_RPY_3YR_RT_SUPP,PCT_LIBERAL_ARTS,PCT_PROFESSIONAL,PCT_RELIGIOUS,COST,PCT_VOCATIONAL
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100654,Alabama A & M University,35762,0.0,1.0,12.0,18.0,9.0,14.0,1.0,0.0,...,0.448163,0.446488,0.422018,0.37379,0.423581,0.5346,0.4074,0.0,18888.0,0.0629
100663,University of Alabama at Birmingham,35294-0110,0.0,1.0,12.0,15.0,8.0,15.0,0.0,0.0,...,0.724256,0.770134,0.818059,0.678749,0.751553,0.4379,0.5805,0.0095,19990.0,0.0
100690,Amridge University,36117-3553,0.0,2.0,12.0,21.0,6.0,6.0,0.0,0.0,...,0.597701,0.709091,,0.613281,0.632653,0.6301,0.1096,0.2603,12300.0,0.0
100706,University of Alabama in Huntsville,35899,0.0,1.0,12.0,15.0,8.0,12.0,0.0,0.0,...,0.721404,0.824742,0.851936,0.689788,0.778157,0.2764,0.7152,0.0082,20306.0,0.0
100724,Alabama State University,36104-0271,0.0,1.0,12.0,18.0,9.0,13.0,1.0,0.0,...,0.310875,0.376106,0.333333,0.315997,0.315737,0.5449,0.5733,0.0,17400.0,0.0


In [5]:
# Index the imputed table by UNITID.  Reassigning (rather than inplace=True)
# keeps the step an ordinary expression whose result is explicitly bound.
imputed = imputed.set_index('UNITID')
imputed.head()

Unnamed: 0_level_0,HCM2,CONTROL,LOCALE,CCBASIC,CCUGPROF,CCSIZSET,HBCU,PBI,ANNHI,TRIBAL,...,LO_INC_RPY_3YR_RT_SUPP,MD_INC_RPY_3YR_RT_SUPP,HI_INC_RPY_3YR_RT_SUPP,NONCOM_RPY_3YR_RT_SUPP,FIRSTGEN_RPY_3YR_RT_SUPP,PCT_LIBERAL_ARTS,PCT_PROFESSIONAL,PCT_RELIGIOUS,COST,PCT_VOCATIONAL
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100654,0,1,12,18,9,14,1,0,0,0,...,0.448163,0.446488,0.422018,0.37379,0.423581,0.5346,0.4074,0.0,18888.0,0.0629
100663,0,1,12,15,8,15,0,0,0,0,...,0.724256,0.770134,0.818059,0.678749,0.751553,0.4379,0.5805,0.0095,19990.0,0.0
100690,0,2,12,21,6,6,0,0,0,0,...,0.597701,0.709091,0.78049,0.613281,0.632653,0.6301,0.1096,0.2603,12300.0,0.0
100706,0,1,12,15,8,12,0,0,0,0,...,0.721404,0.824742,0.851936,0.689788,0.778157,0.2764,0.7152,0.0082,20306.0,0.0
100724,0,1,12,18,9,13,1,0,0,0,...,0.310875,0.376106,0.333333,0.315997,0.315737,0.5449,0.5733,0.0,17400.0,0.0


Everything matches up. Good!


We can use `pandas.get_dummies()` to break out the categorical columns and improve the performance of our cosine similarity metric.

In [6]:
# Categorical code columns to expand into one indicator column per category.
categorical_cols = ['CONTROL', 'LOCALE', 'CCBASIC', 'CCUGPROF', 'CCSIZSET']
imputed = pd.get_dummies(imputed, columns=categorical_cols)

Then, to make sure all features are treated equally, normalize them to the range 0-1.

In [7]:
# Scale every feature column with MinMaxScaler's default settings.
scaler = MinMaxScaler()
rescaled = scaler.fit_transform(imputed)

# Write the scaled values back so the DataFrame mirrors the array.
imputed[imputed.columns] = rescaled

In [8]:
# Dimensions of the scaled feature matrix returned by fit_transform.
rescaled.shape

(2473, 124)

# Compute the cosine similarity:

Import from scikit-learn and let it do all the work :-)

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

# With only X supplied, cosine_similarity compares the rows of `rescaled`
# against each other, giving the same square matrix as passing it twice.
c = cosine_similarity(rescaled)

That returns a square numpy array with the similarity of each school to every other.  Now we need to pipe it back into a dataframe so we can keep everything indexed:

In [10]:
# Label both axes of the similarity matrix with UNITID, then attach the
# institution name and ZIP from the raw table (aligned on UNITID).
cosim = pd.DataFrame(c, index=data.index, columns=data.index).assign(
    INSTNM=data['INSTNM'],
    ZIP=data['ZIP'],
)

Rearrange the columns so INSTNM and ZIP are at the front:

In [11]:
# Move the metadata columns to the front by *name* rather than by position.
# A positional "last two columns" slice shifts two UNITID columns to the front
# if this cell is ever executed a second time; selecting by name makes the
# cell safe to re-run.
meta_cols = ['INSTNM', 'ZIP']
cols = meta_cols + [col for col in cosim.columns.tolist() if col not in meta_cols]
cosim = cosim[cols]

cosim.head()

UNITID,INSTNM,ZIP,100654,100663,100690,100706,100724,100751,100812,100830,...,45891904,45891905,45891906,45891907,45896401,45896402,45897301,45897302,45897303,45897304
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100654,Alabama A & M University,35762,1.0,0.645446,0.559181,0.617786,0.852245,0.535816,0.52081,0.718865,...,0.397937,0.482052,0.482052,0.482052,0.482052,0.482052,0.480283,0.480283,0.482052,0.482052
100663,University of Alabama at Birmingham,35294-0110,0.645446,1.0,0.683236,0.92712,0.555639,0.736936,0.700375,0.82531,...,0.57222,0.568836,0.568836,0.568836,0.568836,0.568836,0.568492,0.568492,0.568836,0.568836
100690,Amridge University,36117-3553,0.559181,0.683236,1.0,0.659358,0.489259,0.600211,0.603639,0.663921,...,0.501795,0.4982,0.4982,0.4982,0.4982,0.4982,0.497286,0.497286,0.4982,0.4982
100706,University of Alabama in Huntsville,35899,0.617786,0.92712,0.659358,1.0,0.525419,0.730869,0.701689,0.885037,...,0.49675,0.493836,0.493836,0.493836,0.493836,0.493836,0.49343,0.49343,0.493836,0.493836
100724,Alabama State University,36104-0271,0.852245,0.555639,0.489259,0.525419,1.0,0.438288,0.450393,0.648715,...,0.353488,0.452585,0.452585,0.452585,0.452585,0.452585,0.450919,0.450919,0.452585,0.452585


# Look at a few examples:

In [12]:
# Find the UNITID of every school whose name contains 'Harvard'.
cosim.loc[cosim['INSTNM'].str.contains('Harvard'), 'INSTNM']

UNITID
166027    Harvard University
Name: INSTNM, dtype: object

In [13]:
# 15 highest similarity scores against UNITID 166027 (Harvard University).
(
    cosim.loc[:, ['INSTNM', 166027]]
    .sort_values(by=166027, ascending=False)
    .iloc[:15]
)

UNITID,INSTNM,166027
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1
166027,Harvard University,1.0
130794,Yale University,0.991754
198419,Duke University,0.984593
166683,Massachusetts Institute of Technology,0.981545
243744,Stanford University,0.95191
190150,Columbia University in the City of New York,0.946907
215062,University of Pennsylvania,0.946676
131496,Georgetown University,0.943269
144050,University of Chicago,0.943169
147767,Northwestern University,0.942208


In [14]:
# Find the UNITID of every school whose name contains 'Truman'.
cosim.loc[cosim['INSTNM'].str.contains('Truman'), 'INSTNM']

UNITID
178615    Truman State University
Name: INSTNM, dtype: object

In [15]:
# 15 highest similarity scores against UNITID 178615 (Truman State University).
(
    cosim.loc[:, ['INSTNM', 178615]]
    .sort_values(by=178615, ascending=False)
    .iloc[:15]
)

UNITID,INSTNM,178615
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1
178615,Truman State University,1.0
171128,Michigan Technological University,0.928301
171456,Northern Michigan University,0.90899
221768,The University of Tennessee-Martin,0.904439
240329,University of Wisconsin-La Crosse,0.878184
232566,Longwood University,0.87605
240480,University of Wisconsin-Stevens Point,0.874682
174251,University of Minnesota-Morris,0.874286
207865,Southwestern Oklahoma State University,0.872857
174233,University of Minnesota-Duluth,0.868881


In [16]:
# Find the UNITID of every school whose name contains 'University of Utah'.
cosim.loc[cosim['INSTNM'].str.contains('University of Utah'), 'INSTNM']

UNITID
230764    University of Utah
Name: INSTNM, dtype: object

In [17]:
# 15 highest similarity scores against UNITID 230764 (University of Utah).
(
    cosim.loc[:, ['INSTNM', 230764]]
    .sort_values(by=230764, ascending=False)
    .iloc[:15]
)

UNITID,INSTNM,230764
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1
230764,University of Utah,1.0
100663,University of Alabama at Birmingham,0.985271
104151,Arizona State University-Tempe,0.928027
209551,University of Oregon,0.927157
132903,University of Central Florida,0.925006
227216,University of North Texas,0.923296
234030,Virginia Commonwealth University,0.922988
137351,University of South Florida-Main Campus,0.922127
100706,University of Alabama in Huntsville,0.91946
134097,Florida State University,0.918518


In [18]:
# Find the UNITID of every school whose name contains 'University of Texas at Austin'.
cosim.loc[cosim['INSTNM'].str.contains('University of Texas at Austin'), 'INSTNM']

UNITID
228778    The University of Texas at Austin
Name: INSTNM, dtype: object

In [19]:
# 15 highest similarity scores against UNITID 228778 (The University of Texas at Austin).
(
    cosim.loc[:, ['INSTNM', 228778]]
    .sort_values(by=228778, ascending=False)
    .iloc[:15]
)

UNITID,INSTNM,228778
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1
228778,The University of Texas at Austin,1.0
174066,University of Minnesota-Twin Cities,0.991985
204796,Ohio State University-Main Campus,0.990277
110680,University of California-San Diego,0.942476
236948,University of Washington-Seattle Campus,0.942263
110662,University of California-Los Angeles,0.941085
134097,Florida State University,0.940699
215293,University of Pittsburgh-Pittsburgh Campus,0.931637
181464,University of Nebraska-Lincoln,0.928655
137351,University of South Florida-Main Campus,0.928585


In [20]:
# Find the UNITID of every school whose name contains the given string.
cosim.loc[
    cosim['INSTNM'].str.contains('Missouri University of Science and Technology'),
    'INSTNM',
]

UNITID
178411    Missouri University of Science and Technology
Name: INSTNM, dtype: object

In [21]:
# 15 highest similarity scores against UNITID 178411
# (Missouri University of Science and Technology).
(
    cosim.loc[:, ['INSTNM', 178411]]
    .sort_values(by=178411, ascending=False)
    .iloc[:15]
)

UNITID,INSTNM,178411
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1
178411,Missouri University of Science and Technology,1.0
196185,SUNY Oneonta,0.896345
100858,Auburn University,0.867977
171128,Michigan Technological University,0.861937
190044,Clarkson University,0.851076
196167,SUNY College at Geneseo,0.842339
219356,South Dakota State University,0.842231
240727,University of Wyoming,0.841329
232566,Longwood University,0.840242
176017,University of Mississippi,0.838846


In [22]:
# Find the UNITID of every school whose name contains the given string.
cosim.loc[
    cosim['INSTNM'].str.contains('California Institute of Technology'),
    'INSTNM',
]

UNITID
110404    California Institute of Technology
Name: INSTNM, dtype: object

In [23]:
# 15 highest similarity scores against UNITID 110404 (California Institute of Technology).
(
    cosim.loc[:, ['INSTNM', 110404]]
    .sort_values(by=110404, ascending=False)
    .iloc[:15]
)

UNITID,INSTNM,110404
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1
110404,California Institute of Technology,1.0
166683,Massachusetts Institute of Technology,0.933479
217156,Brown University,0.921158
198419,Duke University,0.921051
195030,University of Rochester,0.920014
130794,Yale University,0.919781
130590,Trinity College,0.915531
166027,Harvard University,0.910323
166124,College of the Holy Cross,0.905082
214175,Muhlenberg College,0.901617


In [24]:
# Find the UNITID of every school whose name contains 'Stanford'.
cosim.loc[cosim['INSTNM'].str.contains('Stanford'), 'INSTNM']

UNITID
243744    Stanford University
Name: INSTNM, dtype: object

In [25]:
# 15 highest similarity scores against UNITID 243744 (Stanford University).
(
    cosim.loc[:, ['INSTNM', 243744]]
    .sort_values(by=243744, ascending=False)
    .iloc[:15]
)

UNITID,INSTNM,243744
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1
243744,Stanford University,1.0
179867,Washington University in St Louis,0.985871
139658,Emory University,0.984868
152080,University of Notre Dame,0.982777
130794,Yale University,0.956181
215062,University of Pennsylvania,0.954169
198419,Duke University,0.953662
190415,Cornell University,0.952936
166027,Harvard University,0.95191
144050,University of Chicago,0.951377


In [26]:
# Find the UNITID of every school whose name contains 'Brigham Young'
# (this matches more than one campus).
cosim.loc[cosim['INSTNM'].str.contains('Brigham Young'), 'INSTNM']

UNITID
142522     Brigham Young University-Idaho
230038     Brigham Young University-Provo
230047    Brigham Young University-Hawaii
Name: INSTNM, dtype: object

In [27]:
# 15 highest similarity scores against UNITID 230038 (Brigham Young University-Provo).
(
    cosim.loc[:, ['INSTNM', 230038]]
    .sort_values(by=230038, ascending=False)
    .iloc[:15]
)

UNITID,INSTNM,230038
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1
230038,Brigham Young University-Provo,1.0
199847,Wake Forest University,0.922102
165334,Clark University,0.919623
223232,Baylor University,0.916981
202480,University of Dayton,0.912999
196413,Syracuse University,0.910909
236328,University of Puget Sound,0.867318
130590,Trinity College,0.866908
213543,Lehigh University,0.866369
214175,Muhlenberg College,0.866092


In [28]:
# Show the full similarity row(s) for schools whose name contains 'Calvin College'.
cosim.loc[cosim['INSTNM'].str.contains('Calvin College')]

UNITID,INSTNM,ZIP,100654,100663,100690,100706,100724,100751,100812,100830,...,45891904,45891905,45891906,45891907,45896401,45896402,45897301,45897302,45897303,45897304
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
169080,Calvin College,49546,0.616807,0.730682,0.787437,0.723308,0.452493,0.683874,0.638637,0.690684,...,0.467665,0.463166,0.463166,0.463166,0.463166,0.463166,0.462373,0.462373,0.463166,0.463166


In [29]:
# 15 highest similarity scores against UNITID 169080 (Calvin College).
(
    cosim.loc[:, ['INSTNM', 169080]]
    .sort_values(by=169080, ascending=False)
    .iloc[:15]
)

UNITID,INSTNM,169080
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1
169080,Calvin College,1.0
233374,University of Richmond,0.967149
153269,Drake University,0.93871
217402,Providence College,0.936098
236328,University of Puget Sound,0.930985
153144,Coe College,0.929162
175980,Millsaps College,0.928683
210401,Willamette University,0.928147
170301,Hope College,0.928019
195030,University of Rochester,0.92693


In [30]:
# Show the full similarity row(s) for schools whose name contains 'Spark'.
cosim.loc[cosim['INSTNM'].str.contains('Spark')]

UNITID,INSTNM,ZIP,100654,100663,100690,100706,100724,100751,100812,100830,...,45891904,45891905,45891906,45891907,45896401,45896402,45897301,45897302,45897303,45897304
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21997601,Spark,370674896,0.530368,0.724319,0.728839,0.72157,0.439184,0.672018,0.629783,0.668045,...,0.46743,0.464193,0.464193,0.464193,0.464193,0.464193,0.46376,0.46376,0.464193,0.464193


In [31]:
# 15 highest similarity scores against UNITID 21997601 (Spark).
(
    cosim.loc[:, ['INSTNM', 21997601]]
    .sort_values(by=21997601, ascending=False)
    .iloc[:15]
)

UNITID,INSTNM,21997601
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1
21997601,Spark,1.0
459736,Touro University California,0.932062
176619,Assemblies of God Theological Seminary,0.931281
175980,Millsaps College,0.928421
153144,Coe College,0.926708
150534,University of Evansville,0.923899
236328,University of Puget Sound,0.923831
224323,University of Dallas,0.92171
210401,Willamette University,0.916698
133492,Eckerd College,0.916005


In [32]:
# Show the full similarity row(s) for schools whose name contains 'Lipscomb'.
cosim.loc[cosim['INSTNM'].str.contains('Lipscomb')]

UNITID,INSTNM,ZIP,100654,100663,100690,100706,100724,100751,100812,100830,...,45891904,45891905,45891906,45891907,45896401,45896402,45897301,45897302,45897303,45897304
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
219976,Lipscomb University,37204-3951,0.615736,0.659798,0.658293,0.654701,0.453526,0.728361,0.627792,0.681175,...,0.550607,0.625308,0.625308,0.625308,0.625308,0.625308,0.624928,0.624928,0.625308,0.625308


In [33]:
# 15 highest similarity scores against UNITID 219976 (Lipscomb University).
(
    cosim.loc[:, ['INSTNM', 219976]]
    .sort_values(by=219976, ascending=False)
    .iloc[:15]
)

UNITID,INSTNM,219976
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1
219976,Lipscomb University,1.0
173045,Augsburg College,0.986522
215770,Saint Joseph's University,0.969583
137847,The University of Tampa,0.9292
121309,Point Loma Nazarene University,0.927862
16822702,Wentworth Institute of Technology,0.926558
228149,St Mary's University,0.924573
107044,Harding University,0.924372
181002,Creighton University,0.923619
152600,Valparaiso University,0.923612


### Schools most similar to University of Phoenix-St Louis Campus:

In [34]:
# 30 highest similarity scores against UNITID 439279
# (University of Phoenix-St Louis Campus), most similar first.
(
    cosim.loc[:, ['INSTNM', 439279]]
    .sort_values(by=439279, ascending=False)
    .iloc[:30]
)

UNITID,INSTNM,439279
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1
439279,University of Phoenix-St Louis Campus,1.0
450456,University of Phoenix-Birmingham Campus,0.894882
474960,University of Phoenix-Lafayette Campus,0.894289
450483,University of Phoenix-Washington DC Campus,0.893759
442161,University of Phoenix-Chicago Campus,0.893211
448567,University of Phoenix-Columbia Campus,0.892975
440448,University of Phoenix-Houston Campus,0.892088
448822,University of Phoenix-Fairfield County Campus,0.891135
443924,University of Phoenix-Little Rock Campus,0.890514
474951,University of Phoenix-Baton Rouge Campus,0.889546


## I'm amazed how much better a job this is doing just splitting out the categorical variables!!!

# Finally, save to csv.

In [35]:
# Write the labelled similarity matrix (UNITID index plus INSTNM and ZIP) to disk.
cosim.to_csv(path_or_buf='data/similarity_index.csv')