In [1]:
%matplotlib notebook
import hdbscan
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy import stats
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
# Use seaborn's "dark" theme for any figures drawn in this notebook.
sns.set_style(style="dark")



In [2]:
# Raise the per-cell display width so long text values (e.g. institution names)
# are not truncated. The fully qualified option name is used because pandas
# resolves abbreviated names by pattern matching, which can break if another
# option with a similar name is ever introduced.
pd.set_option('display.max_colwidth', 1000)

# Load Data

Here, I'm reading in the imputed data set Julia put together, and pulling in some metadata from the non-imputed data.

In [3]:
# Imputed feature table: these are the values that get rescaled and compared below.
imputed = pd.read_csv('data/scorecard_imputed.csv')
# Non-imputed table: read for its metadata columns (UNITID, INSTNM, ZIP are used later).
data = pd.read_csv('data/scorecard_reduced_features.csv')

Then, to make sure all features are treated equally, normalize them to the range 0-1.

In [4]:
# Scale every imputed feature to the 0-1 range; `rescaled` is reused later as
# the input to the similarity computation.
scaler = MinMaxScaler()
rescaled = scaler.fit_transform(imputed)

# Overwrite the matching columns of `data` with the rescaled values.
# NOTE(review): this assumes `data` and `imputed` list the schools in the same
# row order -- confirm against how the two CSVs were produced.
data[imputed.columns] = rescaled

Set the index of the resulting dataset to the UNITID, and inspect the dataframe:

In [5]:
# Index the rows by UNITID. The guard makes this cell safe to run more than
# once: after the first run UNITID is already the index and no longer a column,
# so an unconditional set_index would raise a KeyError.
if data.index.name != 'UNITID':
    data = data.set_index('UNITID')
data.head()

Unnamed: 0_level_0,INSTNM,ZIP,HCM2,CONTROL,LOCALE,CCBASIC,HBCU,PBI,ANNHI,TRIBAL,...,LO_INC_RPY_3YR_RT_SUPP,MD_INC_RPY_3YR_RT_SUPP,HI_INC_RPY_3YR_RT_SUPP,NONCOM_RPY_3YR_RT_SUPP,FIRSTGEN_RPY_3YR_RT_SUPP,PCT_LIBERAL_ARTS,PCT_VOCATIONAL,PCT_RELIGIOUS,COST,PCT_PROFESSIONAL
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100654,Alabama A & M University,35762,0.0,0.0,0.03125,0.53125,1.0,0.0,0.0,0.0,...,0.388809,0.334182,0.291972,0.293883,0.342166,0.5346,0.0629,1.390554e-16,0.191513,0.2037
100663,University of Alabama at Birmingham,35294-0110,0.0,0.0,0.03125,0.4375,0.0,0.0,0.0,0.0,...,0.694598,0.723495,0.777123,0.637756,0.716461,0.4379,4.024558e-18,0.0095,0.208693,0.29025
100690,Amridge University,36117-3553,0.0,0.5,0.03125,0.625,0.0,0.0,0.0,0.0,...,0.554431,0.650066,0.74038,0.563934,0.580768,0.6301,4.024558e-18,0.2603,0.088812,0.0548
100706,University of Alabama in Huntsville,35899,0.0,0.0,0.03125,0.4375,0.0,0.0,0.0,0.0,...,0.691439,0.789183,0.818622,0.650204,0.746823,0.2764,4.024558e-18,0.0082,0.213619,0.3576
100724,Alabama State University,36104-0271,0.0,0.0,0.03125,0.53125,1.0,0.0,0.0,0.0,...,0.236754,0.249519,0.183333,0.228715,0.21909,0.5449,4.024558e-18,1.390554e-16,0.168317,0.28665


# Compute the cosine similarity:

Import from scikit-learn and let it do all the work :-)

In [6]:
# `cosine_similarity` is imported with the other dependencies in the first cell.
# Called with a single matrix, it compares every row against every other row of
# that same matrix, which is exactly the school-vs-school comparison wanted here.
c = cosine_similarity(rescaled)

That returns a square numpy array with the similarity of each school to every other.  Now we need to pipe it back into a dataframe so we can keep everything indexed:

In [7]:
# Label both axes of the similarity matrix with the UNITID index of `data`.
unit_ids = data.index
cosim = pd.DataFrame(data=c, index=unit_ids, columns=unit_ids)

# Attach the human-readable metadata; pandas aligns these on the UNITID index.
for meta_col in ('INSTNM', 'ZIP'):
    cosim[meta_col] = data[meta_col]

Rearrange the columns so INSTNM and ZIP are at the front:

In [8]:
# Put the two label columns first, followed by every similarity column in its
# existing order. The labels are picked by name rather than by "the last two
# columns", so running this cell again leaves the column order unchanged
# instead of rotating two UNITID columns to the front.
cols = cosim.columns.tolist()
label_cols = ['INSTNM', 'ZIP']
cosim = cosim[label_cols + [col for col in cols if col not in label_cols]]

cosim.head()

UNITID,INSTNM,ZIP,100654,100663,100690,100706,100724,100751,100812,100830,...,45891904,45891905,45891906,45891907,45896401,45896402,45897301,45897302,45897303,45897304
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100654,Alabama A & M University,35762,1.0,0.78633,0.760208,0.747198,0.903731,0.723629,0.717689,0.799929,...,0.658874,0.658874,0.658874,0.658874,0.658874,0.658874,0.658874,0.658874,0.658874,0.658874
100663,University of Alabama at Birmingham,35294-0110,0.78633,1.0,0.915292,0.988881,0.681336,0.983041,0.936378,0.948351,...,0.751214,0.751214,0.751214,0.751214,0.751214,0.751214,0.751214,0.751214,0.751214,0.751214
100690,Amridge University,36117-3553,0.760208,0.915292,1.0,0.886403,0.709962,0.885201,0.903969,0.909588,...,0.844421,0.844421,0.844421,0.844421,0.844421,0.844421,0.844421,0.844421,0.844421,0.844421
100706,University of Alabama in Huntsville,35899,0.747198,0.988881,0.886403,1.0,0.632737,0.981303,0.945429,0.934215,...,0.759726,0.759726,0.759726,0.759726,0.759726,0.759726,0.759726,0.759726,0.759726,0.759726
100724,Alabama State University,36104-0271,0.903731,0.681336,0.709962,0.632737,1.0,0.618028,0.642089,0.70576,...,0.664862,0.664862,0.664862,0.664862,0.664862,0.664862,0.664862,0.664862,0.664862,0.664862


# Look at a few examples:

In [9]:
# Look up the UNITID of every institution whose name contains "Harvard".
name_matches = cosim['INSTNM'].str.contains('Harvard')
cosim.loc[name_matches, 'INSTNM']

UNITID
166027    Harvard University
Name: INSTNM, dtype: object

In [10]:
# The 15 highest similarity scores against UNITID 166027 (Harvard University).
harvard_id = 166027
cosim[['INSTNM', harvard_id]].sort_values(by=harvard_id, ascending=False).head(15)

UNITID,INSTNM,166027
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1
166027,Harvard University,1.0
130794,Yale University,0.990052
243744,Stanford University,0.987735
190150,Columbia University in the City of New York,0.984804
215062,University of Pennsylvania,0.983386
198419,Duke University,0.981275
131496,Georgetown University,0.979903
144050,University of Chicago,0.979379
147767,Northwestern University,0.978448
190415,Cornell University,0.978352


In [11]:
# Look up the UNITID of every institution whose name contains "Truman".
name_matches = cosim['INSTNM'].str.contains('Truman')
cosim.loc[name_matches, 'INSTNM']

UNITID
178615    Truman State University
Name: INSTNM, dtype: object

In [12]:
# The 15 highest similarity scores against UNITID 178615 (Truman State University).
truman_id = 178615
cosim[['INSTNM', truman_id]].sort_values(by=truman_id, ascending=False).head(15)

UNITID,INSTNM,178615
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1
178615,Truman State University,1.0
197869,Appalachian State University,0.993833
207388,Oklahoma State University-Main Campus,0.992497
176017,University of Mississippi,0.991404
176080,Mississippi State University,0.989908
23693902,Washington State University-Spokane,0.988417
139861,Georgia College and State University,0.9884
236939,Washington State University,0.986461
174251,University of Minnesota-Morris,0.986216
163912,St Mary's College of Maryland,0.985581


In [13]:
# Look up the UNITID of every institution whose name contains "University of Utah".
name_matches = cosim['INSTNM'].str.contains('University of Utah')
cosim.loc[name_matches, 'INSTNM']

UNITID
230764    University of Utah
Name: INSTNM, dtype: object

In [14]:
# The 15 highest similarity scores against UNITID 230764 (University of Utah).
utah_id = 230764
cosim[['INSTNM', utah_id]].sort_values(by=utah_id, ascending=False).head(15)

UNITID,INSTNM,230764
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1
230764,University of Utah,1.0
209542,Oregon State University,0.99188
155317,University of Kansas,0.991545
182290,University of Nevada-Reno,0.991231
181464,University of Nebraska-Lincoln,0.990149
200280,University of North Dakota,0.990056
106397,University of Arkansas,0.98994
126818,Colorado State University-Fort Collins,0.989721
136172,University of North Florida,0.989581
157085,University of Kentucky,0.989494


In [15]:
# Look up the UNITID of every institution whose name contains
# "University of Texas at Austin".
name_matches = cosim['INSTNM'].str.contains('University of Texas at Austin')
cosim.loc[name_matches, 'INSTNM']

UNITID
228778    The University of Texas at Austin
Name: INSTNM, dtype: object

In [16]:
# The 15 highest similarity scores against UNITID 228778
# (The University of Texas at Austin).
ut_austin_id = 228778
cosim[['INSTNM', ut_austin_id]].sort_values(by=ut_austin_id, ascending=False).head(15)

UNITID,INSTNM,228778
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1
228778,The University of Texas at Austin,1.0
134130,University of Florida,0.99427
170976,University of Michigan-Ann Arbor,0.994043
234076,University of Virginia-Main Campus,0.992805
236948,University of Washington-Seattle Campus,0.992653
228723,Texas A & M University-College Station,0.991997
134097,Florida State University,0.991862
196079,SUNY at Binghamton,0.991292
110680,University of California-San Diego,0.991271
199120,University of North Carolina at Chapel Hill,0.991104


In [17]:
# Look up the UNITID of every institution whose name contains
# "California Institute of Technology".
name_matches = cosim['INSTNM'].str.contains('California Institute of Technology')
cosim.loc[name_matches, 'INSTNM']

UNITID
110404    California Institute of Technology
Name: INSTNM, dtype: object

In [18]:
# The 15 highest similarity scores against UNITID 110404
# (California Institute of Technology).
caltech_id = 110404
cosim[['INSTNM', caltech_id]].sort_values(by=caltech_id, ascending=False).head(15)

UNITID,INSTNM,110404
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1
110404,California Institute of Technology,1.0
166683,Massachusetts Institute of Technology,0.972223
227757,Rice University,0.971557
211440,Carnegie Mellon University,0.971167
115409,Harvey Mudd College,0.970876
186131,Princeton University,0.966013
167358,Northeastern University,0.965728
110635,University of California-Berkeley,0.965042
112260,Claremont McKenna College,0.964793
207971,University of Tulsa,0.964416


## Looks like this is doing a decent job.  Not amazing, as there are some possible oddities (U. of Tulsa is 10th on Caltech's list, counting Caltech itself!?!?), but pretty decent.

Maybe those two really are quite similar, and Tulsa just gets a bad rap ;-)

# Finally, save to csv.

In [19]:
# Write the full similarity table to disk; the UNITID index is kept and ends up
# as the first column of the CSV.
cosim.to_csv('data/similarity_index.csv', index=True)