## University Ranking Predictions

In [77]:
%matplotlib inline
import pandas as pd
import numpy as np
# Load the Shanghai ranking table and keep only rows that have a total score.
shanghai_data = pd.read_csv("data/shanghaiData.csv").dropna(subset=["total_score"])
shanghai_data.head()

Unnamed: 0,world_rank,university_name,national_rank,total_score,alumni,award,hici,ns,pub,pcp,year
0,1,Harvard University,1,100.0,100.0,100.0,100.0,100.0,100.0,72.4,2005
1,2,University of Cambridge,1,73.6,99.8,93.4,53.3,56.6,70.9,66.9,2005
2,3,Stanford University,2,73.4,41.1,72.2,88.5,70.9,72.3,65.0,2005
3,4,"University of California, Berkeley",3,72.8,71.8,76.0,69.4,73.9,72.2,52.7,2005
4,5,Massachusetts Institute of Technology (MIT),4,70.1,74.0,80.6,66.7,65.8,64.3,53.0,2005


In [78]:
# Build the modelling frame: each row keeps its own features and gains
# `next_score`, the total score of the same university in the following year.
prediction_columns = ['university_name', 'national_rank', 'total_score', 'alumni',
                      'award', 'hici', 'ns', 'pub', 'pcp', 'year']
# Explicit copy so adding a column never writes into a view of `shanghai_data`.
prediction_data = shanghai_data.loc[:, prediction_columns].copy()

# First total_score seen for every (university, year) pair. Rows without a
# university name are skipped because they cannot be matched across years.
score_by_name_year = {}
for name, year, score in zip(prediction_data.university_name,
                             prediction_data.year,
                             prediction_data.total_score):
    if pd.notnull(name):
        score_by_name_year.setdefault((name, year), score)

# One dictionary lookup per row instead of scanning the whole frame per row;
# NaN marks rows whose university has no entry for the following year.
prediction_data['next_score'] = [
    score_by_name_year.get((name, year + 1), np.nan)
    for name, year in zip(prediction_data.university_name, prediction_data.year)
]

# Rows without a following year have no target and cannot be used for training.
prediction_data = prediction_data[prediction_data.next_score.notnull()]
prediction_data.head()

Unnamed: 0,university_name,national_rank,total_score,alumni,award,hici,ns,pub,pcp,year,next_score
0,Harvard University,1,100.0,100.0,100.0,100.0,100.0,100.0,72.4,2005,100.0
1,University of Cambridge,1,73.6,99.8,93.4,53.3,56.6,70.9,66.9,2005,72.6
2,Stanford University,2,73.4,41.1,72.2,88.5,70.9,72.3,65.0,2005,72.5
3,"University of California, Berkeley",3,72.8,71.8,76.0,69.4,73.9,72.2,52.7,2005,72.1
4,Massachusetts Institute of Technology (MIT),4,70.1,74.0,80.6,66.7,65.8,64.3,53.0,2005,69.7


In [79]:
# `sklearn.cross_validation` was removed from scikit-learn; use the maintained
# module and fall back only on installations that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Features: the current year's scores; target: the following year's total score.
feature_columns = ['total_score', 'national_rank', 'alumni', 'award', 'hici', 'ns', 'pub', 'pcp', 'year']
X = prediction_data.loc[:, feature_columns]
y = prediction_data['next_score'].values

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [80]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler()
X_train = scaler.fit(X_train).transform(X_train)
X_test = scaler.transform(X_test)

In [81]:
from sklearn.linear_model import LinearRegression
# `sklearn.cross_validation` was removed from scikit-learn; use the maintained
# module and fall back only on installations that predate it.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# Fit on the whole training split so `clf` is usable after this cell.
clf = LinearRegression()
clf.fit(X_train, y_train)

# 5-fold cross-validation on the training split, using the estimator's default score.
cv_scores = cross_val_score(clf, X_train, y_train, cv=5)
cv_scores

array([ 0.99587734,  0.99691374,  0.99622141,  0.99441249,  0.99762031])

## Using Last Two Years

In [82]:
# Start the two-year experiment from the same columns of the cleaned ranking table.
two_year_columns = [
    'total_score', 'university_name', 'national_rank', 'alumni',
    'award', 'hici', 'ns', 'pub', 'pcp', 'year',
]
prediction_data_two_years = shanghai_data.loc[:, two_year_columns]
prediction_data_two_years.head()

Unnamed: 0,total_score,university_name,national_rank,alumni,award,hici,ns,pub,pcp,year
0,100.0,Harvard University,1,100.0,100.0,100.0,100.0,100.0,72.4,2005
1,73.6,University of Cambridge,1,99.8,93.4,53.3,56.6,70.9,66.9,2005
2,73.4,Stanford University,2,41.1,72.2,88.5,70.9,72.3,65.0,2005
3,72.8,"University of California, Berkeley",3,71.8,76.0,69.4,73.9,72.2,52.7,2005
4,70.1,Massachusetts Institute of Technology (MIT),4,74.0,80.6,66.7,65.8,64.3,53.0,2005


In [102]:
# NOTE: this narrows the frame to a single university, so every later cell
# that reads `prediction_data_two_years` only sees these rows.
is_cambridge = prediction_data_two_years['university_name'] == 'University of Cambridge'
prediction_data_two_years = prediction_data_two_years.loc[is_cambridge]
prediction_data_two_years.head()

Unnamed: 0,total_score,university_name,national_rank,alumni,award,hici,ns,pub,pcp,year,...,pub_back_1,pcp_back_1,total_score_back_2,national_rank_back_2,alumni_back_2,award_back_2,hici_back_2,ns_back_2,pub_back_2,pcp_back_2
1,73.6,University of Cambridge,1,99.8,93.4,53.3,56.6,70.9,66.9,2005,...,,,,,,,,,,
501,72.6,University of Cambridge,1,96.3,91.5,53.8,59.5,67.1,66.5,2006,...,,,,,,,,,,
1003,71.6,University of Cambridge,1,93.6,91.5,54.0,58.2,65.4,65.1,2007,...,67.1,66.5,73.6,1.0,99.8,93.4,53.3,56.6,70.9,66.9
1513,70.4,University of Cambridge,1,90.3,91.5,53.6,56.0,64.1,65.0,2008,...,65.4,65.1,72.6,1.0,96.3,91.5,53.8,59.5,67.1,66.5
2016,70.2,University of Cambridge,1,89.4,91.5,53.8,53.9,65.4,65.5,2009,...,64.1,65.0,71.6,1.0,93.6,91.5,54.0,58.2,65.4,65.1


In [100]:
# Add lagged copies of every feature: `<feature>_back_<k>` holds the value the
# same university had k years earlier. A row only receives lag values when ALL
# requested past years are available; otherwise every lag column stays NaN.
years_back = [1, 2]
# Each feature is listed once ('hici' used to appear twice).
features = ['total_score', 'national_rank', 'alumni', 'award', 'hici', 'ns', 'pub', 'pcp']

# Work on an explicit copy so the new columns are never written into a slice
# of another frame.
prediction_data_two_years = prediction_data_two_years.copy()

names = prediction_data_two_years['university_name'].values
years = prediction_data_two_years['year'].values

# Positional index of the first row seen for every (university, year) pair.
# Rows without a university name are skipped: they cannot be matched.
first_position = {}
for position, (name, year) in enumerate(zip(names, years)):
    if pd.notnull(name):
        first_position.setdefault((name, year), position)

# past_positions[i, j] is the position of row i's record from years_back[j]
# years earlier, or -1 when that record does not exist.
past_positions = np.array(
    [[first_position.get((name, year - back), -1) for back in years_back]
     for name, year in zip(names, years)],
    dtype=int,
).reshape(len(names), len(years_back))
has_full_history = (past_positions >= 0).all(axis=1)

for column_number, back in enumerate(years_back):
    for feature in features:
        # A -1 position picks an arbitrary row here; `.where` blanks it out below.
        past_values = prediction_data_two_years[feature].values[past_positions[:, column_number]]
        prediction_data_two_years[feature + '_back_' + str(back)] = pd.Series(
            past_values, index=prediction_data_two_years.index
        ).where(has_full_history)

# True marks a row WITHOUT complete history, i.e. a row to drop (the variable
# name is kept from the original notebook).
complete_filter = pd.Series(~has_full_history, index=prediction_data_two_years.index)
prediction_data_two_years_clean = prediction_data_two_years[~complete_filter]
prediction_data_two_years_clean

Unnamed: 0,total_score,university_name,national_rank,alumni,award,hici,ns,pub,pcp,year,...,pub_back_1,pcp_back_1,total_score_back_2,national_rank_back_2,alumni_back_2,award_back_2,hici_back_2,ns_back_2,pub_back_2,pcp_back_2
1003,71.6,University of Cambridge,1.0,93.6,91.5,54.0,58.2,65.4,65.1,2007,...,67.1,66.5,73.6,1.0,99.8,93.4,53.3,56.6,70.9,66.9
1513,70.4,University of Cambridge,1.0,90.3,91.5,53.6,56.0,64.1,65.0,2008,...,65.4,65.1,72.6,1.0,96.3,91.5,53.8,59.5,67.1,66.5
2016,70.2,University of Cambridge,1.0,89.4,91.5,53.8,53.9,65.4,65.5,2009,...,64.1,65.0,71.6,1.0,93.6,91.5,54.0,58.2,65.4,65.1
2518,69.6,University of Cambridge,1.0,88.5,92.6,53.9,54.3,65.7,53.1,2010,...,65.4,65.5,70.4,1.0,90.3,91.5,53.6,56.0,64.1,65.0
3018,70.0,University of Cambridge,1.0,87.1,96.7,54.5,54.1,65.1,52.0,2011,...,65.7,53.1,70.2,1.0,89.4,91.5,53.8,53.9,65.4,65.5
3518,69.8,University of Cambridge,1.0,80.3,97.2,54.6,55.0,65.9,52.1,2012,...,65.1,52.0,69.6,1.0,88.5,92.6,53.9,54.3,65.7,53.1
3802,69.6,University of Cambridge,1.0,79.1,97.3,54.6,54.0,66.2,53.3,2013,...,65.9,52.1,70.0,1.0,87.1,96.7,54.5,54.1,65.1,52.0
3901,69.2,University of Cambridge,1.0,79.1,96.6,50.8,56.2,66.5,55.2,2014,...,66.2,53.3,69.8,1.0,80.3,97.2,54.6,55.0,65.9,52.1
4401,68.8,University of Cambridge,1.0,77.1,96.6,50.8,55.6,66.4,55.8,2015,...,66.5,55.2,69.6,1.0,79.1,97.3,54.6,54.0,66.2,53.3


In [None]:
to_predict = ['total_score', 'national_rank', 'alumni', 'award', 'hici', 'ns', 'pub', 'pcp', 'year']

# Model inputs: every current-year feature, its lagged copies, and the year.
# The list is built separately instead of appending to `features` while
# iterating over it, which would never terminate. Repeated names are dropped
# so X has no duplicate columns.
base_features = []
for feature in features:
    if feature not in base_features:
        base_features.append(feature)
model_features = list(base_features)
for feature in base_features:
    for year in years_back:
        model_features.append(feature + '_back_' + str(year))
model_features.append('year')

# Target: the same university's total score in the following year. It is
# looked up in the frame that still contains every year, unless an earlier
# step already provided a `next_total_score` column.
model_data = prediction_data_two_years_clean.copy()
if 'next_total_score' not in model_data.columns:
    score_by_name_year = {}
    for name, year, score in zip(prediction_data_two_years.university_name,
                                 prediction_data_two_years.year,
                                 prediction_data_two_years.total_score):
        if pd.notnull(name):
            score_by_name_year.setdefault((name, year), score)
    model_data['next_total_score'] = [
        score_by_name_year.get((name, year + 1), np.nan)
        for name, year in zip(model_data.university_name, model_data.year)
    ]
# Rows without a following year have no target to learn from.
model_data = model_data[model_data.next_total_score.notnull()]

X = model_data.loc[:, model_features]
y = model_data['next_total_score'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Scale with statistics learned from the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

clf = LinearRegression()
clf.fit(X_train, y_train)
cv_scores = cross_val_score(clf, X_train, y_train, cv=5)
cv_scores
# Just going to use Linear for now to save time