## University Ranking Predictions

In [201]:
%matplotlib inline
import pandas as pd
import numpy as np
shanghai_data = pd.read_csv("data/shanghaiData.csv")
shanghai_data = shanghai_data[shanghai_data.total_score.notnull()]
shanghai_data.head()
shanghai_data[0:20]

Unnamed: 0,world_rank,university_name,national_rank,total_score,alumni,award,hici,ns,pub,pcp,year
0,1,Harvard University,1,100.0,100.0,100.0,100.0,100.0,100.0,72.4,2005
1,2,University of Cambridge,1,73.6,99.8,93.4,53.3,56.6,70.9,66.9,2005
2,3,Stanford University,2,73.4,41.1,72.2,88.5,70.9,72.3,65.0,2005
3,4,"University of California, Berkeley",3,72.8,71.8,76.0,69.4,73.9,72.2,52.7,2005
4,5,Massachusetts Institute of Technology (MIT),4,70.1,74.0,80.6,66.7,65.8,64.3,53.0,2005
5,6,California Institute of Technology,5,67.1,59.2,68.6,59.8,65.8,52.5,100.0,2005
6,7,Columbia University,6,62.3,79.4,60.6,56.1,54.2,69.5,45.4,2005
7,8,Princeton University,7,60.9,63.4,76.8,60.9,48.7,48.5,59.1,2005
8,9,University of Chicago,8,60.1,75.6,81.9,50.3,44.7,56.4,42.2,2005
9,10,University of Oxford,2,59.7,64.3,59.1,48.4,55.6,68.4,53.2,2005


In [202]:
prediction_data = shanghai_data.loc[:,['total_score', 'university_name', 'national_rank', 'alumni', 'award', 'hici', 'ns', 'pub', 'pcp', 'year']]

In [203]:
universities_to_use = ['Harvard University', 'University of Cambridge', 
                       'Stanford University', 'University of California, Berkeley', 
                       'Massachusetts Institute of Technology (MIT)', 'California Institute of Technology',
                       'Columbia University', 'Princeton University',
                       'University of Chicago', 'University of Oxford',
                       'Yale University', 'Cornell University',
                       'University of California, San Diego', 'University of California, Los Angeles',
                       'University of Pennsylvania', 'University of Wisconsin - Madison',
                       'University of Washington', 'University of California, San Francisco',
                       'The Johns Hopkins University', 'The University of Tokyo'
                          ]
prediction_data = prediction_data[prediction_data.university_name.isin(universities_to_use)]

## Using Multiple Past Years

In [204]:
years_back = [1, 2, 3]
features = ['total_score', 'national_rank', 'alumni', 'award', 'hici', 'ns', 'hici', 'pub', 'pcp', 'year']
for index, row in prediction_data.iterrows():
    is_complete = True
    past_rows = {}
    for year in years_back:
        past_row = prediction_data[(prediction_data.university_name == row.university_name) & (prediction_data.year == row.year-year)]
        past_rows[year] = past_row
        if len(past_row.values) == 0:
            is_complete = False
    
    if (is_complete):
        for year in years_back:
            past_row = past_rows[year]
            for feature in features:
                prediction_data.loc[index, feature + '_back_' + str(year)] = past_row[feature].values[0]

# TODO fix lambda
complete_filter = prediction_data.apply(lambda row : np.isnan(row['total_score_back_' + str(len(years_back))]),axis=1) 
prediction_data_clean = prediction_data[~complete_filter]
prediction_data_clean.head()

Unnamed: 0,total_score,university_name,national_rank,alumni,award,hici,ns,pub,pcp,year,...,year_back_2,total_score_back_3,national_rank_back_3,alumni_back_3,award_back_3,hici_back_3,ns_back_3,pub_back_3,pcp_back_3,year_back_3
1510,100.0,Harvard University,1,100.0,100.0,100.0,100.0,100.0,74.1,2008,...,2006.0,100.0,1,100.0,100.0,100.0,100.0,100.0,72.4,2005.0
1511,73.7,Stanford University,2,40.0,78.7,86.6,68.9,71.6,66.9,2008,...,2006.0,73.4,2,41.1,72.2,88.5,70.9,72.3,65.0,2005.0
1512,71.4,"University of California, Berkeley",3,69.0,77.1,68.8,70.6,70.0,53.0,2008,...,2006.0,72.8,3,71.8,76.0,69.4,73.9,72.2,52.7,2005.0
1513,70.4,University of Cambridge,1,90.3,91.5,53.6,56.0,64.1,65.0,2008,...,2006.0,73.6,1,99.8,93.4,53.3,56.6,70.9,66.9,2005.0
1514,69.6,Massachusetts Institute of Technology (MIT),4,71.0,80.6,65.6,68.7,61.6,53.9,2008,...,2006.0,70.1,4,74.0,80.6,66.7,65.8,64.3,53.0,2005.0


In [205]:
to_predict = ['total_score', 'national_rank', 'alumni', 'award', 'hici', 'ns', 'hici', 'pub', 'pcp', 'year']
features = []
for feature in to_predict:
    for year in years_back:
        features.append(feature + '_back_' + str(year))

In [206]:
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
X = prediction_data_clean.loc[:,features]
# Cheating a bit but scale everything together
# Dont do for now, not a huge deal and dont know how to work with scores
#scaler = StandardScaler()
#scaler.fit(X_train)
#X = scaler.transform(X)

In [207]:
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score
clf_map = {}
# Just going to use Linear for now to save time
# Don't know how to split training set properly here to save test set
for feature in to_predict:
    clf = LinearRegression()
    y = np.ravel(prediction_data_clean.loc[:,[feature]])
    clf.fit(X, y)
    clf_map[feature] = clf
    cv_scores = cross_val_score(clf, X, y, cv=5)
    print (cv_scores)

[ 0.99445793  0.99713959  0.99474322  0.99647456  0.99285132]
[ 0.9932103   0.98896187  0.98257863  0.98990346  0.98068029]
[ 0.99180274  0.99568972  0.98665561  0.99670775  0.98974022]
[ 0.99777309  0.99639752  0.99711292  0.99534411  0.99007751]
[ 0.97547248  0.98672715  0.97750977  0.94646014  0.91085891]
[ 0.97408061  0.978302    0.97245334  0.98488654  0.8654687 ]
[ 0.97547248  0.98672715  0.97750977  0.94646014  0.91085891]
[ 0.98326928  0.98300697  0.9783927   0.97750603  0.9623903 ]
[ 0.94954107  0.82642713  0.98589749  0.99143711  0.95334444]
[ 1.  1.  1.  1.  1.]


In [208]:
prediction_data_next = prediction_data_clean
school_name_to_predict = ['Princeton University']
university_data = prediction_data_next[(prediction_data_next.university_name.isin(school_name_to_predict)) & (prediction_data_next.year == 2008)]
X = university_data.loc[:,features]
clf_map['total_score'].predict(X)
X

Unnamed: 0,total_score_back_1,total_score_back_2,total_score_back_3,national_rank_back_1,national_rank_back_2,national_rank_back_3,alumni_back_1,alumni_back_2,alumni_back_3,award_back_1,...,hici_back_3,pub_back_1,pub_back_2,pub_back_3,pcp_back_1,pcp_back_2,pcp_back_3,year_back_1,year_back_2,year_back_3
1517,59.5,58.6,60.9,7,7,7,62.3,61.1,63.4,80.4,...,60.9,46.5,47.3,48.5,58.9,58.0,59.1,2007.0,2006.0,2005.0


In [209]:
scores = [X['total_score_back_3'], X['total_score_back_2'], X['total_score_back_1']]
print (scores)
X_continue = X
for i in range(7):
    X_original = X_continue
    for feature in to_predict:
        X_continue[feature + '_back_1'] = clf_map[feature].predict(X_original)
        X_continue[feature + '_back_2'] = X_original[feature + '_back_1']
    scores.append(X_continue['total_score_back_1'])
X_continue

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


[1517    60.9
Name: total_score_back_3, dtype: float64, 1517    58.6
Name: total_score_back_2, dtype: float64, 1517    59.5
Name: total_score_back_1, dtype: float64]


Unnamed: 0,total_score_back_1,total_score_back_2,total_score_back_3,national_rank_back_1,national_rank_back_2,national_rank_back_3,alumni_back_1,alumni_back_2,alumni_back_3,award_back_1,...,hici_back_3,pub_back_1,pub_back_2,pub_back_3,pcp_back_1,pcp_back_2,pcp_back_3,year_back_1,year_back_2,year_back_3
1517,54.381947,54.381947,60.9,-2.83913,-2.83913,7,85.47694,85.47694,63.4,118.921215,...,60.9,-37.741144,-37.741144,48.5,107.236272,107.236272,59.1,2010.736626,2010.736626,2005.0


In [210]:
scores

[1517    60.9
 Name: total_score_back_3, dtype: float64, 1517    58.6
 Name: total_score_back_2, dtype: float64, 1517    59.5
 Name: total_score_back_1, dtype: float64, 1517    59.020415
 Name: total_score_back_1, dtype: float64, 1517    56.57833
 Name: total_score_back_1, dtype: float64, 1517    55.688631
 Name: total_score_back_1, dtype: float64, 1517    55.650679
 Name: total_score_back_1, dtype: float64, 1517    55.427518
 Name: total_score_back_1, dtype: float64, 1517    54.901148
 Name: total_score_back_1, dtype: float64, 1517    54.381947
 Name: total_score_back_1, dtype: float64]

In [198]:
prediction_data_next[prediction_data_next.university_name.isin(school_name_to_predict)]

Unnamed: 0,total_score,university_name,national_rank,alumni,award,hici,ns,pub,pcp,year,...,year_back_2,total_score_back_3,national_rank_back_3,alumni_back_3,award_back_3,hici_back_3,ns_back_3,pub_back_3,pcp_back_3,year_back_3
1517,58.9,Princeton University,7.0,59.3,80.4,61.9,40.5,44.8,59.3,2008,...,2006.0,60.9,7,63.4,76.8,60.9,48.7,48.5,59.1,2005.0
2020,60.2,Princeton University,7.0,57.8,85.2,61.6,41.5,45.7,61.4,2009,...,2007.0,58.6,7,61.1,75.3,59.6,43.5,47.3,58.0,2006.0
2520,60.8,Princeton University,6.0,56.4,84.8,61.1,43.3,44.3,65.5,2010,...,2008.0,59.5,7,62.3,80.4,59.3,42.9,46.5,58.9,2007.0
3020,61.2,Princeton University,6.0,56.7,87.1,62.1,43.8,43.4,64.2,2011,...,2009.0,58.9,7,59.3,80.4,61.9,40.5,44.8,59.3,2008.0
3520,62.1,Princeton University,6.0,52.3,91.3,62.2,44.4,44.5,66.3,2012,...,2010.0,60.2,7,57.8,85.2,61.6,41.5,45.7,61.4,2009.0
3804,61.9,Princeton University,6.0,52.9,89.2,62.2,45.8,44.0,66.9,2013,...,2011.0,60.8,6,56.4,84.8,61.1,43.3,44.3,65.5,2010.0
3902,60.7,Princeton University,5.0,52.1,88.5,57.1,46.2,44.2,68.1,2014,...,2012.0,61.2,6,56.7,87.1,62.1,43.8,43.4,64.2,2011.0
4402,61.0,Princeton University,5.0,53.3,93.4,57.1,43.0,42.4,70.3,2015,...,2013.0,62.1,6,52.3,91.3,62.2,44.4,44.5,66.3,2012.0


In [None]:
X['total_score_back_2']