# Analysis of Success in Higher Education

In [None]:
pip install plotly==4.2.1

Collecting plotly==4.2.1
[?25l  Downloading https://files.pythonhosted.org/packages/f7/05/3c32c6bc85acbd30a18fbc3ba732fed5e48e5f8fd60d2a148877970f4a61/plotly-4.2.1-py2.py3-none-any.whl (7.2MB)
[K     |████████████████████████████████| 7.2MB 4.9MB/s 
Installing collected packages: plotly
  Found existing installation: plotly 4.1.1
    Uninstalling plotly-4.1.1:
      Successfully uninstalled plotly-4.1.1
Successfully installed plotly-4.2.1


In [None]:
def configure_plotly_browser_state():
  """Emit the RequireJS setup that the Plotly figures in this notebook rely on.

  Displays an HTML snippet in the current cell's output that loads
  ``require.js`` from the notebook server's static files and maps the
  ``plotly`` module name to the Plotly CDN bundle. The plotting cells in this
  notebook call it before ``iplot``.

  Returns:
    None. The only effect is the displayed HTML.
  """
  # Import the display helpers explicitly instead of relying on the `display`
  # name that IPython injects into interactive namespaces.
  from IPython.display import HTML, display
  display(HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              plotly: 'https://cdn.plot.ly/plotly-latest.min.js?noext',
            },
          });
        </script>
        '''))

In [None]:
import numpy as np
from numpy import set_printoptions
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import chart_studio.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
from wordcloud import WordCloud
from sklearn.preprocessing import Binarizer
%matplotlib inline
from sklearn import datasets, svm, metrics,model_selection
from sklearn import linear_model

In [None]:
# University ranking tables: the raw table, a second variant (v2) and a
# pre-processed table that the plotting cells further down read from.
timesData = pd.read_csv("http://lukeholman.net/datasets/timesData.csv")
# thousands=',' makes pandas parse counts written like "2,243" as numbers.
timesData_v2 = pd.read_csv("http://lukeholman.net/datasets/timesData_v2.csv",thousands=',')
preProcessedTimesData = pd.read_csv("http://lukeholman.net/datasets/preProcessedTimesData.csv")
# These two files are decoded as windows-1252 instead of pandas' UTF-8 default.
percentage_people_below_poverty_level = pd.read_csv('http://lukeholman.net/datasets/PercentagePeopleBelowPovertyLevel.csv', encoding="windows-1252")
percent_over_25_completed_highSchool = pd.read_csv('http://lukeholman.net/datasets/PercentOver25CompletedHighSchool.csv', encoding="windows-1252")
# Preview the v2 table.
timesData_v2.head()

Unnamed: 0,year,world_rank_bin,world_rank_bin_num,university_name,country,teaching,international,research,citations,income,num_students,student_staff_ratio,international_students,female_percentage,world_rank
0,2016,601-800,10,American University of Sharjah,United Arab Emirates,12.4,95.6,10.6,13.3,33.3,5226,14.1,0.82,0.48,601-800
1,2016,251-300,5,Royal College of Surgeons in Ireland,Republic of Ireland,29.9,90.1,20.1,65.3,33.9,2473,15.6,0.63,0.55,251-300
2,2015,351-400,7,Royal College of Surgeons in Ireland,Republic of Ireland,24.7,82.6,18.9,43.5,28.4,2473,15.6,0.63,0.55,351-400
3,2016,1-50,0,Ã‰cole Polytechnique FÃ©dÃ©rale de Lausanne,Switzerland,61.3,98.6,67.5,94.6,65.4,9666,10.5,0.54,0.27,31
4,2013,1-50,0,Ã‰cole Polytechnique FÃ©dÃ©rale de Lausanne,Switzerland,62.4,98.8,57.0,95.0,49.8,9666,10.5,0.54,0.27,40


In [None]:
print('dtypes\n\n',timesData_v2.dtypes)

dtypes

 year                        int64
world_rank_bin             object
world_rank_bin_num          int64
university_name            object
country                    object
teaching                  float64
international             float64
research                  float64
citations                 float64
income                    float64
num_students                int64
student_staff_ratio       float64
international_students    float64
female_percentage         float64
world_rank                 object
dtype: object


In [None]:
y=timesData_v2.world_rank_bin_num.values

In [None]:
X=timesData_v2[['teaching','international','research','citations','income','num_students','student_staff_ratio','international_students','female_percentage']].values

In [None]:
# Hold out 20% of the rows, stratified on the rank bin so both splits keep the
# same bin proportions. The fixed seed makes the split, and every score
# computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, stratify=y, test_size=0.20, random_state=50)

In [None]:
# Columns that are not used by the modelling cells below.
unused_columns = [
    'world_rank_bin',
    'university_name',
    'country',
    'total_score',
    'female_male_ratio',
    'female_percentage',
]
timesData = timesData.drop(columns=unused_columns)
timesData.head()

Unnamed: 0,year,teaching,international,research,citations,income,num_students,student_staff_ratio,international_students,world_rank
0,2011,99.7,72.4,98.7,98.8,34.5,20152,8.9,0.25,1
1,2011,97.7,54.6,98.0,99.9,83.7,2243,6.9,0.27,2
2,2011,97.8,82.3,91.4,99.9,87.5,11074,9.0,0.33,3
3,2013,96.3,59.8,99.4,99.7,95.6,2243,6.9,0.27,1
4,2016,95.6,64.0,97.6,99.8,97.8,2243,6.9,0.27,1


Convert `world_rank` to numeric; entries that cannot be parsed as a number become ***NaN***.

In [None]:
timesData['world_rank'] = pd.to_numeric(timesData['world_rank'], errors='coerce')

Fill the ***NaN***s with 101 so that these universities fall outside the top 100 when the rank is binarized.

In [None]:
# Assign the filled column back instead of calling an in-place method on the
# column selection: under pandas copy-on-write the in-place form modifies a
# temporary object and leaves `timesData` unchanged.
timesData['world_rank'] = timesData['world_rank'].fillna(101)

`Binarizer` maps values above its threshold to 1, so we negate `world_rank`: better (numerically smaller) ranks then become the larger values.

In [None]:
# Flip the sign of every rank. Not idempotent: running this cell a second
# time flips the sign back.
timesData['world_rank'] = -timesData['world_rank']

- Prepare the object (string) columns for numeric conversion.
- In those columns, the placeholder "Missing" is replaced with 0.
- `num_students` contains "," as a thousands separator; remove it.
- `international_students` contains "%"; remove it.

In [None]:
# Columns that are still object dtype and therefore need numeric conversion.
str_cols = timesData.select_dtypes(['object']).columns
# Swap the 'Missing' placeholder for 0 in those columns.
timesData[str_cols] = timesData[str_cols].replace('Missing', 0)
# Strip the thousands separator and the percent sign so the values can be
# parsed as numbers in the next step.
# NOTE(review): the 0 inserted above is an int, not a string; pandas `.str`
# methods presumably return NaN for such entries, so in these two columns the
# 'Missing' rows may end up as NaN rather than 0 - confirm.
timesData['num_students'] = timesData['num_students'].str.replace(',', '')
timesData['international_students'] = timesData['international_students'].str.replace('%', '')

Convert object or string columns to numeric.

In [None]:
# Convert column by column (apply's default axis): one vectorised
# `to_numeric` call per column rather than one call per row. Unparseable
# entries become NaN.
timesData[str_cols] = timesData[str_cols].apply(pd.to_numeric, errors='coerce')

Convert international students percentage to decimal.

In [None]:
# Rescale only while the column is still expressed in percent (values above 1).
# The guard leaves data that is already a fraction in [0, 1] untouched and
# makes the cell safe to re-run without dividing a second time.
intl_share = timesData['international_students']
if intl_share.max() > 1:
    timesData['international_students'] = intl_share / 100

Determine number of ***NaN***s.

In [None]:
timesData.isna().sum()

year                       0
teaching                   0
international              0
research                   0
citations                  0
income                     0
num_students              59
student_staff_ratio        0
international_students    67
world_rank                 0
dtype: int64

Drop remaining ***NaN***s. 

In [None]:
# Keep only the rows that have a value in every column.
timesData = timesData.dropna()

Check dataframe, dtypes and ***NaN***s.

In [None]:
print(timesData.dtypes)
print(timesData.isna().sum())
timesData.head()

year                        int64
teaching                  float64
international             float64
research                  float64
citations                 float64
income                    float64
num_students              float64
student_staff_ratio       float64
international_students    float64
world_rank                float64
dtype: object
year                      0
teaching                  0
international             0
research                  0
citations                 0
income                    0
num_students              0
student_staff_ratio       0
international_students    0
world_rank                0
dtype: int64


Unnamed: 0,year,teaching,international,research,citations,income,num_students,student_staff_ratio,international_students,world_rank
0,2011,99.7,72.4,98.7,98.8,34.5,20152.0,8.9,0.0025,-1.0
1,2011,97.7,54.6,98.0,99.9,83.7,2243.0,6.9,0.0027,-2.0
2,2011,97.8,82.3,91.4,99.9,87.5,11074.0,9.0,0.0033,-3.0
3,2013,96.3,59.8,99.4,99.7,95.6,2243.0,6.9,0.0027,-1.0
4,2016,95.6,64.0,97.6,99.8,97.8,2243.0,6.9,0.0027,-1.0


Convert times dataframe to array.

In [None]:
# Snapshot the frame as a 2-D array. At this point X still contains every
# column, and the last column is taken (as a column vector) for y_.
times_array = timesData.values
X = times_array[:]
last_column = times_array.shape[1] - 1
y_ = times_array[:, [last_column]]

In [None]:
set_printoptions(precision=3, suppress=True)
X[0]

array([ 2011.   ,    99.7  ,    72.4  ,    98.7  ,    98.8  ,    34.5  ,
       20152.   ,     8.9  ,     0.003,    -1.   ])

In [None]:
y_[:5]

array([[-1.],
       [-2.],
       [-3.],
       [-1.],
       [-1.]])

Drop `world_rank`, not needed.

In [None]:
# Remove the rank column from the frame. Fails if run twice, because the
# column no longer exists.
timesData = timesData.drop(columns=['world_rank'])

In [None]:
# Rebuild the feature matrix now that `world_rank` has been removed.
# `y_` is deliberately NOT recomputed here: it was captured before the drop,
# and the last column of the frame is now `international_students`, which is a
# feature rather than the target.
times_array = timesData.values
X = times_array[:]

In [None]:
set_printoptions(precision=3, suppress=True)
X[0]

array([ 2011.   ,    99.7  ,    72.4  ,    98.7  ,    98.8  ,    34.5  ,
       20152.   ,     8.9  ,     0.003])

Create binary variable.

In [None]:
# The target is meant to be the negated world rank: -1 ... -100 for the top 100
# and -101 for everything that was filled in as unranked. Binarizer maps values
# strictly greater than its threshold to 1, so the cut has to sit between -100
# and -101; the default threshold of 0.0 cannot separate negative ranks.
binarizer=Binarizer(threshold=-100.5).fit(y_)
y_binary=binarizer.transform(y_)
# Flatten the (n, 1) column into the 1-D label vector the estimators expect.
y_reshaped = np.ravel(y_binary)
print(y_binary[:5],"\n\n",y_reshaped)

[[1.]
 [1.]
 [1.]
 [1.]
 [1.]] 

 [1. 1. 1. ... 1. 1. 1.]


Reshape using `ravel()`

In [None]:
y_reshaped = np.ravel(y_binary)
y_reshaped

array([1., 1., 1., ..., 1., 1., 1.])

In [None]:
X[0]

array([ 2011.   ,    99.7  ,    72.4  ,    98.7  ,    98.8  ,    34.5  ,
       20152.   ,     8.9  ,     0.003])

In [None]:
timesData.head()

Unnamed: 0,year,teaching,international,research,citations,income,num_students,student_staff_ratio,international_students
0,2011,99.7,72.4,98.7,98.8,34.5,20152.0,8.9,0.0025
1,2011,97.7,54.6,98.0,99.9,83.7,2243.0,6.9,0.0027
2,2011,97.8,82.3,91.4,99.9,87.5,11074.0,9.0,0.0033
3,2013,96.3,59.8,99.4,99.7,95.6,2243.0,6.9,0.0027
4,2016,95.6,64.0,97.6,99.8,97.8,2243.0,6.9,0.0027


In [None]:
# Stratified 5-fold splitter; passed as `cv=skf` to the cross_val_score calls
# in the cells below.
skf=model_selection.StratifiedKFold(n_splits=5)
# The return value (the number of splits) is discarded here.
skf.get_n_splits(X_train, y_train)
print(skf)

StratifiedKFold(n_splits=5, random_state=None, shuffle=False)


In [None]:
# L1-penalised (lasso) baseline. The solver is named explicitly because only
# some solvers support an L1 penalty; relying on the library default fails once
# that default is lbfgs. This matches the liblinear solver used by the
# LogisticRegression cells further down.
reglasso = linear_model.LogisticRegression(penalty='l1', solver='liblinear')
reglasso.fit(X_train, y_train)







LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# Grid over the inverse regularisation strength C. Despite the "lasso" in the
# variable names, the estimator tuned here uses an L2 penalty.
parameters = {
    'C': [1e-15, 1e-10, 1e-8, 1e-5, 1e-4, 1e-3, 1e-2, 1, 5, 10],
}
reglasso_tuned = linear_model.LogisticRegression(
    penalty='l2',
    solver='lbfgs',
    multi_class='auto',
    max_iter=10000,
)
reglasso_gridsearch = model_selection.GridSearchCV(
    estimator=reglasso_tuned,
    param_grid=parameters,
    cv=5,
)
reglasso_gridsearch.fit(X_train, y_train)
print(reglasso_gridsearch.best_score_, reglasso_gridsearch.best_params_, sep='\n')


The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.



0.3511652542372881
{'C': 1}


In [None]:
reglasso_final = linear_model.LogisticRegression(penalty='l2',C=1,multi_class='auto',solver='lbfgs',max_iter=10000)
reglasso_final.fit(X_train, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
predicted = reglasso_final.predict(X_test)
metrics.accuracy_score(y_test,predicted)

0.3389830508474576

In [None]:
reglasso_final.coef_

array([[-0.085, -0.03 ,  0.3  ,  0.023, -0.068, -0.   , -0.207,  0.002,
        -0.011],
       [-0.05 , -0.026,  0.165,  0.043, -0.036, -0.   , -0.079, -0.   ,
        -0.009],
       [-0.065, -0.006,  0.13 ,  0.037, -0.02 , -0.   , -0.077,  0.001,
        -0.008],
       [-0.017, -0.002,  0.06 ,  0.028, -0.019, -0.   , -0.004,  0.002,
        -0.005],
       [ 0.005, -0.006,  0.004,  0.022, -0.011, -0.   ,  0.036, -0.001,
        -0.003],
       [-0.003,  0.   , -0.029,  0.008,  0.014,  0.   ,  0.046, -0.001,
        -0.001],
       [-0.   ,  0.006, -0.058,  0.01 ,  0.011,  0.   ,  0.043, -0.   ,
         0.007],
       [ 0.017,  0.013, -0.087, -0.005,  0.02 ,  0.   ,  0.057, -0.001,
         0.008],
       [ 0.038,  0.013, -0.092, -0.011,  0.004,  0.   ,  0.045, -0.   ,
         0.003],
       [ 0.05 ,  0.011, -0.15 , -0.042,  0.045,  0.   ,  0.048, -0.001,
         0.005],
       [ 0.111,  0.027, -0.242, -0.114,  0.059,  0.   ,  0.093, -0.001,
         0.014]])

In [None]:
from sklearn import tree

dectree = tree.DecisionTreeClassifier()
dectree = dectree.fit(X_train, y_train)
np.mean(model_selection.cross_val_score(dectree,X_train,y_train,cv=skf))

0.5201422415630996

In [None]:
# Candidate minimum node sizes for a split. 2 is the smallest valid integer;
# a float is interpreted as a fraction of the training rows, so 1.0 would
# demand 100% of the samples in a node before it may split.
parameters = {'min_samples_split':[2,3,5,7,9]}
dectree_tuned = tree.DecisionTreeClassifier()
dectree_gridsearch = model_selection.GridSearchCV(dectree_tuned, parameters, cv=5)
dectree_gridsearch.fit(X_train, y_train)
print(dectree_gridsearch.best_score_)
print(dectree_gridsearch.best_params_)

0.520656779661017
{'min_samples_split': 7}


In [None]:
dectree_final = tree.DecisionTreeClassifier(min_samples_split=7)
dectree_final.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=7,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [None]:
predicted = dectree_final.predict(X_test)
metrics.accuracy_score(y_test,predicted)

0.4745762711864407

In [None]:
tree.export_graphviz(dectree_final,out_file='tree.dot')

In [None]:
from subprocess import check_call
check_call(['dot','-Tpng','tree.dot','-o','tree.png'])

0

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors = 1)
knn.fit(X_train,y_train)
np.mean(model_selection.cross_val_score(knn,X_train,y_train,cv=skf))

0.48829746035519417

In [None]:
# Tune the neighbourhood size over small odd values with 5-fold CV.
parameters = {
    'n_neighbors': [1, 3, 5, 7, 9],
}
knn_tuned = KNeighborsClassifier()
knn_gridsearch = model_selection.GridSearchCV(
    estimator=knn_tuned,
    param_grid=parameters,
    cv=5,
)
knn_gridsearch.fit(X_train, y_train)
print(knn_gridsearch.best_score_, knn_gridsearch.best_params_, sep='\n')

0.4883474576271186
{'n_neighbors': 1}


In [None]:
knn_final = KNeighborsClassifier(n_neighbors = 1)
knn_final.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [None]:
predicted = knn_final.predict(X_test)
metrics.accuracy_score(y_test,predicted)

0.5211864406779662

In [None]:
from sklearn.ensemble import BaggingClassifier

bagging = BaggingClassifier(KNeighborsClassifier(n_neighbors = 1),
                            max_samples=0.5, max_features=0.5)
bagging.fit(X_train,y_train)
np.mean(model_selection.cross_val_score(bagging,X_train,y_train,cv=skf))

0.47618046019461185

In [None]:
classifier = svm.SVC(gamma='auto',kernel='rbf',C=1.0)
classifier.fit(X_train, y_train)
np.mean(model_selection.cross_val_score(classifier,X_train,y_train,cv=skf))

0.17364213963065298

In [None]:
# Search grid for the RBF SVM: four values of C crossed with two of gamma.
parameters = {
    'C': [1, 10, 100, 1000],
    'gamma': [0.001, 0.0001],
}
classifier_tuned = svm.SVC(kernel="rbf")
classifier_gridsearch = model_selection.GridSearchCV(
    estimator=classifier_tuned,
    param_grid=parameters,
    cv=5,
)
classifier_gridsearch.fit(X_train, y_train)
print(classifier_gridsearch.best_score_, classifier_gridsearch.best_params_, sep='\n')

0.4973516949152542
{'C': 100, 'gamma': 0.0001}


In [None]:
# Refit with the hyper-parameters the grid search actually selected instead of
# hand-copied values, so the final model cannot drift from the search result.
classifier_final = svm.SVC(kernel='rbf', **classifier_gridsearch.best_params_)
classifier_final.fit(X_train, y_train)

SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [None]:
predicted = classifier_final.predict(X_test)
metrics.accuracy_score(y_test,predicted)

0.559322033898305

In [None]:
from sklearn.ensemble import RandomForestClassifier

randomforest=RandomForestClassifier(n_estimators=100)
randomforest.fit(X_train, y_train)
np.mean(model_selection.cross_val_score(randomforest,X_train,y_train,cv=skf))

0.5925670975156185

In [None]:
# Tune the forest size from 200 to 2000 trees in steps of 200 (5-fold CV).
parameters = {
    'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
}
randomforest_tuned = RandomForestClassifier()
randomforest_gridsearch = model_selection.GridSearchCV(
    estimator=randomforest_tuned,
    param_grid=parameters,
    cv=5,
)
randomforest_gridsearch.fit(X_train, y_train)
print(randomforest_gridsearch.best_score_, randomforest_gridsearch.best_params_, sep='\n')

In [None]:
randomforest_final=RandomForestClassifier(n_estimators=600)
randomforest_final.fit(X_train, y_train)

In [None]:
predicted = randomforest_final.predict(X_test)
metrics.accuracy_score(y_test,predicted)

### **Logistic Regression:** Recursive Feature Elimination

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [None]:
# Recursive feature elimination down to three features. The target count is
# passed by keyword: current scikit-learn releases no longer accept it
# positionally.
rfe = RFE(LogisticRegression(solver='liblinear'), n_features_to_select=3)
fit = rfe.fit(X, y_reshaped)
print(f'Number of features {fit.n_features_:d}\nSelected features {fit.support_}\nRanking of features {fit.ranking_}\n\nTop 3 features seem to be teaching, international rating, and citations')

In [None]:
# Same elimination, continued until a single feature remains. The target count
# is passed by keyword: current scikit-learn releases no longer accept it
# positionally.
rfe = RFE(LogisticRegression(solver='liblinear'), n_features_to_select=1)
fit = rfe.fit(X, y_reshaped)
print(f'Number of features {fit.n_features_:d}\nSelected features {fit.support_}\nRanking of features {fit.ranking_}\n\nTop feature: international rating')

### **Decision Tree**

In [None]:
from sklearn.tree import DecisionTreeClassifier

#random seed
RSEED = 50

# Target: the numeric rank bin. Features: nine numeric columns of the v2 table.
c=timesData_v2.world_rank_bin_num.values
f=timesData_v2[['teaching','international','research','citations','income','num_students','student_staff_ratio','international_students','female_percentage']].values

# NOTE(review): this rebinds the name `tree`, which an earlier cell imported as
# the `sklearn.tree` module; re-running those earlier cells after this one
# would no longer see the module.
tree = DecisionTreeClassifier(random_state=RSEED)
# Fit on the complete data set (no hold-out at this stage).
tree.fit(f,c)

In [None]:
print(f'Decision tree has {tree.tree_.node_count} nodes with maximum depth {tree.tree_.max_depth}.')

In [None]:
# Scored on the same rows (f, c) the tree was fitted on, so this figure is the
# training accuracy, not an estimate of generalisation.
print(f'Model Accuracy: {tree.score(f, c)}')

In [None]:
from sklearn.model_selection import train_test_split
# 30% examples in test data
train, test, train_labels, test_labels = train_test_split(f, c, 
                                                          stratify = c,
                                                          test_size = 0.3, 
                                                          random_state = RSEED)

In [None]:
# Train tree
tree.fit(train, train_labels)
print(f'Decision tree has {tree.tree_.node_count} nodes with maximum depth {tree.tree_.max_depth}.')

In [None]:
# Evaluate on the held-out rows only. Scoring on the full (f, c) set would
# include the rows the tree was just trained on and overstate the accuracy.
print(f'Model Accuracy: {tree.score(test, test_labels)}')

In [None]:
# Limit maximum depth and train
short_tree = DecisionTreeClassifier(max_depth=9, random_state=RSEED)
short_tree.fit(train, train_labels)

print(f'Model Accuracy: {short_tree.score(test, test_labels)}')

### **Random Forest:** Recursive Feature Elimination

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Create the model with 1000 trees, each limited to a depth of 19; the seed
# makes the forest reproducible.
model = RandomForestClassifier(n_estimators=1000, 
                               random_state=RSEED, 
                               max_depth=19,)
# Fit on training data
model.fit(train, train_labels)

In [None]:
# Per-tree size statistics of the fitted forest.
n_nodes = [estimator.tree_.node_count for estimator in model.estimators_]
max_depths = [estimator.tree_.max_depth for estimator in model.estimators_]

avg_nodes = int(np.mean(n_nodes))
avg_depth = int(np.mean(max_depths))
print(f'Average number of nodes {avg_nodes}')
print(f'Average maximum depth {avg_depth}')

In [None]:
print(f'Model Accuracy: {model.score(test, test_labels)}')

In [None]:
from sklearn.tree import export_graphviz
from subprocess import call, check_call
from IPython.display import Image

# Export as dot
export_graphviz(tree, 'tree.dot', rounded = True, filled = True)

# Convert to png. check_call raises if Graphviz is missing or exits with an
# error, instead of silently continuing and displaying a stale tree.png that
# an earlier cell wrote under the same file name.
check_call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=400']);

Image('tree.png')

### Univariate selection using **Chi-squared**

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2 

In [None]:
X[0]

Feature selection (we select the 3 best).

In [None]:
# Use a dedicated name for the selector: `test` already holds the held-out
# feature rows from the earlier train/test split and must not be overwritten.
selector = SelectKBest(score_func=chi2, k=3)
fit = selector.fit(X,y_reshaped)
print("Scores\n",fit.scores_,"\nThe 3 attributes with the highest scores are: teaching, research and num_students \n\nteaching: university score for teaching\nreserach: university score for research (volume, income and reputation)\nnum_students: number of students at the university")

# Reduce X to the three selected columns and preview the first rows.
features = fit.transform(X)
features[0:5,:]

### Ranking feature importance using **Extra Trees Classifier**

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Seed the randomised trees so the importance ranking, and the conclusion
# printed with it, is the same on every run.
model = ExtraTreesClassifier(n_estimators=100, random_state=RSEED)
model.fit(X,y_reshaped)

print(model.feature_importances_,"\n\nTop features seem to be citations, research and teaching")

## Model evaluation 

### train-test-split and k-fold-10 validation

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [None]:
train_test_split_accuracy = []
k_fold_accuracy = []

In [None]:
# The target is meant to be the negated world rank (-1 ... -100 for the top
# 100, -101 for unranked). Binarizer maps values strictly above its threshold
# to 1, so the cut sits between -100 and -101; the default threshold of 0.0
# cannot separate negative ranks.
binarizer=Binarizer(threshold=-100.5).fit(y_)
y_binary=binarizer.transform(y_)
y_reshaped = np.ravel(y_binary)

To make the results reproducible, we fix the seed of the pseudo-random number generator.

In [None]:
print('train-test-split')
test_size = 0.3
seed = 7

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y_reshaped, test_size=test_size, random_state=seed)

**Ridge Regression**

In [None]:
from sklearn.linear_model import RidgeClassifier

ridge_regression = RidgeClassifier().fit(X_train,y_train)
ridge_regression

In [None]:
result = ridge_regression.score(X_test,y_test)
train_test_split_accuracy.append(result)
print(f'Accuracy {result*100:5.3f} %\nk-fold-10 validation')

**Logistic Regression**

In [None]:
logistic_regression = LogisticRegression(solver='liblinear')
logistic_regression.fit(X_train,y_train)

In [None]:
result = logistic_regression.score(X_test,y_test)
train_test_split_accuracy.append(result)
print(f'Accuracy {result*100:5.3f} %\nk-fold-10 validation')

**Random Forest**

In [None]:
random_forest = RandomForestClassifier(n_estimators=100,max_depth=2,random_state=0)
random_forest.fit(X_train,y_train)  

In [None]:
result = random_forest.score(X_test,y_test)
train_test_split_accuracy.append(result)
print(f'Accuracy {result*100:5.3f} %\nk-fold-10 validation')

### KFold

In [None]:
splits = 10
# `random_state` only has an effect when shuffling is enabled; without
# shuffle=True recent scikit-learn versions reject the combination outright.
kfold = KFold(n_splits=splits, shuffle=True, random_state=seed)

Obtain the performance measure - accuracy

In [None]:
# Hold-out accuracies rendered as strings with three decimals.
train_test_accuracy = [f'{score:.3f}' for score in train_test_split_accuracy]
print(train_test_accuracy)

In [None]:
results = cross_val_score(ridge_regression, X, y_reshaped, cv=kfold)
k_fold_accuracy.append(results.mean())

print(f'Ridge Regression, k-fold {splits:d} - Accuracy {results.mean()*100:5.3f}% ({results.std()*100:5.3f}%)')

In [None]:
results = cross_val_score(logistic_regression, X, y_reshaped, cv=kfold)
k_fold_accuracy.append(results.mean())

print(f'Logistic Regression, k-fold {splits:d} - Accuracy {results.mean()*100:5.3f}% ({results.std()*100:5.3f}%)')

In [None]:
results = cross_val_score(random_forest, X, y_reshaped, cv=kfold)
k_fold_accuracy.append(results.mean())

print(f'Random Forests, k-fold {splits:d} - Accuracy {results.mean()*100:5.3f}% ({results.std()*100:5.3f}%)')

In [None]:
# Mean k-fold accuracies rendered as strings with three decimals.
kfold_accuracy = [f'{score:.3f}' for score in k_fold_accuracy]
kfold_accuracy

Accuracy decreases as the number of universities to be classified increases.

### Metrics evaluation using StratifiedKFold

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
scoring = ['accuracy', 'neg_log_loss', 'roc_auc']
k_fold_accuracy = []

In [None]:
# The target is meant to be the negated world rank (-1 ... -100 for the top
# 100, -101 for unranked). Binarizer maps values strictly above its threshold
# to 1, so the cut sits between -100 and -101; the default threshold of 0.0
# cannot separate negative ranks.
binarizer=Binarizer(threshold=-100.5).fit(y_)
y_binary=binarizer.transform(y_)
y_reshaped = np.ravel(y_binary)

- StratifiedKFold.
- Obtain the performance measure - accuracy.

In [None]:
print("Ridge Regression\n")
splits = 10
# Built once for all metrics. shuffle=True is required for `random_state` to
# take effect; with an integer seed every metric sees the same folds.
skfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=7)
for score in scoring:
    print(score, ', top')
    # Probability-based scorers need predict_proba; skip them for estimators
    # that do not provide it rather than reporting a wrong number.
    if score == 'neg_log_loss' and not hasattr(ridge_regression, 'predict_proba'):
        print(score, ': n/a (estimator has no predict_proba)')
        continue
    # Pass the metric explicitly; without `scoring` every label would be
    # printed next to the estimator's default accuracy.
    results = cross_val_score(ridge_regression, X, y_reshaped, scoring=score, cv=skfold)
    print(score, f': {results.mean():.3f}')

# Confusion matrix on the hold-out split, cross-checked against .score().
Y_predicted = ridge_regression.predict(X_test)
c_matrix=confusion_matrix(y_test, Y_predicted)
print('\nConfusion Matrix\n\n',c_matrix,f'\n\nAccuracy {ridge_regression.score(X_test, y_test)*100:.3f}\nAccuracy check with conf. matrix {(c_matrix[0,0]+c_matrix[1,1])/c_matrix.sum()*100:.3f}')

In [None]:
print("Logistic Regression\n")
splits = 10
# Built once for all metrics. shuffle=True is required for `random_state` to
# take effect; with an integer seed every metric sees the same folds.
skfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=7)
for score in scoring:
    print(score, ', top')
    results = cross_val_score(logistic_regression, X, y_reshaped, scoring=score, cv=skfold)
    print(score, f': {results.mean():.3f}')

# Confusion matrix on the hold-out split, cross-checked against .score().
Y_predicted = logistic_regression.predict(X_test)
c_matrix=confusion_matrix(y_test, Y_predicted)
print('\nConfusion Matrix\n\n',c_matrix,f'\n\nAccuracy {logistic_regression.score(X_test, y_test)*100:.3f}\nAccuracy check with conf. matrix {(c_matrix[0,0]+c_matrix[1,1])/c_matrix.sum()*100:.3f}')

In [None]:
print("Random Forest\n")
splits = 10
# Built once for all metrics. shuffle=True is required for `random_state` to
# take effect; with an integer seed every metric sees the same folds.
skfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=7)
for score in scoring:
    print(score, ', top')
    results = cross_val_score(random_forest, X, y_reshaped, scoring=score, cv=skfold)
    print(score, f': {results.mean():.3f}')

# Confusion matrix on the hold-out split, cross-checked against .score().
Y_predicted = random_forest.predict(X_test)
c_matrix=confusion_matrix(y_test, Y_predicted)
print('\nConfusion Matrix\n\n',c_matrix,f'\n\nAccuracy {random_forest.score(X_test, y_test)*100:.3f}\nAccuracy check with conf. matrix {(c_matrix[0,0]+c_matrix[1,1])/c_matrix.sum()*100:.3f}')

In [None]:
# Country of every 2016 entry, joined into one space-separated text for the
# word cloud.
x2016 = preProcessedTimesData.loc[preProcessedTimesData.year == 2016, 'country']
plt.subplots(figsize=(8,8))
cloud_text = " ".join(x2016)
wordcloud = WordCloud(background_color='white', width=512, height=384).generate(cloud_text)
plt.imshow(wordcloud)
plt.axis('off')
# Save before show() so the written file contains the rendered figure.
plt.savefig('graph.png')

plt.show()

In [None]:
configure_plotly_browser_state()

# 2015 rows; `dataframe` and `data2015` are reused by the plotting cells below.
dataframe = preProcessedTimesData[preProcessedTimesData.year == 2015]
data2015 = dataframe.loc[:, ["research", "international", "total_score"]].assign(
    index=np.arange(1, len(dataframe) + 1)
)

# World rank against research and citations as red 3-D markers.
trace1 = go.Scatter3d(
    x=dataframe.world_rank,
    y=dataframe.research,
    z=dataframe.citations,
    mode='markers',
    marker={'size': 10, 'color': 'rgb(255,0,0)'},
)

data = [trace1]
layout = go.Layout(
    scene={
        'xaxis_title': 'World Rank',
        'yaxis_title': 'Research',
        'zaxis_title': 'Citations',
    },
    margin={'l': 0, 'r': 0, 'b': 0, 't': 0},
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [None]:
configure_plotly_browser_state()
import plotly.figure_factory as ff

fig = ff.create_scatterplotmatrix(data2015, diag='box', index='index',colormap='Portland',
                                  colormap_type='cat',
                                  height=700, width=700)
iplot(fig)

In [None]:
configure_plotly_browser_state()

# Four line traces of the 2015 rows (`dataframe` comes from an earlier cell),
# each plotted against world rank on its own pair of axes.
trace1 = go.Scatter(
    x=dataframe.world_rank,
    y=dataframe.research,
    name = "research"
)
trace2 = go.Scatter(
    x=dataframe.world_rank,
    y=dataframe.citations,
    xaxis='x2',
    yaxis='y2',
    name = "citations"
)
trace3 = go.Scatter(
    x=dataframe.world_rank,
    y=dataframe.income,
    xaxis='x3',
    yaxis='y3',
    name = "income"
)
trace4 = go.Scatter(
    x=dataframe.world_rank,
    y=dataframe.total_score,
    xaxis='x4',
    yaxis='y4',
    name = "total_score"
)
data = [trace1, trace2, trace3, trace4]
# Axis domains split the figure into a 2x2 grid: each axis spans either
# [0, 0.45] or [0.55, 1] of the figure, leaving a gap between the panels.
layout = go.Layout(
    xaxis=dict(
        domain=[0, 0.45]
    ),
    yaxis=dict(
        domain=[0, 0.45]
    ),
    xaxis2=dict(
        domain=[0.55, 1]
    ),
    xaxis3=dict(
        domain=[0, 0.45],
        anchor='y3'
    ),
    xaxis4=dict(
        domain=[0.55, 1],
        anchor='y4'
    ),
    yaxis2=dict(
        domain=[0, 0.45],
        anchor='x2'
    ),
    yaxis3=dict(
        domain=[0.55, 1]
    ),
    yaxis4=dict(
        domain=[0.55, 1],
        anchor='x4'
    ),
    title = 'Research, citation, income and total score VS World Rank of Universities'
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [None]:
configure_plotly_browser_state()

# Main trace: teaching score against world rank on the default axes
# (`dataframe` holds the 2015 rows selected in an earlier cell).
trace1 = go.Scatter(
    x=dataframe.world_rank,
    y=dataframe.teaching,
    name = "teaching",
    marker = dict(color = 'rgba(16, 112, 2, 0.8)'),
)

# Second trace: income against world rank, drawn on the x2/y2 axes.
trace2 = go.Scatter(
    x=dataframe.world_rank,
    y=dataframe.income,
    xaxis='x2',
    yaxis='y2',
    name = "income",
    marker = dict(color = 'rgba(160, 112, 20, 0.8)'),
)
data = [trace1, trace2]
# x2/y2 are confined to the [0.6, 0.95] band of the figure in both directions,
# so the income trace appears as an inset over the main plot.
layout = go.Layout(
    xaxis2=dict(
        domain=[0.6, 0.95],
        anchor='y2',        
    ),
    yaxis2=dict(
        domain=[0.6, 0.95],
        anchor='x2',
    ),
    title = 'Income and Teaching vs World Rank of Universities'

)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [None]:
configure_plotly_browser_state()

x2015 = preProcessedTimesData[preProcessedTimesData.year == 2015]

trace0 = go.Box(
    y=x2015.total_score,
    name = 'total score of universities in 2015',
    marker = dict(
        color = 'rgb(12, 12, 140)',
    )
)
trace1 = go.Box(
    y=x2015.research,
    name = 'research of universities in 2015',
    marker = dict(
        color = 'rgb(12, 128, 128)',
    )
)
data = [trace0, trace1]
iplot(data)

In [None]:
configure_plotly_browser_state()
# NOTE(review): only the first 10 rows are selected here, while the chart
# title below says "Top 100 Universities" - confirm which one is intended.
df = preProcessedTimesData.iloc[:10,:]
import plotly.graph_objs as go

# Citations as a plain line; hovering shows the university name.
trace1 = go.Scatter(
                    x = df.world_rank,
                    y = df.citations,
                    mode = "lines",
                    name = "citations",
                    marker = dict(color = 'rgba(16, 112, 2, 0.8)'),
                    text= df.university_name)

# Teaching as a line with markers; hovering shows the university name.
trace2 = go.Scatter(
                    x = df.world_rank,
                    y = df.teaching,
                    mode = "lines+markers",
                    name = "teaching",
                    marker = dict(color = 'rgba(80, 26, 80, 0.8)'),
                    text= df.university_name)
data = [trace1, trace2]
layout = dict(title = 'Citation and Teaching vs World Rank of Top 100 Universities',
              xaxis= dict(title= 'World Rank',ticklen= 5,zeroline= False)
             )
# The figure is passed to iplot as a plain dict rather than a go.Figure.
fig = dict(data = data, layout = layout)
iplot(fig)

In [None]:
configure_plotly_browser_state()

# First seven rows of the 2016 rankings.
df2016 = preProcessedTimesData[preProcessedTimesData.year == 2016].iloc[:7,:]

# num_students is text that uses ',' as a thousands separator (the raw data is
# loaded elsewhere in this notebook with thousands=','), so "2,243" means 2243
# students. Strip the separator instead of turning it into a decimal point;
# the old conversion read "2,243" as 2.243 but a comma-free "879" as 879.0.
# NOTE(review): assumes every num_students entry is a string - confirm there
# are no missing values in this slice.
pie1_list = [float(each.replace(',', '')) for each in df2016.num_students]
labels = df2016.university_name

# Donut chart (hole = 0.3) confined to the left half of the figure.
fig = {
  "data": [
    {
      "values": pie1_list,
      "labels": labels,
      "domain": {"x": [0, .5]},
      "name": "Number Of Students Rates",
      "hoverinfo":"label+percent+name",
      "hole": .3,
      "type": "pie"
    },],
  "layout": {
        "title":"Universities Number of Students rates",
        "annotations": [
            { "font": { "size": 20},
              "showarrow": False,
              "text": "Number of Students",
                "x": 0.20,
                "y": 1
            },
        ]
    }
}
iplot(fig)

In [None]:
# Preview the first five rows of the poverty-level table.
percentage_people_below_poverty_level.head(n=5)

In [None]:
# Clean poverty_rate: '-' placeholders become 0.0 and the column is cast to
# float. The result is assigned back to the column explicitly; the previous
# `df.col.replace(..., inplace=True)` mutated a Series obtained by attribute
# access, which does not update the frame under pandas Copy-on-Write.
# NOTE(review): counting '-' as 0.0 pulls the per-area averages down; consider
# treating these entries as missing instead.
percentage_people_below_poverty_level['poverty_rate'] = (
    percentage_people_below_poverty_level['poverty_rate']
    .replace(['-'], 0.0)
    .astype(float)
)

# Mean poverty rate per 'Geographic Area'. sort=False keeps the areas in
# first-appearance order, matching what iterating over .unique() produced.
area_poverty_mean = (
    percentage_people_below_poverty_level
    .groupby('Geographic Area', sort=False)['poverty_rate']
    .mean()
)
area_list = area_poverty_mean.index.tolist()
area_poverty_ratio = area_poverty_mean.tolist()

# One row per area, ordered from the highest to the lowest average rate.
data = pd.DataFrame({'area_list': area_list,'area_poverty_ratio':area_poverty_ratio})
new_index = (data['area_poverty_ratio'].sort_values(ascending=False)).index.values
sorted_data = data.reindex(new_index)

In [None]:
# Clean percent_completed_hs: '-' placeholders become 0.0 and the column is
# cast to float. Assigned back explicitly because an in-place replace on an
# attribute-accessed Series does not update the frame under Copy-on-Write.
# NOTE(review): counting '-' as 0.0 pulls the per-area averages down.
percent_over_25_completed_highSchool['percent_completed_hs'] = (
    percent_over_25_completed_highSchool['percent_completed_hs']
    .replace(['-'], 0.0)
    .astype(float)
)

# Mean completion percentage per 'Geographic Area', in first-appearance order.
area_highschool_mean = (
    percent_over_25_completed_highSchool
    .groupby('Geographic Area', sort=False)['percent_completed_hs']
    .mean()
)
area_list = area_highschool_mean.index.tolist()
area_highschool = area_highschool_mean.tolist()

data = pd.DataFrame({'area_list': area_list,'area_highschool_ratio':area_highschool})
new_index = (data['area_highschool_ratio'].sort_values(ascending=True)).index.values
sorted_data2 = data.reindex(new_index)

# Rescale both measures by their maximum so they share one y-axis.
# This overwrites the columns of sorted_data / sorted_data2, so any later cell
# reading them sees the rescaled values.
sorted_data['area_poverty_ratio'] = sorted_data['area_poverty_ratio'] / sorted_data['area_poverty_ratio'].max()
sorted_data2['area_highschool_ratio'] = sorted_data2['area_highschool_ratio'] / sorted_data2['area_highschool_ratio'].max()

# Attach the high-school ratio by area name rather than by positional index,
# so the result stays correct even if the two source tables list the areas in
# a different order. The index of sorted_data is preserved.
highschool_by_area = sorted_data2.set_index('area_list')['area_highschool_ratio']
data = sorted_data.join(highschool_by_area, on='area_list')
data = data.sort_values('area_poverty_ratio')

# Both series on one axes: poverty in lime, high-school completion in red.
f,ax1 = plt.subplots(figsize =(10,5))
sns.pointplot(x='area_list',y='area_poverty_ratio',data=data,color='lime',alpha=0.8,ax=ax1)
sns.pointplot(x='area_list',y='area_highschool_ratio',data=data,color='red',alpha=0.8,ax=ax1)
# Hand-placed text labels acting as a legend (x is the category position).
ax1.text(40,0.6,'high school graduate ratio',color='red',fontsize = 17,style = 'italic')
ax1.text(40,0.55,'poverty ratio',color='lime',fontsize = 18,style = 'italic')
ax1.set_xlabel('States',fontsize = 15,color='blue')
ax1.set_ylabel('Values',fontsize = 15,color='blue')
ax1.set_title('High School Graduate  VS  Poverty Rate',fontsize = 20,color='blue')
ax1.grid()

Visualization of the high school graduation rate vs. the poverty rate of each state, using a different style of seaborn plot:
joint kernel density

- pearsonr: a value of 1 indicates a perfect positive correlation, and a value of -1 indicates a perfect negative correlation.
- If it is zero, there is no linear correlation between the variables.
- Show the joint distribution using kernel density estimation.

In [None]:
# Joint KDE of the two per-state ratios, saved to graph.png before display.
# x/y are passed by keyword and the figure size uses `height`: seaborn renamed
# `size` to `height` (the old name is deprecated) and newer releases no longer
# accept x and y positionally.
g = sns.jointplot(x=data.area_poverty_ratio, y=data.area_highschool_ratio, kind="kde", height=7)
plt.savefig('graph.png')
plt.show()

Possible values of the `kind` parameter: { "scatter" | "reg" | "resid" | "kde" | "hex" }
- Different usage of the parameters, but the same type of plot (a joint plot) as the previous one

In [None]:
# Joint plot of the two ratios using column names plus `data`.
# `height` replaces the deprecated `size`, and x/y are passed by keyword
# because newer seaborn releases reject them as positional arguments.
g = sns.jointplot(x="area_poverty_ratio", y="area_highschool_ratio", data=data, height=5, ratio=3, color="r")

In [None]:
# Preview the first five rows of the combined poverty / high-school table.
data.head(n=5)

In [None]:
# Scatter of high-school ratio against poverty ratio with a fitted regression line.
sns.lmplot(
    data=data,
    x="area_poverty_ratio",
    y="area_highschool_ratio",
)
plt.show()

In [None]:
# Shaded bivariate KDE of poverty ratio (first series) against high-school
# ratio (second series).
# NOTE(review): two positional series and `shade=` are the older seaborn
# calling convention; newer releases appear to expect x=/y=/fill= instead -
# confirm against the installed seaborn version before changing.
sns.kdeplot(
    data['area_poverty_ratio'],
    data['area_highschool_ratio'],
    shade=True,
    cut=3,
)
plt.show()

In [None]:
# Bar chart of area_poverty_ratio per area, in the row order of sorted_data.
plt.figure(figsize=(10, 5))
bar_axes = sns.barplot(x='area_list', y='area_poverty_ratio', data=sorted_data)
# Tilt the area labels so they do not overlap.
plt.xticks(rotation=45)
bar_axes.set_xlabel('States')
bar_axes.set_ylabel('Poverty Rate')
bar_axes.set_title('Poverty Rate Given States')

In [None]:
# Violin plot of the columns of `data`, with the individual observations drawn
# as points inside each violin and a two-colour cubehelix palette.
pal = sns.cubehelix_palette(n_colors=2, rot=-.5, dark=.3)
sns.violinplot(
    data=data,
    inner="points",
    palette=pal,
)
plt.show()

In [None]:
# Annotated correlation heatmap of the numeric columns of `data`.
# The frame also holds the string column 'area_list'; select the numeric
# columns explicitly because DataFrame.corr() raises on non-numeric columns in
# pandas >= 2.0 (older versions dropped them silently).
f,ax = plt.subplots(figsize=(5, 5))
numeric_data = data.select_dtypes(include='number')
sns.heatmap(numeric_data.corr(), annot=True, linewidths=0.5,linecolor="red", fmt= '.1f',ax=ax)
plt.show()

In [None]:
# Pairwise relationship grid for the columns of `data`.
sns.pairplot(data=data)
plt.show()

In [None]:
# First three rows of the rankings whose year is 2014, displayed as a table.
is_2014 = preProcessedTimesData['year'] == 2014
df2014 = preProcessedTimesData.loc[is_2014].iloc[:3]
df2014

In [None]:
# First three rows of the 2014 rankings.
df2014 = preProcessedTimesData[preProcessedTimesData.year == 2014].iloc[:3,:]

configure_plotly_browser_state()

# One bar series per measure; hover text is the university's country.
trace1 = go.Bar(
                x = df2014.university_name,
                y = df2014.citations,
                name = "citations",
                marker = dict(color = 'rgba(255, 174, 255, 0.5)',
                             line=dict(color='rgb(0,0,0)',width=1.5)),
                text = df2014.country)

trace2 = go.Bar(
                x = df2014.university_name,
                y = df2014.teaching,
                name = "teaching",
                marker = dict(color = 'rgba(255, 255, 128, 0.5)',
                              line=dict(color='rgb(0,0,0)',width=1.5)),
                text = df2014.country)
data = [trace1, trace2]
# barmode="group" places the two series side by side for each university.
# A title and x-axis label are set so the figure can be read on its own.
layout = go.Layout(
    barmode = "group",
    title = 'Citations and Teaching of the First 3 Universities Listed for 2014',
    xaxis = dict(title = 'University'),
)
fig = go.Figure(data = data, layout = layout)
iplot(fig)

In [None]:
configure_plotly_browser_state()

# First twenty rows of the 2016 rankings.
df2016 = preProcessedTimesData[preProcessedTimesData.year == 2016].iloc[:20,:]

# Marker size = number of students in thousands. num_students is text that
# uses ',' as a thousands separator, so parse the full count and divide by
# 1000. The old replace(',', '.') gave the same number for "2,243" but left a
# comma-free value such as "879" at 879.0, i.e. 1000 times too large.
# NOTE(review): assumes every num_students entry is a string - confirm.
num_students_size  = [float(each.replace(',', '')) / 1000 for each in df2016.num_students]
# Marker colour = the `international` column, shown on a colour scale.
international_color = [float(each) for each in df2016.international]
data = [
    {
        'y': df2016.teaching,
        'x': df2016.world_rank,
        'mode': 'markers',
        'marker': {
            'color': international_color,
            'size': num_students_size,
            'showscale': True
        },
        "text" :  df2016.university_name    
    }
]
# Title and axis labels so the figure can be read on its own.
layout = {
    'title': 'Teaching vs World Rank, First 20 Universities Listed for 2016',
    'xaxis': {'title': 'World Rank'},
    'yaxis': {'title': 'Teaching'},
}
iplot({'data': data, 'layout': layout})

In [None]:
# Number of leading rows taken from each year. The chart title is built from
# this value so it always matches the slices (it used to say "top 100" while
# only 10 rows per year were plotted).
# NOTE(review): "top" presumes each year's rows are ordered by rank - confirm.
top_n = 10
df2014 = preProcessedTimesData[preProcessedTimesData.year == 2014].iloc[:top_n,:]
df2015 = preProcessedTimesData[preProcessedTimesData.year == 2015].iloc[:top_n,:]
df2016 = preProcessedTimesData[preProcessedTimesData.year == 2016].iloc[:top_n,:]

configure_plotly_browser_state()


def _citation_markers(frame, year_label, rgba):
    """Return a marker-only scatter trace of citations against world rank.

    Parameters:
        frame: DataFrame providing world_rank, citations and university_name.
        year_label: legend name of the trace.
        rgba: marker colour as an 'rgba(...)' string.
    """
    return go.Scatter(
        x = frame.world_rank,
        y = frame.citations,
        mode = "markers",
        name = year_label,
        marker = dict(color = rgba),
        text = frame.university_name)


trace1 = _citation_markers(df2014, "2014", 'rgba(255, 128, 255, 0.8)')
trace2 = _citation_markers(df2015, "2015", 'rgba(255, 128, 2, 0.8)')
trace3 = _citation_markers(df2016, "2016", 'rgba(0, 255, 200, 0.8)')
data = [trace1, trace2, trace3]
layout = dict(title = 'Citation vs world rank of top {} universities with 2014, 2015 and 2016 years'.format(top_n),
              xaxis= dict(title= 'World Rank',ticklen= 5,zeroline= False),
              yaxis= dict(title= 'Citation',ticklen= 5,zeroline= False)
             )
fig = dict(data = data, layout = layout)
iplot(fig)