# Neighbourhood Prediction

**You should almost certainly run the [Script](08-Neighbourhood%20Prediction.py) instead, since I cannot guarantee that the Jupyter server will not time out after a period of seeming inactivity and cause potential data loss.**

However, this shows in a slightly more accessible form the same content as appears in the script so you are welcome to use this for exploratory purposes provided that you understand the likely impact of attempting to run the full GridSearch that is at the heart of this analysis.

In [None]:
# Needed on a Mac
import matplotlib as mpl
mpl.use('TkAgg')
%matplotlib inline
import matplotlib.pyplot as plt 

In [None]:
# For reproducibility: seed both the standard-library and the NumPy random
# number generators with the same fixed value.
import random
import numpy as np
r_state = 42  # Also passed as `random_state` to the scikit-learn calls further down
random.seed(r_state) 
np.random.seed(r_state)

In [None]:
import os
import re
import pandas as pd
import seaborn as sns

import sklearn
print('Your scikit-learn version is {}.'.format(sklearn.__version__))
print('Please check it is at least 0.18.0.')

from sklearn.preprocessing import scale
from sklearn import linear_model
from sklearn import tree
from sklearn import preprocessing
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics  
from sklearn import ensemble

from sklearn.externals.six import StringIO
#from sklearn.model_selection import GridSearchCV
#from sklearn.feature_selection import SelectKBest 
#from sklearn.feature_selection import f_regression

from timeit import default_timer as timer
import datetime

In [None]:
# Directory (relative to the working directory) that the data files are read from and written to
analytical = os.path.join('data','analytical')
# Directory under the current user's home folder; referenced further down when the fit log is opened for appending
output     = os.path.join(os.path.expanduser('~'),'Documents','Dropbox','ESRC Gentrification','data','analytical')

def load_status_scores(dtype):
    """Load the SES score table for one data transformation.

    Reads ``<dtype>-Scores.csv.gz`` from the ``analytical`` directory (first
    column used as the index), discards the two rank columns and gives the
    remaining score columns human-readable names.

    Parameters
    ----------
    dtype : str
        File-name prefix of the scores file (e.g. ``'Untransformed'``).

    Returns
    -------
    pandas.DataFrame
        The score table with renamed columns.
    """
    readable_names = {
        'SES_01': 'SES 2001',
        'SES_11': 'SES 2011',
        'SES_ASC': 'SES Ascent 2001-2011',
        'SES_PR_01': 'SES 2001 Percentile',  # 99 = High-status
        'SES_PR_11': 'SES 2011 Percentile',  # 99 = High-status
        'SES_PR_ASC': 'SES Percentile Ascent 2001-2011',
    }

    scores_path = os.path.join(analytical, dtype+'-Scores.csv.gz')
    status = pd.read_csv(scores_path, index_col=0)

    # The rank columns are not carried forward
    status = status.drop(['RANK_01','RANK_11'], axis=1)
    return status.rename(columns=readable_names)

def load_predictors(dtype):
    """Unfinished stub: no predictor-loading logic has been written yet.

    NOTE(review): ``status`` is never assigned inside this function and
    ``dtype`` is unused, so a call raises ``NameError`` unless a module-level
    ``status`` happens to exist -- TODO implement or remove.
    """
    
    return status

def classifier_report(clf, y_true, y_hat):
    """Build a plain-text summary of regression fit quality.

    Parameters
    ----------
    clf
        Not used by the computation; accepted so that call sites can pass
        the estimator that produced ``y_hat``.
    y_true
        Observed target values.
    y_hat
        Predicted target values.

    Returns
    -------
    str
        Four lines (R2, MSE, MAE, explained variance), each formatted to
        five decimal places, followed by one blank line.
    """
    # (line template, scoring function) pairs, in output order. Only
    # regression metrics are reported.
    scorers = (
        ("R2:        {0:8.5f}", metrics.r2_score),                  # Coefficient of determination
        ("MSE:       {0:8.5f}", metrics.mean_squared_error),        # Mean squared error regression loss
        ("MAE:       {0:8.5f}", metrics.mean_absolute_error),       # Mean absolute error regression loss
        ("Expl. Var: {0:8.5f}", metrics.explained_variance_score),  # Explained variance regression score
    )

    lines = [template.format(score(y_true, y_hat)) for template, score in scorers]

    # Every metric line ends with a newline, then one extra newline closes the report
    return "\n".join(lines) + "\n\n"

## Information About Variable Utility (Optional)

The code below evaluates the significance of each variable using the F-regression function in Scikit-Learn, and then sorts the results in descending order of score.  The results are merged with data from the variable database.

We don't actually use the output of this next step to perform feature selection as the Random Forest will take care of that for us. This is simply a way of understanding the relative utility of different variables to linear-type models (of which the RF is _not_ one).

In [None]:
to_use = 'Untransformed'

# Renamed SES score table (2001, 2011 and 2001-2011 ascent columns)
SES = load_status_scores(to_use)

# Predictor tables for the two census years; the first CSV column is the index
path_2001 = os.path.join(analytical,to_use+'-2001-Data-Transformed_and_Scaled.csv.gz')
path_2011 = os.path.join(analytical,to_use+'-2011-Data-Transformed_and_Scaled.csv.gz')
d01_trs2 = pd.read_csv(path_2001, index_col=0)
d11_trs2 = pd.read_csv(path_2011, index_col=0)

# Variable metadata used later in the process, minus the 'Description' column
vardb = pd.read_csv(os.path.join('data','variables.csv'), index_col=False)
vardb = vardb.drop('Description', axis=1)

In [None]:
# Report any column of the 2001 predictor table that has no matching
# 'Predictor' entry in the variable database.
s1 = set(vardb.Predictor.values)
s2 = set(d01_trs2.columns.values)
undocumented = s2 - s1
if undocumented:
    print(undocumented)

In [None]:
# Score every 2001 predictor against the 2001-2011 SES ascent with a
# univariate F-test. k='all' means no variable is filtered out; we only
# want the scores.
kb = feature_selection.SelectKBest(feature_selection.f_regression, k='all')
kb.fit(d01_trs2, SES.loc[:,'SES Ascent 2001-2011'])

# Check this!!!
print("Max f-test value: " + str(np.max(kb.scores_)))

# Normalise by maximum value (http://scikit-learn.org/stable/auto_examples/feature_selection/plot_f_test_vs_mi.html).
# The division builds a new array so that the raw scores stored on the
# fitted selector (kb.scores_) are left untouched.
f_test = kb.scores_ / np.max(kb.scores_)

In [None]:
#  Format results and write to file: one row per predictor with its
#  normalised F-test score.
results = pd.DataFrame(data=f_test, index=d01_trs2.columns)
results.reset_index(inplace=True)
results.columns = ['Predictor','Score']
results.to_csv(os.path.join(analytical,to_use+'-Variable Results.csv'), index=False)

# Formatted results: attach the scores to the variable metadata and order
# them from most to least useful. The sorted frame is assigned back so that
# the file written below is actually in score order.
fresults = vardb.loc[:,['Predictor','Title','Category','Group']].merge(results, on='Predictor', how='left')
fresults = fresults.sort_values(by='Score', ascending=False)
fresults.to_csv(os.path.join(analytical,to_use+'-Variable Importance.csv'), index=False)

# Results are...
fresults.head(5)

## Developing the Model

The code below is concerned with building the best predictive model for the period of 2001-2011.

In [None]:
# Can override to_use here
to_use = 'Untransformed'

SES = load_status_scores(to_use)  # SES scores in 2011

#  Read the transformed data
d01_trs2 = pd.read_csv(os.path.join(analytical,to_use+'-2001-Data-Transformed_and_Scaled.csv.gz'), index_col=0)
d11_trs2 = pd.read_csv(os.path.join(analytical,to_use+'-2011-Data-Transformed_and_Scaled.csv.gz'), index_col=0)

# Data about variables used later in process
vardb = pd.read_csv(os.path.join('data','variables.csv'), index_col=False)
vardb.drop('Description', axis=1, inplace=True)

To evaluate the models reliably, a portion of the dataset must be held out so that each model can be evaluated on data it was not trained on.  The code below splits the data into training and test sets using a test size of 20%.

In [None]:
# Full 2001 predictor table, kept under its own name for whole-dataset
# predictions later on.
X_full = d01_trs2

# Hold out 20% of the rows; the fixed random_state makes the split repeatable.
# NOTE(review): predictors and target are paired by row position here, which
# assumes both files list the areas in the same order -- confirm.
split = model_selection.train_test_split(
    X_full, SES['SES Ascent 2001-2011'], test_size=0.2, random_state=r_state)
X_train, X_test, y_train, y_test = split

#### Start Logging

In [None]:
# Start a fresh fit log ('w' truncates any previous file). It is created in
# the `output` directory because that is where the log is re-opened in append
# mode after the grid search; both halves must end up in the same file.
log = open(os.path.join(output,to_use+'-Fit.txt'),'w')
print("Data Transform: " + to_use, file=log)
print("", file=log)

### Simple Linear Regression

To begin with, modelling was attempted using a 1R (i.e. one-rule) approach.  The code below uses Stochastic Gradient Descent to build a simple linear regression estimator for each variable separately.

In [None]:
#  Evaluate each predictor using simple linear regression
preds_ls = d01_trs2.columns.tolist()  #  List of predictors
results_dict = {}  #  predictor -> [R2, MSE, MAE, explained variance]

#  Loop over each predictor and evaluate its performance on the held-out split
for p in preds_ls:
    #  Stochastic Gradient Descent estimator trained on this one column only
    clf = linear_model.SGDRegressor(loss='squared_loss', penalty=None, random_state=r_state, max_iter=1000, tol=1e-3)
    clf.fit(X_train[[p]],y_train)
    y_pred = clf.predict(X_test[[p]])

    results_dict[p] = [
        metrics.r2_score(y_test, y_pred, multioutput='variance_weighted'),
        metrics.mean_squared_error(y_test, y_pred),        #  Mean squared error regression loss
        metrics.mean_absolute_error(y_test, y_pred),       #  Mean absolute error regression loss
        metrics.explained_variance_score(y_test, y_pred),  #  Explained variance regression score
    ]

#  One row per predictor, best R2 first (column 0 holds the R2 score)
results = (
    pd.DataFrame.from_dict(results_dict, orient='index')
    .sort_values(by=0, ascending=False)
    .reset_index()
)
results.columns = ['Predictor','R2-Score','MSE','MAE','Explained Variance']

#  Attach each predictor's category and rank by R2
df = vardb.loc[:,['Predictor','Category']].merge(results, on='Predictor', how='left' )
df = df.sort_values(by=['R2-Score'], ascending=False)
df.head(5)

In [None]:
# And for logging purposes
p   = df['Predictor'].iloc[0]
print(p)
clf = linear_model.SGDRegressor(loss='squared_loss', penalty=None, random_state=r_state, max_iter=1000, tol=1e-3)
clf.fit(X_train[[p]],y_train)
y_pred = clf.predict(X_test[[p]])

print("Singular Regression results:")
print(classifier_report(clf, y_test, y_pred), file=log)
print(classifier_report(clf, y_test, y_pred)) # clf, y_test, y_hat
print("", file=log)

**Remember that these results come from a single train/test split made with a fixed random seed and not from the full data set, so changes to the seed/split will certainly change the results!**

For the Untransformed data set we get the following results:

| Predictor | Category | R2-Score | MSE | MAE | Explained Variance |
| --------- | -------- | -------- | ---- |---- | ------------- |
| House Prices | Scoring Metric | 0.54164 | 0.28576 | 0.33353 | 0.54405 |

For the Box-Cox transformed data set we get the following:

| Predictor | Category | R2-Score | MSE | MAE | Explained Variance |
| --------- | -------- | -------- | ---- |---- | ------------- |
| House Prices (Box-Cox Transformed) | Scoring Metric | 0.186940 | 0.075678 | 0.199408 | 0.198013 |

And for the Log-transformed data we get: 

| Predictor | Category | R2-Score | MSE | MAE | Explained Variance |
| --------- | -------- | -------- | ---- |---- | ------------- |
| G. Wholesale and retail | Industry of Employment | 0.087389 | 0.091154 | 0.219888 | 0.098506 |


### Multiple Regression

In [None]:
# Ordinary least squares on all predictors at once
clf = linear_model.LinearRegression(fit_intercept=True, copy_X=True)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

# Score once on the held-out split; the same text goes to the log and the screen
report = classifier_report(clf, y_test, y_pred)

print("Multiple Regression results:", file=log)
print(report, file=log)
print(report)
print("", file=log)

For Multiple Linear Regression I get:

| Model | R2-Score | MSE | MAE | Explained Variance |
| ----- | -------- | ---- |---- | ------------- |
| Singular Regression | 0.54164 | 0.28576 | 0.33353 | 0.54405 |
| Multiple-Regression | 0.63932 | 0.22486 | 0.30493 | 0.64028 |

## Predicting the 'Future' (2001 > 2011)

The code below trains the model on training sets and then predicts the entire results of 2011.

### Baseline without Hyperparameter Tuning

Accepting only the default parameters for the models so that we have a baseline before tuning.

#### Extremely Random Trees

In [None]:
# Baseline ensemble: 100 trees, everything else left at its default
clf = ensemble.ExtraTreesRegressor(n_jobs=-1, random_state=r_state, n_estimators=100)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

# Score once on the held-out split; the same text goes to the log and the screen
report = classifier_report(clf, y_test, y_pred)

print("Default Extra Trees results:", file=log)
print(report, file=log)
print(report)
print("", file=log)

In [None]:
# Single best-performing option from Notebook 7
clf = ensemble.ExtraTreesRegressor(n_estimators=180, n_jobs=-1, random_state=r_state)  
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

print("Individually Tuned Extra Trees results:")
print(classifier_report(clf, y_test, y_pred))

In [None]:
# Close the fit log so that everything written so far is flushed to disk
log.close()

For the two Extra-Trees Regressors I get:

| Model | R2-Score | MSE | MAE | Explained Variance |
| ----- | -------- | ---- |---- | ------------- |
| Singular Regression | 0.54164 | 0.28576 | 0.33353 | 0.54405 |
| Multiple-Regression | 0.63932 | 0.22486 | 0.30493 | 0.64028 |
| Un-Tuned RF | 0.66722 | 0.20747 | 0.28518 | 0.67187 |
| Single-Tuned RF| 0.69477 | 0.19029 | 0.26110 | 0.69934 |

### GridSearchCV (Run from Script)

Using a grid search to tune the hyperparameters. **This should actually be done using the included script of the same name since Jupyter may time out while running the GridSearch (or, at least, it did for me).**

#### Extremely Random Trees

Similar to RandomForests we can tune the following hyperparameters:
1. `n_estimators` (number of trees; difficult to overfit so large _n_ probably simplest starting point).
2. `max_depth` (maximum depth of trees; can encourage overfitting since more depth == more complexity).
3. `max_features` (maximum number of features to consider at each split; allows more complex models so may lead to overfitting).
4. `min_samples_leaf` (also helps to control depth and reduce overfitting by preventing splits that hold outliers).

Note that the permutations can pile up rather quickly when we incorporate additional parameters such as feature determination and bootstrapping with cross-validation. On a 2.9GHz Core i5 Mac with 16GB of RAM this is working out at about 16s per fold, so 100 candidates with 7 folds == 700 fits == 11,200s == ~187m == ~3hrs.

**Broadly: 100 fits ~= 45 minutes on a laptop.**

<span style="color:red;font-weight:bold">I ended up moving the code below to a script since I was experiencing timeouts on Jupyter.</span>

In [None]:
# Use a grid over parameters of interest -- search grid
# partly extracted from testing with notebook 7 and partly
# from playing with grid ranges here (since results produced
# by manipulating one parameter separately from the others
# don't always replicate well as the single tuned parameter
# for the ensemble as a whole). In other words, just because
# max_depth==10 was the best result from manipulating _only_
# tree depth doesn't mean that it will be the best when you
# start manipulating all the main hyperparameters together.
param_grid = {
    # Three bands of tree counts: 160-200, 1300-1500 and 1800-2000
    "n_estimators"      : list(range(160, 211, 20)) +
                          list(range(1300, 1501, 100)) +
                          list(range(1800, 2001, 100)),
    "max_depth"         : [None], # [int(x) for x in np.arange(start=10, stop=141, step=90)]+[None],
    "min_samples_leaf"  : [1,2,4],
    "max_features"      : [None], # [0.7, 0.85, None], # For regression normally n_features (i.e. auto)
}

# Echo the grid, one hyperparameter per line
for label, key in (("Estimators: ", 'n_estimators'),
                   ("Depth: ", 'max_depth'),
                   ("Minimum Samples Leaf: ", 'min_samples_leaf'),
                   ("Maximum Features: ", 'max_features')):
    print(label + str(param_grid[key]))

# Candidate count = product of the number of options per hyperparameter
n_permutations = 1
for options in param_grid.values():
    n_permutations *= len(options)
print("Number of permutations: " + str(n_permutations))

In [None]:
# Base estimator handed to the grid search; the grid supplies the remaining hyperparameters.
# NOTE(review): the 'mae'/'mse' remark below presumably refers to the split `criterion`, which is not set here -- confirm.
clf = ensemble.ExtraTreesRegressor(n_jobs=-1, random_state=r_state) # Can be 'mae' or 'mse' -- should presumably match scoring below
start = timer()  # Wall-clock timing covers everything up to `duration` below
# There is some disagreement about whether cross-validation or bootstrapping 
# is needed for ExtraTrees (or even RandomForests) regressors:
# https://stats.stackexchange.com/questions/279163/cross-validation-in-extratreesregressor
# This dict is defined but not passed to GridSearchCV, which is given a single scoring string instead.
scoring = {'mae':'neg_mean_absolute_error', 'mse':'neg_mean_squared_error'} #, 'r2':'r2'}
# Note that `cv` names the search object itself; the number of folds is the cv=4 argument.
cv = model_selection.GridSearchCV(estimator=clf, param_grid=param_grid, cv=4, n_jobs=6, verbose=0, scoring='neg_mean_squared_error')
cv.fit(X_train, y_train)
duration = timer() - start
print("Execution complete in: {0:15.1f}s".format(duration) + " (" + str(datetime.timedelta(seconds=duration)) + ")")
print("Best score: " + str(cv.best_score_))
print("Done.")

In [None]:
# Show the best cross-validation score of the fitted search again
print("Best score: " + str(cv.best_score_))

In [None]:
# Open the fit log in the `output` directory in append mode ('a'), so existing content is kept
log = open(os.path.join(output,to_use+'-Fit.txt'),'a')

In [None]:
# Record the searched grid and the best cross-validation score in the fit log
print("Params: ", file=log)
print(param_grid, file=log)
print("Best Cross-Validation score: " + str(cv.best_score_), file=log)

In [None]:
# Extract the best estimator from the GridSearch. The search was created
# with the default refit behaviour, so this estimator has already been
# fitted on the whole of X_train with the winning parameters; fitting it a
# second time would only repeat that (slow) work.
best_clf = cv.best_estimator_
y_pred  = best_clf.predict(X_test)

print("Best parameters from Cross-Validation: " + str(cv.best_params_), file=log)
print("Best parameters from Cross-Validation: " + str(cv.best_params_))
print("", file=log)

# Printing the bound method (not calling it) writes a representation of the
# method that embeds the estimator's own repr.
print("Cross-check against full spec of model: ", file=log)
print(best_clf.get_params, file=log)
print(best_clf.get_params)
print("", file=log)

# Score once on the held-out split; the same text goes to the log and the screen
report = classifier_report(best_clf, y_test, y_pred)
print("Tuned Extra Trees result:", file=log)
print(report, file=log)
print(report)
print("", file=log)

# Create a data frame of feature importance so that we
# can inspect later...
fi = pd.DataFrame.from_dict({'feature':X_test.columns.values, 'importance':best_clf.feature_importances_})
fi.sort_values(by='importance', ascending=False, inplace=True)
fi.to_csv(os.path.join(analytical,to_use+'-Feature Importance.csv.gz'), compression='gzip', index=False)

print("Feature Importances (5 Biggest):", file=log)
print(fi.head(5), file=log)
print(fi.head(5))

In [None]:
# Close the fit log so that the grid-search results are flushed to disk
log.close()

## Outputting Final Results 

Best performing model from testing across grid:
```
Cross-check against full spec of model: 
<bound method BaseEstimator.get_params of ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features=0.85, max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=2, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=1400, n_jobs=-1,
          oob_score=False, random_state=42, verbose=0, warm_start=False)>

Tuned Extra Trees result:
R2:         0.69899
MSE:        0.18766
MAE:        0.25969
Expl. Var:  0.70261
```

In [None]:
# Can override to_use here for other transformations
to_use = 'Untransformed'

SES = load_status_scores(to_use)  # SES scores in 2011

#  Read the transformed data
d01_trs2 = pd.read_csv(os.path.join(analytical,to_use+'-2001-Data-Transformed_and_Scaled.csv.gz'), index_col=0)
d11_trs2 = pd.read_csv(os.path.join(analytical,to_use+'-2011-Data-Transformed_and_Scaled.csv.gz'), index_col=0)

# Data about variables used later in process
vardb = pd.read_csv(os.path.join('data','variables.csv'), index_col=False)
vardb.drop('Description', axis=1, inplace=True)

X_full = d01_trs2

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    d01_trs2, SES['SES Ascent 2001-2011'], test_size=0.2, random_state=r_state)

In [None]:
# Full specification of the best model reported by the grid search, spelled
# out argument by argument so that it can be rebuilt without re-running the
# search.
best_params = dict(
    bootstrap=False,
    criterion='mse',
    max_depth=None,
    max_features=0.85,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    min_samples_leaf=2,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    n_estimators=1400,
    n_jobs=-1,
    oob_score=False,
    random_state=42,
    verbose=0,
    warm_start=False,
)
best_clf = ensemble.ExtraTreesRegressor(**best_params)

In [None]:
# Train the hard-coded best model on the training split and predict the held-out split
best_clf.fit(X_train, y_train)
y_pred  = best_clf.predict(X_test)

In [None]:
# Print the model specification; the bound method is printed rather than called
print("Cross-check against full spec of model: ")
print(best_clf.get_params)
print("")

# Regression scores of the rebuilt model on the held-out split
print("Tuned Extra Trees result:")
print(classifier_report(best_clf, y_test, y_pred))
print("")

For the Extra-Trees Regressors I get:

| Model | R2-Score | MSE | MAE | Explained Variance |
| ----- | -------- | ---- |---- | ------------- |
| Singular Regression | 0.54164 | 0.28576 | 0.33353 | 0.54405 |
| Multiple-Regression | 0.63932 | 0.22486 | 0.30493 | 0.64028 |
| Un-Tuned RF | 0.66722 | 0.20747 | 0.28518 | 0.67187 |
| Single-Tuned RF| 0.69477 | 0.19029 | 0.26110 | 0.69934 |
| Fully-Tuned RF | 0.69739 | 0.18866 | 0.26012 | 0.70101 |

#### Output Graph Representation of 1 Tree

This is used in the article to illustrate how a decision tree within the Random Forest works to split the data so as to make predictions.

In [None]:
import subprocess
from sklearn.tree import export_graphviz

# Export the first tree of the fitted ensemble in Graphviz .dot format and
# render it to PNG with the external `dot` program.
t = best_clf.estimators_[0]
feature_names = X_test.columns.values
dot_path = os.path.join(analytical,to_use + "-tree.dot")
png_path = os.path.join(analytical,to_use + "-tree.png")
export_graphviz(t, out_file=dot_path, filled=True, rounded=True, feature_names=feature_names)

# Pass the arguments as a list (no shell) so that paths containing spaces or
# shell metacharacters are handed to `dot` unchanged, and report a failed or
# missing Graphviz instead of failing silently.
try:
    dot_status = subprocess.call(['dot', '-Tpng', dot_path, '-o', png_path])
except OSError as err:
    print("Could not run Graphviz 'dot': " + str(err))
else:
    if dot_status != 0:
        print("Graphviz 'dot' exited with status " + str(dot_status))

#### Save Feature Importances

In [None]:
# Create a data frame of feature importance so that we
# can inspect later... Built once and then joined onto the variable metadata
# (left join from vardb, matching 'Predictor' to 'feature').
importances = pd.DataFrame.from_dict(
    {'feature':X_test.columns.values, 'importance':best_clf.feature_importances_})

fi = vardb.loc[:,['Predictor','Category']].merge(
    importances, left_on='Predictor', right_on='feature', how='left' )
fi.drop(['feature'], axis=1, inplace=True)  # Duplicates 'Predictor' after the join
fi.sort_values(by='importance', ascending=False, inplace=True)
fi.to_csv(os.path.join(analytical,to_use+'-Feature_Importance.csv.gz'), compression='gzip', index=False)

print("Feature Importances (5 Biggest):")
print(fi.head(5))

In [None]:
SES = load_status_scores(to_use)

# Predict the 2001-2011 ascent for every area in the 2001 predictor table
y_pr = best_clf.predict(X_full)

# One row per area, indexed by area code (index named 'lsoacd')
predicted11 = pd.DataFrame(
    {'SES Ascent 2001-2011 (Predicted)': y_pr},
    index=d01_trs2.index.rename('lsoacd'))
predicted11.sample(3, random_state=r_state)

In [None]:
# Join the observed scores onto the predictions; the inner join keeps only
# areas present in both tables.
predicted11 = predicted11.merge(SES, left_index=True, right_index=True, how='inner')

predicted_ascent = predicted11['SES Ascent 2001-2011 (Predicted)']

# Predicted 2011 score = observed 2001 score + predicted ascent
predicted11['SES 2011 (Predicted)'] = predicted11['SES 2001'] + predicted_ascent

# Divergences are predicted minus observed
predicted11['Score Divergence'] = predicted11['SES 2011 (Predicted)'] - predicted11['SES 2011']
predicted11['Ascent Divergence'] = predicted_ascent - predicted11['SES Ascent 2001-2011']

# Alphabetical column order
predicted11 = predicted11.sort_index(axis=1)

predicted11.to_csv(os.path.join(analytical,to_use+'-Predicted Ascent 2001-2011.csv.gz'), compression='gzip', index=True)

# Sanity check
print("Results data frame has " + str(predicted11.shape[0]) + " rows.")
predicted11.sample(5, random_state=r_state)

In [None]:
# Distribution of the score divergence, drawn on a named figure, saved as a
# PDF in the working directory and then closed.
fig = plt.figure('SES Divergence')
sns.distplot(predicted11['Score Divergence'], kde=True)
fig.savefig("{0}-{1}.pdf".format(to_use, 'SES Ascent 2001-2011-Divergence'), bbox_inches="tight")
plt.close(fig)
print("Done.")

In [None]:
# Observed vs. predicted ascent. jointplot creates its own figure, so no
# figure is opened beforehand: one created here would stay empty and would
# never be closed.
sns.jointplot(x='SES Ascent 2001-2011', y='SES Ascent 2001-2011 (Predicted)', data=predicted11, kind='scatter')
fig = plt.gcf() # *G*et the *C*urrent *F*igure, i.e. the one jointplot just drew
fig.savefig("{0}-{1}.pdf".format(to_use, 'SES Ascent 2001-2011-Divergence (Scatter)'), bbox_inches="tight")
plt.close(fig)
print("Done.")

## Predicting the _Future_ (2011 > 2021)

The code below is used to make predictions for 2021.

In [None]:
#  Make future predictions
y_pred_21 = best_clf.predict(d11_trs2)  #  Make predictions using data from 2011
predicted21 = pd.DataFrame({
        'lsoacd': pd.Series(d11_trs2.index),
        'SES Ascent 2011-2021 (Predicted)': pd.Series(y_pred_21)})
predicted21.set_index('lsoacd', inplace=True)

predicted21.to_csv(os.path.join(analytical,to_use+'-Predicted Ascent 2011-2021.csv.gz'), compression='gzip', index=True)  #  Write results to csv
predicted21.sample(3, random_state=r_state)

In [None]:
# NOTE: `pdf` is a pandas DataFrame of combined predictions, not a PDF document.
pdf = predicted11.merge(predicted21, left_index=True, right_index=True, how='left')  #  Integrate SES 2021 predictions into SES score data
pdf['SES 2021 (Predicted)'] = pdf.loc[:,'SES 2011'] + pdf.loc[:,'SES Ascent 2011-2021 (Predicted)']  # Compute SES score in 2021

#  Compute percentile rank in 2021 (0-100; higher predicted score = higher percentile)
pdf['SES 2021 Percentile'] = pdf.loc[:,'SES 2021 (Predicted)'].rank(ascending=True, pct=True)*100

#  Compute change in LSOA ranking from 2011 to 2021
pdf['SES Percentile Ascent 2011-2021'] = pdf.loc[:,'SES 2021 Percentile'] \
                                          - pdf.loc[:,'SES 2011 Percentile']

# Preview a few rows of observed and predicted scores
pdf[['SES 2001','SES 2011','SES 2021 (Predicted)','SES Ascent 2001-2011','SES Ascent 2011-2021 (Predicted)']].sample(3, random_state=r_state)

In [None]:
# List the available column names before selecting the ones to export
pdf.columns.values

In [None]:
#  Write results to file
cols = ['SES 2001','SES 2011 (Predicted)','SES 2011','SES 2021 (Predicted)',
        'SES 2001 Percentile','SES 2011 Percentile','SES 2021 Percentile',
        'SES Ascent 2001-2011','SES Ascent 2001-2011 (Predicted)','SES Ascent 2011-2021 (Predicted)',
        'SES Percentile Ascent 2001-2011','SES Percentile Ascent 2011-2021',
        'Score Divergence','Ascent Divergence']

pdf = pdf[cols]
fcols = ['Score Divergence','Ascent Divergence']
pdf.loc[:, fcols] = pdf[fcols].astype(float).applymap('{0:.15f}'.format)
pdf.to_csv(os.path.join(analytical,to_use+'-Predictions.csv.gz'), compression='gzip', index=True)