In [1]:
from collections import Counter

import numpy as np
import pandas as pd
from scipy import stats
from sklearn import metrics, preprocessing

In [2]:
def bin_values(xs, ys, interval=0.01, range_min=0.0, range_max=1.0, method='mean', thred_min_val_num=10):
    """Aggregate ``ys`` inside consecutive windows of width ``interval`` along ``xs``.

    Window starts are taken from ``np.arange(range_min, range_max, interval)``.
    Each window is closed on both sides, so a sample sitting exactly on a
    shared edge can be picked up by both neighbouring windows.

    Args:
        xs: x coordinates of the samples (array-like).
        ys: y values paired with ``xs`` (array-like).
        interval: width of each window.
        range_min: start of the first window.
        range_max: upper limit handed to ``np.arange`` for the window starts.
        method: 'median' aggregates with the median; 'mean' and any other
            value aggregate with the mean.
        thred_min_val_num: windows holding fewer samples than this are dropped.

    Returns:
        Tuple ``(centers, aggregated)`` of NumPy arrays holding the mid-point
        of every retained window and the aggregated y value for it.
    """
    x_arr = np.array(xs)
    y_arr = np.array(ys)
    # Everything except 'median' (unknown names included) falls back to the mean.
    reducer = np.median if method == 'median' else np.mean

    centers = []
    aggregated = []
    for lower in np.arange(range_min, range_max, interval):
        upper = lower + interval
        members = y_arr[np.where((x_arr >= lower) & (x_arr <= upper))]
        if len(members) < thred_min_val_num:
            continue
        centers.append(lower + 0.5 * interval)
        aggregated.append(reducer(members))
    return np.array(centers), np.array(aggregated)

def bin_values_with_std(xs, ys, interval=0.01, range_min=0.0, range_max=1.0, method='mean', thred_min_val_num=10):
    """Bin ``ys`` along ``xs`` and return the per-bin aggregate and standard deviation.

    Bin starts are taken from ``np.arange(range_min, range_max, interval)``.
    Both bin edges are inclusive, so a sample lying exactly on a shared edge
    may be counted in both adjacent bins.

    Args:
        xs: x coordinates of the samples (list, NumPy array or pandas Series).
        ys: y values paired with ``xs``.
        interval: width of each bin.
        range_min: start of the first bin.
        range_max: upper limit handed to ``np.arange`` for the bin starts.
        method: 'median' aggregates with the median; 'mean' and any other
            value aggregate with the mean.
        thred_min_val_num: bins holding fewer samples than this are dropped.

    Returns:
        Tuple ``(xs_bins, ys_bins, std_bins)`` of NumPy arrays: the centre of
        each retained bin, the aggregated y value and ``np.std`` of the y
        values in that bin.
    """
    # Convert up front so that plain lists and pandas Series are accepted,
    # exactly as bin_values does; without this ``xs >= start`` fails on a list.
    xs = np.asarray(xs)
    ys = np.asarray(ys)
    # Any method other than 'median' is treated as 'mean'.
    reducer = np.median if method == 'median' else np.mean

    xs_bins = []
    ys_bins = []
    std_bins = []
    for start in np.arange(range_min, range_max, interval):
        values = ys[np.where((xs >= start) & (xs <= start + interval))]
        if len(values) < thred_min_val_num:
            continue
        xs_bins.append(start + 0.5 * interval)
        ys_bins.append(reducer(values))
        std_bins.append(np.std(values))
    return np.array(xs_bins), np.array(ys_bins), np.array(std_bins)

def get_idx_without_outliers(data, m=3):
    """Locate the entries of ``data`` lying within ``m`` standard deviations of the mean.

    Args:
        data: numeric values to screen.
        m: half-width of the accepted band, in multiples of ``np.std(data)``.

    Returns:
        The output of ``np.where`` applied to the inlier mask, i.e. a tuple of
        index arrays.
    """
    center = np.mean(data)
    tolerance = m * np.std(data)
    is_inlier = abs(data - center) < tolerance
    return np.where(is_inlier)

def get_df_all(year_list=(2002, 2005, 2008, 2011, 2014, 2017), data_dir='../data'):
    """Load the yearly tables, clean them and attach the city information columns.

    Steps:
        1. Read ``df_<year>.csv`` for every year, keep the columns
           'city_id', 'lon', 'lat', 'EVI', 'UI' and tag each row with its year.
        2. Drop rows whose 'UI' is not strictly positive.
        3. Drop rows whose 'EVI' is 5 or more standard deviations away from
           the mean 'EVI' (see ``get_idx_without_outliers``).
        4. Rescale 'EVI' with ``preprocessing.minmax_scale``.
        5. Left-join the 'cz' and 'cz_name' columns of ``df_city_info.csv``
           on 'city_id'.

    Args:
        year_list: iterable of years to load; one CSV file is read per year.
        data_dir: directory holding ``df_city_info.csv`` and the yearly files.

    Returns:
        A pandas DataFrame with the columns 'city_id', 'lon', 'lat', 'EVI',
        'UI', 'year', 'cz' and 'cz_name'.

    Raises:
        ValueError: if ``year_list`` is empty.
    """
    df_city_info = pd.read_csv('{}/df_city_info.csv'.format(data_dir))

    keep_cols = ['city_id', 'lon', 'lat', 'EVI', 'UI']
    # ``assign`` returns a new frame, which avoids the SettingWithCopyWarning
    # raised when a column is added to a column-sliced frame in place.
    yearly_frames = [
        pd.read_csv('{}/df_{}.csv'.format(data_dir, year))[keep_cols].assign(year=year)
        for year in year_list
    ]
    if not yearly_frames:
        raise ValueError('year_list must contain at least one year')

    # Concatenate once instead of growing the frame inside the loop.
    df_all = pd.concat(yearly_frames, axis=0, ignore_index=True)
    df_all = df_all[df_all['UI'] > 0].reset_index(drop=True)

    # get_idx_without_outliers returns the np.where tuple; its first entry
    # holds the positional row indices to keep.
    valid_idx = get_idx_without_outliers(df_all['EVI'], m=5)[0]
    df_all = df_all.iloc[valid_idx].reset_index(drop=True)

    df_all['EVI'] = preprocessing.minmax_scale(df_all['EVI'])
    df_all = pd.merge(left=df_all, right=df_city_info[['city_id', 'cz', 'cz_name']], on='city_id', how='left')
    return df_all


In [3]:
def calc_ui_vi(ui_list, vi_list, interval=0.01, Vn=None):
    """
    Generate the relationship between VI (EVI) and UI (urbanization intensity).

    The VI values are binned along UI and a cubic polynomial is fitted to the
    bin means. Vv is the fitted VI at UI = 0 and Vn the VI at UI = 1.
    The zero-impact line is the straight line linking Vv and Vn:
    ``Vz(ui) = (Vn - Vv) * ui + Vv``.
    The indirect impact (Wi) of each sample is the relative difference between
    its observed VI and the zero-impact line, in percent:
    ``Wi = (VI - Vz) / Vz * 100``. Samples whose Vz is <= 0.001 are left out.

    Args:
        ui_list: the UI values in a city (array-like).
        vi_list: the VI values in a city, aligned with ``ui_list``.
        interval: the interval for binning the data points along UI from 0 to 1.
            When fewer than 500 samples are given, the Wi values are binned
            with five times this interval.
        Vn: VI at UI = 1. When None, the fitted value at UI = 1 is used,
            with a lower bound of 0.05.

    Returns:
        Tuple ``(ui_bins_list, vi_bins_list, x_fit_list, y_fit_list, Vv, Vn,
        wi_bins_list, r2)``:
            ui_bins_list, vi_bins_list: bin centres and bin-mean VI values.
            x_fit_list, y_fit_list: the fitted curve sampled every 0.01 between
                the minimum and maximum UI.
            Vv, Vn: end points of the zero-impact line.
            wi_bins_list: bin means of Wi along UI.
            r2: R squared of the fitted curve against the bin means.
    """
    # Work on float arrays so positional, vectorised arithmetic is valid for
    # lists and for pandas Series regardless of their index.
    ui_list = np.asarray(ui_list, dtype=float)
    vi_list = np.asarray(vi_list, dtype=float)

    ui_bins_list, vi_bins_list = bin_values(ui_list, vi_list, interval=interval, thred_min_val_num=1)
    reg = np.polyfit(ui_bins_list, vi_bins_list, deg=3)
    x_fit_list = np.arange(np.min(ui_list), np.max(ui_list), 0.01)
    y_fit_list = np.polyval(reg, x_fit_list)
    r2 = metrics.r2_score(vi_bins_list, np.polyval(reg, ui_bins_list))

    Vv = np.polyval(reg, 0)
    if Vn is None:
        Vn = max(np.polyval(reg, 1), 0.05)

    # Zero-impact VI for every sample.
    vz_list = (Vn - Vv) * ui_list + Vv
    # NaN compares False with ``<=``, so NaN entries are not dropped here.
    keep = ~(vz_list <= 0.001)
    vz_kept = vz_list[keep]
    wi_list = (vi_list[keep] - vz_kept) / vz_kept * 100
    ui_list_w = ui_list[keep]

    # Use coarser bins for small samples.
    if len(ui_list) < 500:
        interval = interval * 5
    _, wi_bins_list = bin_values(ui_list_w, wi_list, interval=interval, method='mean', thred_min_val_num=1)

    return ui_bins_list, vi_bins_list, x_fit_list, y_fit_list, Vv, Vn, wi_bins_list, r2

In [4]:
# Filter cities with insufficient data points
year_list = [2002, 2005, 2008, 2011, 2014, 2017]
df = get_df_all(year_list=year_list)

valid_city_ids = []
for city_id in np.unique(df['city_id']):
    df_city = df[df['city_id'] == city_id].reset_index(drop=True)
    for year in year_list:
        ui_values = np.array(df_city.loc[df_city['year'] == year, 'UI'])
        # A year fails when it has fewer than 200 rows, or when its UI values
        # never reach 0.9, or never go down to 0.2.
        if len(ui_values) < 200 or np.max(ui_values) < 0.9 or np.min(ui_values) > 0.2:
            print('City with insufficient data: <{}>'.format(city_id))
            break
    else:
        # Reached only when no year triggered the break above.
        valid_city_ids.append(city_id)
print(len(valid_city_ids))

is_valid_city = df['city_id'].isin(valid_city_ids)
df = df[is_valid_city].reset_index(drop=True)
print(df.shape)
print('city count = {}'.format(len(np.unique(df['city_id']))))

City with insufficient data: <169>
City with insufficient data: <195>
City with insufficient data: <199>
City with insufficient data: <208>
City with insufficient data: <214>
City with insufficient data: <223>
City with insufficient data: <362>
City with insufficient data: <370>
City with insufficient data: <595>
City with insufficient data: <600>
782
(22311856, 8)
city count = 782


In [6]:
# Calculate Vv, Vn, Wi, Wd, and R2 of fitted VI~UI curve for each city.
# One entry is appended to every result list per (city, year) that could be fitted.
year_list_ = []
city_name_list_ = []
loc_x_list = []
loc_y_list = []
cz_list = []
ui_mean_list = []
wi_mean_list = []
Vv_list = []
Vn_list = []
wd_mean_list = []
wi_mean_ll_list = []
wi_mean_hh_list = []
r2_list = []
# (city_id, year) pairs that were skipped because the calculation raised an error.
skipped_city_years = []

city_ids = valid_city_ids
for city_id in city_ids:
    df_city_multiyear = df[df['city_id'] == city_id].reset_index(drop=True)
    # Require at least 50 rows with UI >= 0.9 over all years.
    if len(df_city_multiyear[df_city_multiyear['UI'] >= 0.9]) < 50:
        continue

    # Split the city's rows by year once; both loops below reuse the result.
    df_city_by_year = {
        year: df_city_multiyear[df_city_multiyear['year'] == year].reset_index(drop=True)
        for year in year_list
    }

    # Vn of the city: the smallest yearly 25th percentile of EVI among rows with UI >= 0.9.
    Vn_city_list = []
    for year in year_list:
        df_city = df_city_by_year[year]
        evi_data = np.array(df_city[df_city['UI'] >= 0.9]['EVI'])
        Vn_city_list.append(np.percentile(evi_data, 25))
    Vn_city = np.min(Vn_city_list)

    for year in year_list:
        df_city = df_city_by_year[year]
        try:
            ui_list = np.array(df_city['UI'])
            vi_evi_list = np.array(df_city['EVI'])
            # Keep rows with positive UI and a non-NaN EVI. A boolean mask
            # preserves row order, unlike an index set.
            valid_mask = (ui_list > 0.0) & ~np.isnan(vi_evi_list)
            ui_evi_list = ui_list[valid_mask]
            vi_evi_list = vi_evi_list[valid_mask]
            ui_bins_list, vi_bins_list, x_fit_list, y_fit_list, Vv, Vn, wi_bins_list, r2 = calc_ui_vi(
                ui_list=ui_evi_list, vi_list=vi_evi_list, interval=0.01, Vn=Vn_city)
            # NOTE(review): wd_mean is a plain ratio, whereas the Wi values
            # returned by calc_ui_vi are multiplied by 100 - confirm the
            # differing scales are intended.
            wd_mean = (((Vv + Vn) / 2.0) - Vv) / Vv
            wi_mean = np.mean(wi_bins_list)
            # Mean Wi over the lower and the upper half of the UI bins.
            half = len(wi_bins_list) // 2
            wi_mean_ll = np.mean(wi_bins_list[:half])
            wi_mean_hh = np.mean(wi_bins_list[half:])
        except Exception:
            # Best effort: skip this (city, year) but keep a record of it.
            # KeyboardInterrupt / SystemExit are deliberately not caught.
            skipped_city_years.append((city_id, year))
            continue
        loc_x = np.mean(df_city['lon'])
        loc_y = np.mean(df_city['lat'])
        # Most frequent 'cz' value; among ties the value seen first wins.
        # Counter is linear, whereas max(..., key=list.count) is quadratic.
        cz = Counter(list(df_city['cz'])).most_common(1)[0][0]
        year_list_.append(year)
        city_name_list_.append(city_id)
        cz_list.append(cz)
        loc_x_list.append(loc_x)
        loc_y_list.append(loc_y)
        Vv_list.append(Vv)
        Vn_list.append(Vn)
        ui_mean_list.append(np.mean(ui_evi_list))
        wi_mean_list.append(wi_mean)
        wd_mean_list.append(wd_mean)
        wi_mean_ll_list.append(wi_mean_ll)
        wi_mean_hh_list.append(wi_mean_hh)
        r2_list.append(r2)
print('Calculate indirect impacts for all cities finished.')

Calculate indirect impacts for all cities finished.


In [7]:
# Assemble the per-city, per-year results into one data frame
agg_columns = [
    ('year', year_list_),
    ('city', city_name_list_),
    ('cz', cz_list),
    ('loc_x', loc_x_list),
    ('loc_y', loc_y_list),
    ('ui_mean', ui_mean_list),
    ('wi_mean', wi_mean_list),
    ('wd_mean', wd_mean_list),
    ('r2', r2_list),
]
df_agg_all = pd.DataFrame(dict(agg_columns))
df_agg_all.loc[df_agg_all['city'] == 738]   # Show results for Beijing (City ID: 738)

Unnamed: 0,year,city,cz,loc_x,loc_y,ui_mean,wi_mean,wd_mean,r2
4350,2002,738,3,116.418514,39.855479,0.272997,5.144568,-0.201265,0.996513
4351,2005,738,3,116.422544,39.856021,0.31543,7.313498,-0.214469,0.995558
4352,2008,738,3,116.427919,39.853076,0.349913,11.008801,-0.224012,0.994176
4353,2011,738,3,116.430302,39.8507,0.3888,12.410854,-0.221879,0.992086
4354,2014,738,3,116.432472,39.846889,0.428973,14.76953,-0.22792,0.991647
4355,2017,738,3,116.433273,39.845403,0.469492,17.653572,-0.240845,0.989083


In [12]:
# Keep only the cities whose lowest yearly R2 is above 0.6
df_r2 = df_agg_all.groupby('city', as_index=False)['r2'].min()
city_id_valid_list = df_r2.loc[df_r2['r2'] > 0.6, 'city']
has_good_fit = df_agg_all['city'].isin(city_id_valid_list)
df_agg_all_final = df_agg_all.loc[has_good_fit].reset_index(drop=True)
print(len(np.unique(df_agg_all_final['city'])))

672
