In [1]:
import numpy as np
import pandas as pd
from scipy import stats, optimize
import pymannkendall as mk
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing, metrics, linear_model, ensemble, feature_selection, model_selection

In [2]:
df_agg_all = pd.read_csv('../data/df_agg_all.csv')
print(df_agg_all.shape)
print(list(df_agg_all.columns))
df_agg_all.head()

(4578, 16)
['city', 'loc_x', 'loc_y', 'ui_mean', 'wi_mean', 'wi_mean_h', 'wi_mean_ll', 'wi_mean_ndvi_hh', 'Vv', 'Vn', 'wd_mean', 'r2', 'year_2stage', 'cz_0', 'cz_name_0', 'year']


Unnamed: 0,city,loc_x,loc_y,ui_mean,wi_mean,wi_mean_h,wi_mean_ll,wi_mean_ndvi_hh,Vv,Vn,wd_mean,r2,year_2stage,cz_0,cz_name_0,year
0,1,172.625487,-43.528627,0.593366,-6.673136,2.859704,-12.61954,-0.845661,0.933527,0.476416,-0.24483,0.59474,2001 - 2009,2,Temperate,2002
1,1,172.62562,-43.528701,0.614727,-5.891544,4.418573,-11.046365,-0.839819,0.932336,0.476416,-0.244504,0.61025,2001 - 2009,2,Temperate,2005
2,1,172.625678,-43.528725,0.622018,-6.140499,4.140212,-11.290951,-0.990048,0.941127,0.476416,-0.246891,0.660816,2001 - 2009,2,Temperate,2008
3,1,172.625616,-43.528948,0.630632,-4.153341,4.133229,-6.687358,-1.619324,0.874927,0.476416,-0.22774,0.516292,2010 - 2018,2,Temperate,2011
4,1,172.625862,-43.529062,0.64728,-0.096787,4.350343,0.933541,-1.106507,0.822976,0.476416,-0.210553,0.587813,2010 - 2018,2,Temperate,2014


## Air temperature

In [3]:
df_city_airT_all = pd.DataFrame()
for year in [2002, 2005, 2008, 2011, 2014, 2017]:
    df_airT_urban = pd.read_csv('../data/urban_factors/urban_data_airT_{}.csv'.format(year))[['OBJECTID', 'mean']]
    df_airT_urban.columns = ['city_id', 'airT_urban']
    df_airT_urban['airT_urban'] = df_airT_urban['airT_urban'] - 273.15

    df_airT_rural = pd.read_csv('../data/urban_factors/rural_data_airT_{}.csv'.format(year))[['OBJECTID', 'mean']]
    df_airT_rural.columns = ['city_id', 'airT_rural']
    df_airT_rural['airT_rural'] = df_airT_rural['airT_rural'] - 273.15
    
    df_city_airT = pd.merge(left=df_airT_urban, right=df_airT_rural, on='city_id')
    df_city_airT['airT_diff'] = df_city_airT['airT_urban'] - df_city_airT['airT_rural']
    df_city_airT['year'] = year
    df_city_airT_all = pd.concat([df_city_airT_all, df_city_airT], axis=0).reset_index(drop=True)
print(df_city_airT_all.shape)

(4758, 5)


In [4]:
merge_id_list = []
for i in range(len(df_agg_all)):
    merge_id_list.append('{}_{}'.format(df_agg_all['city'][i], df_agg_all['year'][i]))
df_agg_all['merge_id'] = merge_id_list

merge_id_list = []
for i in range(len(df_city_airT_all)):
    merge_id_list.append('{}_{}'.format(df_city_airT_all['city_id'][i], df_city_airT_all['year'][i]))
df_city_airT_all['merge_id'] = merge_id_list

df_agg_all = pd.merge(left=df_agg_all, right=df_city_airT_all, on='merge_id', how='left')
df_agg_all = df_agg_all.drop('merge_id', axis=1).reset_index(drop=True)
df_agg_all['year'] = df_agg_all['year_x']
df_agg_all = df_agg_all.drop('year_x', axis=1).reset_index(drop=True)
df_agg_all = df_agg_all.drop('year_y', axis=1).reset_index(drop=True)
df_agg_all = df_agg_all.drop('city_id', axis=1).reset_index(drop=True)
print(df_agg_all.shape)
print(list(df_agg_all.columns))

(4578, 19)
['city', 'loc_x', 'loc_y', 'ui_mean', 'wi_mean', 'wi_mean_h', 'wi_mean_ll', 'wi_mean_ndvi_hh', 'Vv', 'Vn', 'wd_mean', 'r2', 'year_2stage', 'cz_0', 'cz_name_0', 'airT_urban', 'airT_rural', 'airT_diff', 'year']


## Preicipitation

In [5]:
df_city_pre_all = pd.DataFrame()
for year in [2002, 2005, 2008, 2011, 2014, 2017]:
    df_city_pre = pd.read_csv('../data/urban_factors/global_city_preci_{}.csv'.format(year))[['OBJECTID', 'mean']]
    df_city_pre.columns = ['city_id', 'pre']
    df_city_pre['pre'] = df_city_pre['pre'] * 1000 / 3.0
    df_city_pre['year'] = year
    df_city_pre_all = pd.concat([df_city_pre_all, df_city_pre], axis=0).reset_index(drop=True)
print(df_city_pre_all.shape)

(4758, 3)


In [6]:
merge_id_list = []
for i in range(len(df_agg_all)):
    merge_id_list.append('{}_{}'.format(df_agg_all['city'][i], df_agg_all['year'][i]))
df_agg_all['merge_id'] = merge_id_list

merge_id_list = []
for i in range(len(df_city_pre_all)):
    merge_id_list.append('{}_{}'.format(df_city_pre_all['city_id'][i], df_city_pre_all['year'][i]))
df_city_pre_all['merge_id'] = merge_id_list

df_agg_all = pd.merge(left=df_agg_all, right=df_city_pre_all, on='merge_id', how='left')
df_agg_all = df_agg_all.drop('merge_id', axis=1).reset_index(drop=True)
df_agg_all['year'] = df_agg_all['year_x']
df_agg_all = df_agg_all.drop('year_x', axis=1).reset_index(drop=True)
df_agg_all = df_agg_all.drop('year_y', axis=1).reset_index(drop=True)
df_agg_all = df_agg_all.drop('city_id', axis=1).reset_index(drop=True)
print(df_agg_all.shape)
print(list(df_agg_all.columns))

(4578, 20)
['city', 'loc_x', 'loc_y', 'ui_mean', 'wi_mean', 'wi_mean_h', 'wi_mean_ll', 'wi_mean_ndvi_hh', 'Vv', 'Vn', 'wd_mean', 'r2', 'year_2stage', 'cz_0', 'cz_name_0', 'airT_urban', 'airT_rural', 'airT_diff', 'pre', 'year']


## LST

In [7]:
df_city_lst_all = pd.DataFrame()
for year in [2002, 2005, 2008, 2011, 2014, 2017]:
    df_lst_urban = pd.read_csv('../data/urban_factors/urban_data_LST_{}.csv'.format(year))[['OBJECTID', 'mean']]
    df_lst_urban.columns = ['city_id', 'lst_urban']
    df_lst_urban['lst_urban'] = df_lst_urban['lst_urban'] * 0.02 - 273.15

    df_lst_rural = pd.read_csv('../data/urban_factors/rural_data_LST_{}.csv'.format(year))[['OBJECTID', 'mean']]
    df_lst_rural.columns = ['city_id', 'lst_rural']
    df_lst_rural['lst_rural'] = df_lst_rural['lst_rural'] * 0.02 - 273.15
    
    df_city_lst = pd.merge(left=df_lst_urban, right=df_lst_rural, on='city_id')
    df_city_lst['lst_diff'] = df_city_lst['lst_urban'] - df_city_lst['lst_rural']
    df_city_lst['year'] = year
    df_city_lst_all = pd.concat([df_city_lst_all, df_city_lst], axis=0).reset_index(drop=True)
print(df_city_lst_all.shape)

(4758, 5)


In [8]:
merge_id_list = []
for i in range(len(df_agg_all)):
    merge_id_list.append('{}_{}'.format(df_agg_all['city'][i], df_agg_all['year'][i]))
df_agg_all['merge_id'] = merge_id_list

merge_id_list = []
for i in range(len(df_city_lst_all)):
    merge_id_list.append('{}_{}'.format(df_city_lst_all['city_id'][i], df_city_lst_all['year'][i]))
df_city_lst_all['merge_id'] = merge_id_list

df_agg_all = pd.merge(left=df_agg_all, right=df_city_lst_all, on='merge_id', how='left')
df_agg_all = df_agg_all.drop('merge_id', axis=1).reset_index(drop=True)
df_agg_all['year'] = df_agg_all['year_x']
df_agg_all = df_agg_all.drop('year_x', axis=1).reset_index(drop=True)
df_agg_all = df_agg_all.drop('year_y', axis=1).reset_index(drop=True)
df_agg_all = df_agg_all.drop('city_id', axis=1).reset_index(drop=True)
print(df_agg_all.shape)
print(list(df_agg_all.columns))

(4578, 23)
['city', 'loc_x', 'loc_y', 'ui_mean', 'wi_mean', 'wi_mean_h', 'wi_mean_ll', 'wi_mean_ndvi_hh', 'Vv', 'Vn', 'wd_mean', 'r2', 'year_2stage', 'cz_0', 'cz_name_0', 'airT_urban', 'airT_rural', 'airT_diff', 'pre', 'lst_urban', 'lst_rural', 'lst_diff', 'year']


## Urban greenness

In [9]:
ui_thred = 0.1
df_all = pd.read_csv('../data/df_all.csv')
df_all_sub = df_all[df_all['UI'] > ui_thred].reset_index(drop=True)
df_city_greeness = df_all_sub.groupby(['city_id', 'year'], as_index=False).agg({'EVI': np.mean})
df_city_greeness.columns = ['city_id', 'year', 'urban_greenness']

In [10]:
merge_id_list = []
for i in range(len(df_agg_all)):
    merge_id_list.append('{}_{}'.format(df_agg_all['city'][i], df_agg_all['year'][i]))
df_agg_all['merge_id'] = merge_id_list

merge_id_list = []
for i in range(len(df_city_greeness)):
    merge_id_list.append('{}_{}'.format(df_city_greeness['city_id'][i], df_city_greeness['year'][i]))
df_city_greeness['merge_id'] = merge_id_list

df_agg_all = pd.merge(left=df_agg_all, right=df_city_greeness, on='merge_id', how='left')
df_agg_all = df_agg_all.drop('merge_id', axis=1).reset_index(drop=True)
df_agg_all['year'] = df_agg_all['year_x']
df_agg_all = df_agg_all.drop('year_x', axis=1).reset_index(drop=True)
df_agg_all = df_agg_all.drop('year_y', axis=1).reset_index(drop=True)
df_agg_all = df_agg_all.drop('city_id', axis=1).reset_index(drop=True)
print(df_agg_all.shape)
print(list(df_agg_all.columns))

(4578, 24)
['city', 'loc_x', 'loc_y', 'ui_mean', 'wi_mean', 'wi_mean_h', 'wi_mean_ll', 'wi_mean_ndvi_hh', 'Vv', 'Vn', 'wd_mean', 'r2', 'year_2stage', 'cz_0', 'cz_name_0', 'airT_urban', 'airT_rural', 'airT_diff', 'pre', 'lst_urban', 'lst_rural', 'lst_diff', 'urban_greenness', 'year']


## Population density

In [11]:
df_city_pop_all = pd.DataFrame()
dict_pop_year = {2002: 2000, 2005: 2005, 2008: 2005, 2011: 2010, 2014: 2015, 2017: 2020}
for year in [2002, 2005, 2008, 2011, 2014, 2017]:
    df_city_pop = pd.read_csv('../data/urban_factors/urban_data_pop_density_{}.csv'.format(dict_pop_year[year]))[['OBJECTID', 'mean']]
    df_city_pop.columns = ['city_id', 'pop_density']
    df_city_pop['pop_density_log'] = np.log10(df_city_pop['pop_density'])
    df_city_pop['year'] = year
    df_city_pop_all = pd.concat([df_city_pop_all, df_city_pop], axis=0).reset_index(drop=True)
print(df_city_pop_all.shape)

(4758, 4)


In [12]:
merge_id_list = []
for i in range(len(df_agg_all)):
    merge_id_list.append('{}_{}'.format(df_agg_all['city'][i], df_agg_all['year'][i]))
df_agg_all['merge_id'] = merge_id_list

merge_id_list = []
for i in range(len(df_city_pop_all)):
    merge_id_list.append('{}_{}'.format(df_city_pop_all['city_id'][i], df_city_pre_all['year'][i]))
df_city_pop_all['merge_id'] = merge_id_list

df_agg_all = pd.merge(left=df_agg_all, right=df_city_pop_all, on='merge_id', how='left')
df_agg_all = df_agg_all.drop('merge_id', axis=1).reset_index(drop=True)
df_agg_all['year'] = df_agg_all['year_x']
df_agg_all = df_agg_all.drop('year_x', axis=1).reset_index(drop=True)
df_agg_all = df_agg_all.drop('year_y', axis=1).reset_index(drop=True)
df_agg_all = df_agg_all.drop('city_id', axis=1).reset_index(drop=True)
print(df_agg_all.shape)
print(list(df_agg_all.columns))

(4578, 26)
['city', 'loc_x', 'loc_y', 'ui_mean', 'wi_mean', 'wi_mean_h', 'wi_mean_ll', 'wi_mean_ndvi_hh', 'Vv', 'Vn', 'wd_mean', 'r2', 'year_2stage', 'cz_0', 'cz_name_0', 'airT_urban', 'airT_rural', 'airT_diff', 'pre', 'lst_urban', 'lst_rural', 'lst_diff', 'urban_greenness', 'pop_density', 'pop_density_log', 'year']


## Trends of variables

In [13]:
df_city_wi_trend = pd.DataFrame()
temp_trend_slope_list = []
temp_trend_p_list = []
pre_trend_slope_list = []
pre_trend_p_list = []
ui_trend_slope_list = []
ui_trend_p_list = []
ui_trend_slope_list = []
ui_trend_p_list = []
wi_trend_slope_list = []
wi_trend_p_list = []
vi_mean_trend_slope_list = []
vi_mean_trend_p_list = []
uhi_trend_slope_list = []
uhi_trend_p_list = []
pop_trend_slope_list = []
pop_trend_p_list = []
city_id_list = list(np.unique(df_agg_all['city']))
for city_id in city_id_list:
    df_one_city = df_agg_all[df_agg_all['city'] == city_id].reset_index(drop=True).sort_values('year')

    mk_res = mk.original_test(df_one_city['airT_urban'])
    slope_temp = mk_res.slope / 3.0
    p_value_temp = mk_res.p
    mk_res = mk.original_test(df_one_city['pre'])
    slope_pre = mk_res.slope / 3.0
    p_value_pre = mk_res.p
    mk_res = mk.original_test(df_one_city['ui_mean'])
    slope_ui = mk_res.slope / 3.0
    p_value_ui = mk_res.p
    mk_res = mk.original_test(df_one_city['wi_mean'])
    slope_ndvi = mk_res.slope / 3.0
    p_value_ndvi = mk_res.p
    mk_res = mk.original_test(df_one_city['urban_greenness'])
    slope_vi_mean = mk_res.slope / 3.0
    p_value_vi_mean = mk_res.p
    mk_res = mk.original_test(df_one_city['lst_diff'])
    slope_uhi = mk_res.slope / 3.0
    p_value_uhi = mk_res.p
    mk_res = mk.original_test(df_one_city['pop_density_log'])
    slope_pop = mk_res.slope / 3.0
    p_value_pop = mk_res.p

    temp_trend_slope_list.append(slope_temp)
    temp_trend_p_list.append(p_value_temp)
    pre_trend_slope_list.append(slope_pre)
    pre_trend_p_list.append(p_value_pre)
    ui_trend_slope_list.append(slope_ui)
    ui_trend_p_list.append(p_value_ui)
    wi_trend_slope_list.append(slope_ndvi)
    wi_trend_p_list.append(p_value_ndvi)
    vi_mean_trend_slope_list.append(slope_vi_mean)
    vi_mean_trend_p_list.append(p_value_vi_mean)
    uhi_trend_slope_list.append(slope_uhi)
    uhi_trend_p_list.append(p_value_uhi)
    pop_trend_slope_list.append(slope_pop)
    pop_trend_p_list.append(p_value_pop)
df_city_wi_trend['city_id'] = city_id_list
df_city_wi_trend['temp_trend_slope'] = temp_trend_slope_list
df_city_wi_trend['temp_trend_p'] = temp_trend_p_list
df_city_wi_trend['pre_trend_slope'] = pre_trend_slope_list
df_city_wi_trend['pre_trend_p'] = pre_trend_p_list
df_city_wi_trend['ui_trend_slope'] = ui_trend_slope_list
df_city_wi_trend['ui_trend_p'] = ui_trend_p_list
df_city_wi_trend['wi_trend_slope_ndvi'] = wi_trend_slope_list
df_city_wi_trend['wi_trend_p_ndvi'] = wi_trend_p_list
df_city_wi_trend['vi_mean_trend_slope'] = vi_mean_trend_slope_list
df_city_wi_trend['vi_mean_trend_slope'] = vi_mean_trend_p_list
df_city_wi_trend['uhi_trend_slope'] = uhi_trend_slope_list
df_city_wi_trend['uhi_trend_p'] = uhi_trend_p_list
df_city_wi_trend['pop_trend_slope'] = pop_trend_slope_list
df_city_wi_trend['pop_trend_p'] = pop_trend_p_list

In [14]:
df_agg_all = pd.merge(left=df_agg_all, right=df_city_wi_trend, left_on='city', right_on='city_id', how='left')
df_agg_all = df_agg_all.drop('city_id', axis=1).reset_index(drop=True)
print('Varible names:\n{}'.format(list(df_agg_all.columns)))
df_agg_all[df_agg_all['city'] == 738]  # Show results of Beijing as an example

Varible names:
['city', 'loc_x', 'loc_y', 'ui_mean', 'wi_mean', 'wi_mean_h', 'wi_mean_ll', 'wi_mean_ndvi_hh', 'Vv', 'Vn', 'wd_mean', 'r2', 'year_2stage', 'cz_0', 'cz_name_0', 'airT_urban', 'airT_rural', 'airT_diff', 'pre', 'lst_urban', 'lst_rural', 'lst_diff', 'urban_greenness', 'pop_density', 'pop_density_log', 'year', 'temp_trend_slope', 'temp_trend_p', 'pre_trend_slope', 'pre_trend_p', 'ui_trend_slope', 'ui_trend_p', 'wi_trend_slope_ndvi', 'wi_trend_p_ndvi', 'vi_mean_trend_slope', 'uhi_trend_slope', 'uhi_trend_p', 'pop_trend_slope', 'pop_trend_p']


Unnamed: 0,city,loc_x,loc_y,ui_mean,wi_mean,wi_mean_h,wi_mean_ll,wi_mean_ndvi_hh,Vv,Vn,...,pre_trend_p,ui_trend_slope,ui_trend_p,wi_trend_slope_ndvi,wi_trend_p_ndvi,vi_mean_trend_slope,uhi_trend_slope,uhi_trend_p,pop_trend_slope,pop_trend_p
4248,738,116.423459,39.902678,0.498411,24.395656,41.6008,9.872876,38.62798,0.407068,0.063574,...,1.0,0.02156,0.008535,2.752431,0.008535,0.024171,0.040267,0.060289,0.021324,0.012899
4249,738,116.424798,39.903501,0.57694,35.312127,67.1687,11.979189,58.178407,0.435082,0.063574,...,1.0,0.02156,0.008535,2.752431,0.008535,0.024171,0.040267,0.060289,0.021324,0.012899
4250,738,116.425595,39.903428,0.643324,49.876378,103.684814,13.280724,85.740118,0.471003,0.063574,...,1.0,0.02156,0.008535,2.752431,0.008535,0.024171,0.040267,0.060289,0.021324,0.012899
4251,738,116.42529,39.904492,0.708005,56.791166,122.052013,13.551082,99.166448,0.459971,0.063574,...,1.0,0.02156,0.008535,2.752431,0.008535,0.024171,0.040267,0.060289,0.021324,0.012899
4252,738,116.424416,39.905052,0.772082,57.424826,133.730613,8.871389,105.007195,0.519402,0.063574,...,1.0,0.02156,0.008535,2.752431,0.008535,0.024171,0.040267,0.060289,0.021324,0.012899
4253,738,116.42293,39.906322,0.812057,66.047393,155.582863,10.339988,120.640648,0.581412,0.063574,...,1.0,0.02156,0.008535,2.752431,0.008535,0.024171,0.040267,0.060289,0.021324,0.012899
