-
Notifications
You must be signed in to change notification settings - Fork 0
/
analysis_random_forest.py
95 lines (88 loc) · 3.56 KB
/
analysis_random_forest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 26 18:32:07 2017
@author: kkrao
"""
import os
import subprocess
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from dirs import Dir_CA
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import export_graphviz
from sklearn import tree
def rf_assemble(year_range,*dfs):
if len(dfs)==1:
dfs=[dfs]
out=pd.DataFrame(index=range(dfs[0].shape[1]*(year_range[-1]-year_range[0]+1)))
for df in dfs:
df=df[(df.index.year>=year_range[0]) &\
(df.index.year<=year_range[-1])].T
array=pd.Series(df.values.flatten(),name=df.columns.name)
out[array.name]=array
return(out)
def rf_fill_nan(df):
    """Fill missing values in ``df`` in place and return the same object.

    Gaps are first back-filled (each NaN takes the next valid value below it
    in its column); anything still missing afterwards, i.e. trailing NaNs, is
    then forward-filled from the last valid value above.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The very same ``df`` object, for call chaining.
    """
    # ``fillna(method=...)`` is deprecated in recent pandas; the dedicated
    # ``bfill``/``ffill`` methods perform the identical fill.
    df.bfill(inplace=True)
    df.ffill(inplace=True)
    return df
def rf_remove_nan(df):
    """Drop every row of ``df`` that holds at least one missing value.

    The frame is modified in place and the same object is handed back. The
    surviving rows keep their original index labels; the index is not
    renumbered.
    """
    df.dropna(axis=0, how='any', inplace=True)
    return df
# Keys of the tables pulled from the HDF5 store; each one becomes a column of
# the assembled frame.
input_sources = [
    'mortality_025_grid', 'BPH_025_grid', 'LAI_025_grid_sum',
    'LAI_025_grid_win', 'RWC', 'aspect_mean', 'aspect_std', 'canopy_height',
    'cwd', 'dominant_leaf_habit', 'elevation_mean', 'elevation_std',
    'forest_cover', 'ppt_sum', 'ppt_win', 'tmax_sum', 'tmax_win',
    'tmean_sum', 'tmean_win', 'vpdmax_sum', 'vpdmax_win', 'EVP_sum',
    'PEVAP_sum', 'EVP_win', 'PEVAP_win', 'vsm_sum', 'vsm_win',
]
year_range = range(2009, 2016)
# Alternative configuration (no 'BPH_025_grid' / 'cwd', years from 2005):
#input_sources=['mortality_025_grid','LAI_025_grid_sum',\
#'LAI_025_grid_win','RWC', 'aspect_mean', 'aspect_std', 'canopy_height',\
# 'dominant_leaf_habit','elevation_mean','elevation_std',\
# 'forest_cover','ppt_sum','ppt_win','tmax_sum','tmax_win',\
# 'tmean_sum','tmean_win','vpdmax_sum','vpdmax_win','EVP_sum',\
#'PEVAP_sum','EVP_win','PEVAP_win','vsm_sum','vsm_win']
#year_range=range(2005,2016)
os.chdir(Dir_CA)
# NOTE(review): the store is left open at module level, as in the original
# script; close it once no further tables are needed.
store = pd.HDFStore(Dir_CA + '/data.h5')
# Build a real list: a ``range`` object cannot be assigned into by index on
# Python 3, so pre-filling ``inputs`` with ``range(...)`` raised TypeError.
inputs = [store[source] for source in input_sources]
Df = rf_assemble(year_range, *inputs)
#Df['missing_data']=Df.T.isnull().sum()
#Df.loc[Df['missing_data']>=1,'missing_data']='yes'
#Df.loc[Df['missing_data']==1,'missing_data']='yes'
#Df.loc[Df['missing_data']==0,'missing_data']='no'
# Keep only the rows for which every input source has a value.
Df = rf_remove_nan(Df)
#Df.to_csv('D:/Krishna/Project/data/rf_data.csv')
#subprocess.call("/usr/bin/Rscript --vanilla /D:/Krishna/Project/codes/rf_model.rmd", shell=True)
#Null analysis-----------------------------------------------------------------
#Null=Df[['RWC','cwd','vsm_sum','vsm_win']]
#Null=Null.isnull()
#Null.replace(True,np.nan,inplace=True)
#for column in Null.columns:
# Null[column].replace(0.0,Null.columns.get_loc(column),inplace=True)
#Null['intersection']=Null.notnull().T.sum()
#Null.intersection[Null.intersection!=4]=np.nan
#Null.intersection[Null.intersection==4]=4
#fig,ax=plt.subplots(figsize=(6,3))
#Null.plot(linestyle='',marker='|',mew='0.1',markersize=10,ax=ax,legend=False,color='darkblue')
##plt.tick_params(axis='y', which='major', labelsize=7)
##ax.set_xlabel('Fraction of Nulls')
#ax.set_ylabel('Features')
#ax.set_title('Data Availability')
#plt.yticks(range(len(Null.columns)),Null.columns)
#labels=range(2009,2016)
#plt.xticks(np.linspace(0,len(Null.index),len(labels),endpoint=False),labels)
#
#
#Null.intersection.replace(False,np.nan,inplace=True)
#Null_frac=(Null.intersection==0).astype(int).mean()
#print('intersection of Nulls = %0.2f'%Null_frac)
#-----------------------------------------------------------------------------
#Df=rf_fill_nan(Df)
#Df['valid_RWC_no_vsm']=0
#Df.loc[(Df.RWC.notnull() & (Df.vsm_sum.isnull() | Df.vsm_win.isnull())),'valid_RWC_no_vsm' ] = 1
#Df['valid_RWC_no_vsm'].sum()/Df.shape[0]*100