In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import random
import geopandas as gpd

from torch.utils.data import TensorDataset, DataLoader, Subset
from sklearn.model_selection import StratifiedShuffleSplit
from livelossplot import PlotLosses
from sklearn.metrics import accuracy_score


In [17]:
df = pd.read_csv('./data/2010-2017_30*30网格划分.csv')

In [18]:
df

Unnamed: 0,injection_vol,injection_psi,injection_depth_sum,depth_to_basement,HF_number,HF_Base_Water_Volume,HF_Base_NoWater_Volume,earthquake_occurence
0,0.0,0.0,0.0,0.0,0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0,0.0,0.0,0
...,...,...,...,...,...,...,...,...
895,0.0,0.0,0.0,0.0,0,0.0,0.0,0
896,0.0,0.0,0.0,0.0,0,0.0,0.0,0
897,0.0,0.0,0.0,0.0,0,0.0,0.0,0
898,1732820.0,27600.0,13506.0,0.0,0,0.0,0.0,0


# 初步构建逻辑回归模型

In [19]:
#第一步：导入逻辑回归
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

x_columns = df.columns.tolist()[:-1]

#特征features
X = df[x_columns]
#标签labels
Y = df['earthquake_occurence']

X_train,X_test,y_train,y_test=train_test_split(X,Y,train_size=0.8)

#输出数据大小
print('原始数据特征：',X.shape,
     '训练数据特征：',X_train.shape,
      '测试数据特征',X_test.shape)

print('原始数据标签：',Y.shape,
     '训练数据标签：',y_train.shape,
      '测试数据标签',y_test.shape)

feature_number = len(x_columns)

#将训练数据特征转换成二维数组X行*1列
X_train=X_train.values.reshape(-1,feature_number)
#将测试数据特征转换成二维数组X行*1列
X_test=X_test.values.reshape(-1,feature_number)


#第二步：创建模型：逻辑回归
model=LogisticRegression()
#第三步：训练模型
model.fit(X_train,y_train)

#评估模型：准确率
model.score(X_test,y_test)

原始数据特征： (900, 7) 训练数据特征： (720, 7) 测试数据特征 (180, 7)
原始数据标签： (900,) 训练数据标签： (720,) 测试数据标签 (180,)


0.5888888888888889

# 逐步回归

In [20]:
x_columns = df.columns.tolist()[:-1]
print(feature_names)

['injection_vol', 'injection_psi', 'injection_depth_sum', 'depth_to_basement', 'HF_number', 'HF_Base_Water_Volume', 'HF_Base_NoWater_Volume']


In [21]:
y = df["earthquake_occurence"]

In [22]:
## creating function to get model statistics
import numpy as np
import statsmodels.api as sm
def get_stats():
    x = df[x_columns]
    results = sm.OLS(y, x).fit()
    print(results.summary())

get_stats()

                                  OLS Regression Results                                 
Dep. Variable:     earthquake_occurence   R-squared (uncentered):                   0.306
Model:                              OLS   Adj. R-squared (uncentered):              0.300
Method:                   Least Squares   F-statistic:                              56.15
Date:                  Fri, 23 Jul 2021   Prob (F-statistic):                    1.24e-66
Time:                          00:29:50   Log-Likelihood:                         -712.90
No. Observations:                   900   AIC:                                      1440.
Df Residuals:                       893   BIC:                                      1473.
Df Model:                             7                                                  
Covariance Type:              nonrobust                                                  
                             coef    std err          t      P>|t|      [0.025      0.975]
---------

In [23]:
x_columns.remove("HF_Base_NoWater_Volume")
get_stats()

                                  OLS Regression Results                                 
Dep. Variable:     earthquake_occurence   R-squared (uncentered):                   0.305
Model:                              OLS   Adj. R-squared (uncentered):              0.300
Method:                   Least Squares   F-statistic:                              65.30
Date:                  Fri, 23 Jul 2021   Prob (F-statistic):                    2.66e-67
Time:                          00:29:51   Log-Likelihood:                         -713.51
No. Observations:                   900   AIC:                                      1439.
Df Residuals:                       894   BIC:                                      1468.
Df Model:                             6                                                  
Covariance Type:              nonrobust                                                  
                           coef    std err          t      P>|t|      [0.025      0.975]
-----------

In [24]:
x_columns.remove("injection_vol")
get_stats()

                                  OLS Regression Results                                 
Dep. Variable:     earthquake_occurence   R-squared (uncentered):                   0.303
Model:                              OLS   Adj. R-squared (uncentered):              0.299
Method:                   Least Squares   F-statistic:                              77.89
Date:                  Fri, 23 Jul 2021   Prob (F-statistic):                    7.42e-68
Time:                          00:29:52   Log-Likelihood:                         -714.48
No. Observations:                   900   AIC:                                      1439.
Df Residuals:                       895   BIC:                                      1463.
Df Model:                             5                                                  
Covariance Type:              nonrobust                                                  
                           coef    std err          t      P>|t|      [0.025      0.975]
-----------

In [25]:
x_columns.remove("HF_Base_Water_Volume")
get_stats()

                                  OLS Regression Results                                 
Dep. Variable:     earthquake_occurence   R-squared (uncentered):                   0.302
Model:                              OLS   Adj. R-squared (uncentered):              0.299
Method:                   Least Squares   F-statistic:                              97.01
Date:                  Fri, 23 Jul 2021   Prob (F-statistic):                    1.34e-68
Time:                          00:29:52   Log-Likelihood:                         -715.12
No. Observations:                   900   AIC:                                      1438.
Df Residuals:                       896   BIC:                                      1457.
Df Model:                             4                                                  
Covariance Type:              nonrobust                                                  
                          coef    std err          t      P>|t|      [0.025      0.975]
------------

# 构建逻辑模型

In [28]:
#第一步：导入逻辑回归
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

#特征features
X = df[x_columns]
#标签labels
Y = df['earthquake_occurence']

X_train,X_test,y_train,y_test=train_test_split(X,Y,train_size=0.8)

#输出数据大小
print('原始数据特征：',X.shape,
     '训练数据特征：',X_train.shape,
      '测试数据特征',X_test.shape)

print('原始数据标签：',Y.shape,
     '训练数据标签：',y_train.shape,
      '测试数据标签',y_test.shape)

feature_number = len(x_columns)

#将训练数据特征转换成二维数组X行*1列
X_train=X_train.values.reshape(-1,feature_number)
#将测试数据特征转换成二维数组X行*1列
X_test=X_test.values.reshape(-1,feature_number)


#第二步：创建模型：逻辑回归
model=LogisticRegression()
#第三步：训练模型
model.fit(X_train,y_train)

#评估模型：准确率
model.score(X_test,y_test)

原始数据特征： (900, 4) 训练数据特征： (720, 4) 测试数据特征 (180, 4)
原始数据标签： (900,) 训练数据标签： (720,) 测试数据标签 (180,)


0.6555555555555556

In [29]:
print(x_columns)

['injection_psi', 'injection_depth_sum', 'depth_to_basement', 'HF_number']
