In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import geopandas as gpd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import os

# Parent dir
parent_dir = "/content/drive/MyDrive"
os.makedirs(parent_dir, exist_ok=True)
os.chdir(parent_dir)
print("当前工作目录：", os.getcwd())

当前工作目录： /content/drive/MyDrive


In [4]:
elderly_density_path = 'Capstone/IMPORTANT_DATA/pop_density.csv'
accessibility_path='Capstone/IMPORTANT_DATA/accessibility_data.csv'
visiting_numbers_path = 'Capstone/IMPORTANT_DATA/parks_visit_count.csv'
poi_diversity_path = 'Capstone/IMPORTANT_DATA/poi_diversity.csv'

# 加载每个 CSV 文件
elderly_density_df = pd.read_csv(elderly_density_path)
accessibility_df = pd.read_csv(accessibility_path)
visiting_numbers_df = pd.read_csv(visiting_numbers_path)
poi_density_df = pd.read_csv(poi_diversity_path)

In [5]:
parks_gdf = gpd.read_file('Capstone/IMPORTANT_DATA/park_wth_label.csv')
parks_gdf = parks_gdf[["OBJECTID","Shape_Area","2sfca_40_Shape_Area","accessibility_label"]]
parks_gdf

Unnamed: 0,OBJECTID,Shape_Area,2sfca_40_Shape_Area,accessibility_label
0,56,9.85462e-06,2.542174214595406,0
1,57,6.92055e-06,2.372510354112305,0
2,129,9.35e-07,2.679409268283937,1
3,246,4.5142e-07,1.5383810652543626,0
4,247,9.9224e-07,1.54482587030411,0
...,...,...,...,...
105,5513,1.1019e-07,2.2607774403273484,0
106,5514,1.1579e-07,2.2542079071703363,0
107,6469,2.9402e-07,1.8039887002309491,0
108,9370,1.23951e-06,1.766042731965559,0


In [6]:
parks_gdf['OBJECTID'] = parks_gdf['OBJECTID'].astype('int64')
accessibility_df['OBJECTID'] = accessibility_df['OBJECTID'].astype('int64')
elderly_density_df['OBJECTID'] = elderly_density_df['OBJECTID'].astype('int64')
visiting_numbers_df['OBJECTID'] = visiting_numbers_df['OBJECTID'].astype('int64')
poi_density_df['OBJECTID'] = poi_density_df['OBJECTID'].astype('int64')

In [7]:
merged_df = parks_gdf.merge(accessibility_df, on='OBJECTID', how='inner')
merged_df = merged_df.merge(elderly_density_df, on='OBJECTID', how='inner')
merged_df = merged_df.merge(visiting_numbers_df, on='OBJECTID', how='inner')
merged_df = merged_df.merge(poi_density_df, on='OBJECTID', how='inner')

merged_df = merged_df.drop(columns=['R_j'])
merged_df

Unnamed: 0,OBJECTID,Shape_Area,2sfca_40_Shape_Area,accessibility_label,C_j,population_density,visit_count,poi_entropy_500m
0,56,9.85462e-06,2.542174214595406,0,198.444303,4323.209232,38,2.144317
1,57,6.92055e-06,2.372510354112305,0,46.696943,6486.324482,33,2.206376
2,129,9.35e-07,2.679409268283937,1,4662.802711,4341.720982,26,2.283983
3,246,4.5142e-07,1.5383810652543626,0,5287.586959,7487.630415,27,2.254954
4,247,9.9224e-07,1.54482587030411,0,4018.269810,7184.962137,25,2.252107
...,...,...,...,...,...,...,...,...
105,5513,1.1019e-07,2.2607774403273484,0,341.733049,3637.379818,25,2.225537
106,5514,1.1579e-07,2.2542079071703363,0,292.091011,4306.456010,23,2.192877
107,6469,2.9402e-07,1.8039887002309491,0,133.635944,5528.125035,23,2.379101
108,9370,1.23951e-06,1.766042731965559,0,164.687448,10338.352359,31,2.328584


In [8]:
merged_df['accessibility_label'] = merged_df['accessibility_label'].astype('int64')
merged_df

Unnamed: 0,OBJECTID,Shape_Area,2sfca_40_Shape_Area,accessibility_label,C_j,population_density,visit_count,poi_entropy_500m
0,56,9.85462e-06,2.542174214595406,0,198.444303,4323.209232,38,2.144317
1,57,6.92055e-06,2.372510354112305,0,46.696943,6486.324482,33,2.206376
2,129,9.35e-07,2.679409268283937,1,4662.802711,4341.720982,26,2.283983
3,246,4.5142e-07,1.5383810652543626,0,5287.586959,7487.630415,27,2.254954
4,247,9.9224e-07,1.54482587030411,0,4018.269810,7184.962137,25,2.252107
...,...,...,...,...,...,...,...,...
105,5513,1.1019e-07,2.2607774403273484,0,341.733049,3637.379818,25,2.225537
106,5514,1.1579e-07,2.2542079071703363,0,292.091011,4306.456010,23,2.192877
107,6469,2.9402e-07,1.8039887002309491,0,133.635944,5528.125035,23,2.379101
108,9370,1.23951e-06,1.766042731965559,0,164.687448,10338.352359,31,2.328584


In [9]:
features = ['Shape_Area', 'C_j', 'population_density', 'visit_count', 'poi_entropy_500m','accessibility_label']
X = merged_df [features]
y = merged_df ['accessibility_label']
X

Unnamed: 0,Shape_Area,C_j,population_density,visit_count,poi_entropy_500m,accessibility_label
0,9.85462e-06,198.444303,4323.209232,38,2.144317,0
1,6.92055e-06,46.696943,6486.324482,33,2.206376,0
2,9.35e-07,4662.802711,4341.720982,26,2.283983,1
3,4.5142e-07,5287.586959,7487.630415,27,2.254954,0
4,9.9224e-07,4018.269810,7184.962137,25,2.252107,0
...,...,...,...,...,...,...
105,1.1019e-07,341.733049,3637.379818,25,2.225537,0
106,1.1579e-07,292.091011,4306.456010,23,2.192877,0
107,2.9402e-07,133.635944,5528.125035,23,2.379101,0
108,1.23951e-06,164.687448,10338.352359,31,2.328584,0


In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, make_scorer, f1_score
from sklearn.utils.class_weight import compute_class_weight

# 分割数据集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 标准化特征
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 计算类别权重
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(zip(np.unique(y_train), class_weights))

# 创建SVM分类器
svm_classifier = SVC(random_state=42, class_weight=class_weight_dict)

# 定义参数网格（降低C值和gamma值的范围）
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [0.01, 0.1, 1],
    'kernel': ['rbf', 'poly', 'sigmoid']
}

# 使用StratifiedKFold进行交叉验证
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 创建自定义的F1评分器
f1_scorer = make_scorer(f1_score, average='weighted')

# 使用网格搜索进行超参数调优
grid_search = GridSearchCV(svm_classifier, param_grid, cv=cv, scoring=f1_scorer, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# 获取最佳模型
best_svm = grid_search.best_estimator_

# 在测试集上进行预测
y_pred = best_svm.predict(X_test_scaled)

# 打印分类报告
print(classification_report(y_test, y_pred))

# 打印混淆矩阵
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# 打印最佳参数
print("Best parameters:", grid_search.best_params_)

# 打印最佳交叉验证得分
print("Best cross-validation score:", grid_search.best_score_)

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # 标准化特征
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# # 创建SVM分类器
# svm_classifier = SVC(random_state=42)

# # 定义参数网格
# param_grid = {
#     'C': [0.1, 1, 10, 100],
#     'gamma': ['scale', 'auto', 0.1, 1],
#     'kernel': ['rbf', 'poly', 'sigmoid']
# }

# # 使用网格搜索进行超参数调优
# grid_search = GridSearchCV(svm_classifier, param_grid, cv=5, scoring='f1')
# grid_search.fit(X_train_scaled, y_train)

# # 获取最佳模型
# best_svm = grid_search.best_estimator_

# # 在测试集上进行预测
# y_pred = best_svm.predict(X_test_scaled)

# # 打印分类报告
# print(classification_report(y_test, y_pred))

# # 打印混淆矩阵
# print("Confusion Matrix:")
# print(confusion_matrix(y_test, y_pred))

# # 打印最佳参数
# print("Best parameters:", grid_search.best_params_)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00        10

    accuracy                           1.00        22
   macro avg       1.00      1.00      1.00        22
weighted avg       1.00      1.00      1.00        22

Confusion Matrix:
[[12  0]
 [ 0 10]]
Best parameters: {'C': 0.1, 'gamma': 0.1, 'kernel': 'sigmoid'}
Best cross-validation score: 1.0


In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, make_scorer, f1_score

In [15]:
def train_and_evaluate(model, param_grid, X_train, y_train, X_test, y_test):
    # 使用StratifiedKFold进行交叉验证
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # 创建自定义的F1评分器
    f1_scorer = make_scorer(f1_score, average='weighted')

    # 使用网格搜索进行超参数调优
    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring=f1_scorer, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # 获取最佳模型
    best_model = grid_search.best_estimator_

    # 在测试集上进行预测
    y_pred = best_model.predict(X_test)

    # 打印结果
    print("Best parameters:", grid_search.best_params_)
    print("Best cross-validation score:", grid_search.best_score_)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    return best_model

In [16]:
# 假设X和y已经准备好，并且已经进行了训练集和测试集的分割

# 标准化特征
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 随机森林
rf_model = RandomForestClassifier(random_state=42)
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

print("Random Forest Results:")
best_rf = train_and_evaluate(rf_model, rf_param_grid, X_train_scaled, y_train, X_test_scaled, y_test)

# 梯度提升树
gb_model = GradientBoostingClassifier(random_state=42)
gb_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

print("\nGradient Boosting Results:")
best_gb = train_and_evaluate(gb_model, gb_param_grid, X_train_scaled, y_train, X_test_scaled, y_test)

Random Forest Results:
Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best cross-validation score: 1.0

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00        10

    accuracy                           1.00        22
   macro avg       1.00      1.00      1.00        22
weighted avg       1.00      1.00      1.00        22

Confusion Matrix:
[[12  0]
 [ 0 10]]

Gradient Boosting Results:
Best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best cross-validation score: 1.0

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00        10

    accuracy                           1.00        22
   macro avg       1.00

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as functionl
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class MLP(nn.Module):

  def __init__(self, hidden_dim):
      super().__init__()
      # self.n_layer = n_layer
      self.hidden_dim = hidden_dim
      self.models = nn.Sequential(
          nn.Linear(self.hidden_dim, 256),
          nn.ReLU(),
          nn.Linear(256,128),
          nn.ReLU(),
          nn.Linear(128,2),
          nn.Sigmoid()
      )

  def forward(self,input):
    x = self.models(input)
    return x

class parkDataset(Dataset):
    def __init__(self, data,transform=None, target_transform=None):
        self.dataset = data
        # breakpoint()
        self.x = self.dataset[self.dataset.columns[[1,4,5,6,7]]]
        self.labels = self.dataset['accessibility_label']

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        x = torch.from_numpy(self.x.iloc[idx].values)
        label = self.labels.iloc[idx]
        # breakpoint()

        return x.float(), torch.tensor(label, dtype=torch.float32)

In [None]:
merged_df["Shape_Area"] = merged_df["Shape_Area"].astype(float)

In [None]:
learning_rate = 1e-3
batch_size = 64
loss_fn = nn.CrossEntropyLoss()
# breakpoint()
data = merged_df
training_data = data.iloc[:90,:]
testing_data = data.iloc[90:,:]
training_dataset = parkDataset(training_data)
testing_dataset = parkDataset(testing_data)
train_dataloader = DataLoader(training_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(testing_dataset, batch_size=8, shuffle=True)

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    # Set the model to training mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y.long())

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * batch_size + len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


def test_loop(dataloader, model, loss_fn):
    # Set the model to evaluation mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    # Evaluating the model with torch.no_grad() ensures that no gradients are computed during test mode
    # also serves to reduce unnecessary gradient computations and memory usage for tensors with requires_grad=True
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y.long()).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
            # breakpoint()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [None]:
loss_fn = nn.CrossEntropyLoss()
model = MLP(5)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

epochs = 10000
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
    test_loop(test_dataloader, model, loss_fn)
print("Done!")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
-------------------------------
loss: 0.938262  [    8/   90]
Test Error: 
 Accuracy: 50.0%, Avg loss: 0.813262 

Epoch 1722
-------------------------------
loss: 1.188262  [    8/   90]
Test Error: 
 Accuracy: 50.0%, Avg loss: 0.813262 

Epoch 1723
-------------------------------
loss: 0.688262  [    8/   90]
Test Error: 
 Accuracy: 50.0%, Avg loss: 0.813262 

Epoch 1724
-------------------------------
loss: 0.688262  [    8/   90]
Test Error: 
 Accuracy: 50.0%, Avg loss: 0.896595 

Epoch 1725
-------------------------------
loss: 1.188262  [    8/   90]
Test Error: 
 Accuracy: 50.0%, Avg loss: 0.813262 

Epoch 1726
-------------------------------
loss: 0.938262  [    8/   90]
Test Error: 
 Accuracy: 50.0%, Avg loss: 0.813262 

Epoch 1727
-------------------------------
loss: 0.813262  [    8/   90]
Test Error: 
 Accuracy: 50.0%, Avg loss: 0.813262 

Epoch 1728
-------------------------------
loss: 1.313262  [    8/   90

KeyboardInterrupt: 