In [79]:
# 필수 라이브러리 불러오기
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import statsmodels.api as sm

pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

## 데이터 불러오고 합치기.

In [69]:
# train과 test 데이터
train = pd.read_csv('../train_data.csv')
test = pd.read_csv('../test_data.csv')

# 좌표와 고저 데이터
loc_data = pd.read_csv('../aux_data_StationLocation_editver.csv', encoding='cp949')

In [70]:
ad1 = set(train['address'].unique())
ad2 = set(loc_data['address'].unique())
ad1.difference(ad2)

{'서울 서대문구 세검정로4길 32 (홍제3동 주민센터) ', '서울 성북구 돈암동 8-164번지 '}

In [71]:
train = train.replace({'서울 서대문구 세검정로4길 32 (홍제3동 주민센터) ' : '서울 서대문구 세검정로4길 32 (홍제3동 주민센터)', 
                       '서울 성북구 돈암동 8-164번지 ' : '서울 성북구 돈암동 8-164번지'})
test = test.replace({'서울 서대문구 세검정로4길 32 (홍제3동 주민센터) ' : '서울 서대문구 세검정로4길 32 (홍제3동 주민센터)', 
                     '서울 성북구 돈암동 8-164번지 ' : '서울 성북구 돈암동 8-164번지'})

In [72]:
train

Unnamed: 0,ID,year,month,location,no2,o3,co,so2,pm10,pm2.5,address,Y
0,1,2019,1,강남구,0.037032,0.009387,0.712903,0.006323,61.774194,44.161290,서울 강남구 학동로 426 강남구청 별관 1동,3
1,2,2019,1,강남대로,0.054645,0.007677,0.980645,0.005484,71.677419,40.322581,서울 서초구 강남대로 201 서초구민회관 앞 중앙차로 (양재동),3
2,3,2019,1,강동구,0.044645,0.009516,0.709677,0.005065,65.161290,41.612903,서울 강동구 구천면로 42길 59 천호1동 주민센터,3
3,4,2019,1,강변북로,0.055290,0.007548,0.638710,0.004677,71.290323,40.483871,서울 성동구 강변북로 257 한강사업본부 옆,3
4,5,2019,1,강북구,0.030645,0.017452,0.700000,0.004000,71.387097,39.387097,서울 강북구 삼양로 139길 49 우이동 주민센터,3
...,...,...,...,...,...,...,...,...,...,...,...,...
2167,2168,2022,12,한강대로,0.031645,0.014355,0.596774,0.003645,42.741935,16.677419,서울 용산구 한강대로 405 (서울역 앞),2
2168,2169,2022,12,항동,0.015645,0.017355,0.525806,0.003742,31.225806,12.935484,서울 구로구 연동로 240 (푸른수목원 내),2
2169,2170,2022,12,행주,0.021968,0.016548,0.429032,0.003774,37.709677,17.322581,김포시 고촌읍 은행영사정로 197 (신곡수중보 관리사무소),2
2170,2171,2022,12,홍릉로,0.030806,0.011226,0.529032,0.003355,38.096774,21.548387,서울 동대문구 홍릉로 1 (청량리전철역 사거리 SC제일은행 앞),2


In [73]:
loc_data

Unnamed: 0,location,address,admin_region,x,y,height,height_floor
0,강남구,서울 강남구 학동로 426 강남구청 별관 1동,삼성2동,37.517968,127.04706,29,
1,강남대로,서울 서초구 강남대로 201 서초구민회관 앞 중앙차로 (양재동),양재1동,37.481828,127.035957,28,
2,강동구,서울 강동구 구천면로 42길 59 천호1동 주민센터,천호1동,37.545004,127.13682,27,
3,강변북로,서울 성동구 강변북로 257 한강사업본부 옆,성수동,37.538962,127.041604,16,
4,강북구,서울 강북구 삼양로 139길 49 우이동 주민센터,수유제1동,37.647934,127.01187,43,
5,강서구,서울 강서구 강서로 45 다길 71 화곡3동 푸른들청소년도서관,화곡제3동,37.54467,126.835179,20,
6,공항대로,서울 강서구 마곡동 727-1091 마곡역 중앙차로정류장 옆,가양제1동,37.559649,126.828791,9,
7,관악구,서울 관악구 신림동길 14 신림동 주민센터,신림동,37.487396,126.927114,20,
8,관악산,과천시 자하동길 64 (관악산 중계소),중앙동,37.441127,126.964447,588,
9,광진구,서울 광진구 광나루로 571 구의 아리수정수센터,구의제2동,37.545947,127.092951,40,


In [74]:
train = pd.merge(left = train, right = loc_data[['address', 'x', 'y', 'height']], how = 'left', on = 'address')
test = pd.merge(left = test, right = loc_data[['address', 'x', 'y', 'height']], how = 'left', on = 'address')

In [75]:
# 잘합쳐졌는지 확인.
train

Unnamed: 0,ID,year,month,location,no2,o3,co,so2,pm10,pm2.5,address,Y,x,y,height
0,1,2019,1,강남구,0.037032,0.009387,0.712903,0.006323,61.774194,44.161290,서울 강남구 학동로 426 강남구청 별관 1동,3,37.517968,127.047060,29
1,2,2019,1,강남대로,0.054645,0.007677,0.980645,0.005484,71.677419,40.322581,서울 서초구 강남대로 201 서초구민회관 앞 중앙차로 (양재동),3,37.481828,127.035957,28
2,3,2019,1,강동구,0.044645,0.009516,0.709677,0.005065,65.161290,41.612903,서울 강동구 구천면로 42길 59 천호1동 주민센터,3,37.545004,127.136820,27
3,4,2019,1,강변북로,0.055290,0.007548,0.638710,0.004677,71.290323,40.483871,서울 성동구 강변북로 257 한강사업본부 옆,3,37.538962,127.041604,16
4,5,2019,1,강북구,0.030645,0.017452,0.700000,0.004000,71.387097,39.387097,서울 강북구 삼양로 139길 49 우이동 주민센터,3,37.647934,127.011870,43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2167,2168,2022,12,한강대로,0.031645,0.014355,0.596774,0.003645,42.741935,16.677419,서울 용산구 한강대로 405 (서울역 앞),2,37.554838,126.971733,26
2168,2169,2022,12,항동,0.015645,0.017355,0.525806,0.003742,31.225806,12.935484,서울 구로구 연동로 240 (푸른수목원 내),2,37.481968,126.823669,24
2169,2170,2022,12,행주,0.021968,0.016548,0.429032,0.003774,37.709677,17.322581,김포시 고촌읍 은행영사정로 197 (신곡수중보 관리사무소),2,37.609087,126.786320,19
2170,2171,2022,12,홍릉로,0.030806,0.011226,0.529032,0.003355,38.096774,21.548387,서울 동대문구 홍릉로 1 (청량리전철역 사거리 SC제일은행 앞),2,37.580453,127.044368,14


In [68]:
test

Unnamed: 0,ID,year,month,location,no2,o3,co,so2,pm10,pm2.5,address,x_x,y_x,height_x,x_y,y_y,height_y,x,y,height
0,1,2023,1,강남구,0.030742,0.016665,0.538387,0.003255,46.548387,25.032258,서울 강남구 학동로 426 강남구청 별관 1동,37.517968,127.047060,29.0,37.517968,127.047060,29,37.517968,127.047060,29
1,2,2023,1,강남대로,0.034516,0.012352,0.748387,0.003581,50.548387,28.000000,서울 서초구 강남대로 201 서초구민회관 앞 중앙차로 (양재동),37.481828,127.035957,28.0,37.481828,127.035957,28,37.481828,127.035957,28
2,3,2023,1,강동구,0.024948,0.013658,0.502903,0.002639,49.483871,28.161290,서울 강동구 구천면로 42길 59 천호1동 주민센터,37.545004,127.136820,27.0,37.545004,127.136820,27,37.545004,127.136820,27
3,4,2023,1,강변북로,0.036358,0.012339,0.710000,0.002719,48.483871,28.806452,서울 성동구 강변북로 257 한강사업본부 옆,37.538962,127.041604,16.0,37.538962,127.041604,16,37.538962,127.041604,16
4,5,2023,1,강북구,0.023607,0.019165,0.613226,0.002629,48.935484,24.580645,서울 강북구 삼양로 139길 49 우이동 주민센터,37.647934,127.011870,43.0,37.647934,127.011870,43,37.647934,127.011870,43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
530,531,2023,12,한강대로,0.033323,0.016275,0.661935,0.003065,41.838710,23.806452,서울 용산구 한강대로 405 (서울역 앞),37.554838,126.971733,26.0,37.554838,126.971733,26,37.554838,126.971733,26
531,532,2023,12,항동,0.026090,0.020835,0.618387,0.002713,32.225806,19.774194,서울 구로구 연동로 240 (푸른수목원 내),37.481968,126.823669,24.0,37.481968,126.823669,24,37.481968,126.823669,24
532,533,2023,12,행주,0.025223,0.019752,0.631905,0.002694,34.967742,17.903226,김포시 고촌읍 은행영사정로 197 (신곡수중보 관리사무소),37.609087,126.786320,19.0,37.609087,126.786320,19,37.609087,126.786320,19
533,534,2023,12,홍릉로,0.034510,0.014148,0.651935,0.003077,36.064516,19.032258,서울 동대문구 홍릉로 1 (청량리전철역 사거리 SC제일은행 앞),37.580453,127.044368,14.0,37.580453,127.044368,14,37.580453,127.044368,14


In [32]:
# nan값이 있는지 확인.
nan_rows = train[train.isna().any(axis=1)]
nan_rows

Unnamed: 0,ID,year,month,location,no2,o3,co,so2,pm10,pm2.5,address,Y,x,y,height


제대로 합치지 않아져서 문제점이 무엇인지 확인 후에 고친다. -> 완료

## Logistic

In [43]:
var1 = ['no2', 'o3', 'co', 'so2', 'pm10', 'pm2.5']
var2 = ['year','month','no2', 'o3', 'co', 'so2', 'pm10', 'pm2.5']
var3 = ['year','month','no2', 'o3', 'co', 'so2', 'pm10', 'pm2.5', 'x', 'y', 'height']

In [37]:
# 특징 변수(X)와 타겟 변수(Y) 분리
X = train[var1]
y = train['Y']

# 데이터셋을 학습 세트와 테스트 세트로 분리 (80% 학습, 20% 테스트)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 로지스틱 회귀 모델 정의 및 학습
model1 = LogisticRegression()
model1.fit(X_train, y_train)

# 테스트 데이터에 대한 예측
y_pred = model1.predict(X_test)

# 모델 평가
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# 결과 출력
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 0.8344827586206897
Confusion Matrix:
[[ 24  45   0]
 [ 26 321   0]
 [  0   1  18]]
Classification Report:
              precision    recall  f1-score   support

           1       0.48      0.35      0.40        69
           2       0.87      0.93      0.90       347
           3       1.00      0.95      0.97        19

    accuracy                           0.83       435
   macro avg       0.78      0.74      0.76       435
weighted avg       0.82      0.83      0.82       435



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [38]:
# 특징 변수(X)와 타겟 변수(Y) 분리
X = train[var2]
y = train['Y']

# 데이터셋을 학습 세트와 테스트 세트로 분리 (80% 학습, 20% 테스트)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 로지스틱 회귀 모델 정의 및 학습
model2 = LogisticRegression()
model2.fit(X_train, y_train)

# 테스트 데이터에 대한 예측
y_pred = model2.predict(X_test)

# 모델 평가
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# 결과 출력
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 0.8482758620689655
Confusion Matrix:
[[ 29  40   0]
 [ 25 322   0]
 [  0   1  18]]
Classification Report:
              precision    recall  f1-score   support

           1       0.54      0.42      0.47        69
           2       0.89      0.93      0.91       347
           3       1.00      0.95      0.97        19

    accuracy                           0.85       435
   macro avg       0.81      0.77      0.78       435
weighted avg       0.84      0.85      0.84       435



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [36]:
# 특징 변수(X)와 타겟 변수(Y) 분리
X = train[var3]
y = train['Y']

# 데이터셋을 학습 세트와 테스트 세트로 분리 (80% 학습, 20% 테스트)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 로지스틱 회귀 모델 정의 및 학습
model3 = LogisticRegression()
model3.fit(X_train, y_train)

# 테스트 데이터에 대한 예측
y_pred = model3.predict(X_test)

# 모델 평가
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# 결과 출력
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 0.871264367816092
Confusion Matrix:
[[ 35  34   0]
 [ 21 326   0]
 [  0   1  18]]
Classification Report:
              precision    recall  f1-score   support

           1       0.62      0.51      0.56        69
           2       0.90      0.94      0.92       347
           3       1.00      0.95      0.97        19

    accuracy                           0.87       435
   macro avg       0.84      0.80      0.82       435
weighted avg       0.86      0.87      0.87       435



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Scaler도 적용

In [55]:
# 특징 변수(X)와 타겟 변수(Y) 분리
X = train[var1]
y = train['Y']

# 데이터셋을 학습 세트와 테스트 세트로 분리 (80% 학습, 20% 테스트)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 데이터 정규화 (표준화)
Standard_Scaler = StandardScaler()
X_train = Standard_Scaler.fit_transform(X_train)
X_test = Standard_Scaler.transform(X_test)

# 로지스틱 회귀 모델 정의 및 학습
model = LogisticRegression()
model.fit(X_train, y_train)

# 테스트 데이터에 대한 예측
y_pred = model.predict(X_test)

# 모델 평가
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# 결과 출력
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 0.8850574712643678
Confusion Matrix:
[[ 40  29   0]
 [ 18 328   1]
 [  0   2  17]]
Classification Report:
              precision    recall  f1-score   support

           1       0.69      0.58      0.63        69
           2       0.91      0.95      0.93       347
           3       0.94      0.89      0.92        19

    accuracy                           0.89       435
   macro avg       0.85      0.81      0.83       435
weighted avg       0.88      0.89      0.88       435



In [56]:
# 특징 변수(X)와 타겟 변수(Y) 분리
X = train[var2]
y = train['Y']

# 데이터셋을 학습 세트와 테스트 세트로 분리 (80% 학습, 20% 테스트)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 데이터 정규화 (표준화)
Standard_Scaler = StandardScaler()
X_train = Standard_Scaler.fit_transform(X_train)
X_test = Standard_Scaler.transform(X_test)

# 로지스틱 회귀 모델 정의 및 학습
model = LogisticRegression()
model.fit(X_train, y_train)

# 테스트 데이터에 대한 예측
y_pred = model.predict(X_test)

# 모델 평가
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# 결과 출력
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 0.8896551724137931
Confusion Matrix:
[[ 40  29   0]
 [ 17 329   1]
 [  0   1  18]]
Classification Report:
              precision    recall  f1-score   support

           1       0.70      0.58      0.63        69
           2       0.92      0.95      0.93       347
           3       0.95      0.95      0.95        19

    accuracy                           0.89       435
   macro avg       0.86      0.83      0.84       435
weighted avg       0.88      0.89      0.89       435



In [76]:
# 특징 변수(X)와 타겟 변수(Y) 분리
X = train[var3]
y = train['Y']

# 데이터셋을 학습 세트와 테스트 세트로 분리 (80% 학습, 20% 테스트)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 데이터 정규화 (표준화)
Standard_Scaler = StandardScaler()
X_train = Standard_Scaler.fit_transform(X_train)
X_test = Standard_Scaler.transform(X_test)

# 로지스틱 회귀 모델 정의 및 학습
model = LogisticRegression()
model.fit(X_train, y_train)

# 테스트 데이터에 대한 예측
y_pred = model.predict(X_test)

# 모델 평가
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# 결과 출력
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 0.896551724137931
Confusion Matrix:
[[ 42  27   0]
 [ 16 330   1]
 [  0   1  18]]
Classification Report:
              precision    recall  f1-score   support

           1       0.72      0.61      0.66        69
           2       0.92      0.95      0.94       347
           3       0.95      0.95      0.95        19

    accuracy                           0.90       435
   macro avg       0.86      0.84      0.85       435
weighted avg       0.89      0.90      0.89       435



In [58]:
# 특징 변수(X)와 타겟 변수(Y) 분리
X = train[var3]
y = train['Y']

# 데이터셋을 학습 세트와 테스트 세트로 분리 (80% 학습, 20% 테스트)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 데이터 정규화 (표준화)
Robust_Scaler = RobustScaler()
X_train = Robust_Scaler.fit_transform(X_train)
X_test = Robust_Scaler.transform(X_test)

# 로지스틱 회귀 모델 정의 및 학습
model = LogisticRegression()
model.fit(X_train, y_train)

# 테스트 데이터에 대한 예측
y_pred = model.predict(X_test)

# 모델 평가
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# 결과 출력
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 0.8919540229885058
Confusion Matrix:
[[ 42  27   0]
 [ 16 330   1]
 [  0   3  16]]
Classification Report:
              precision    recall  f1-score   support

           1       0.72      0.61      0.66        69
           2       0.92      0.95      0.93       347
           3       0.94      0.84      0.89        19

    accuracy                           0.89       435
   macro avg       0.86      0.80      0.83       435
weighted avg       0.89      0.89      0.89       435



In [59]:
# 특징 변수(X)와 타겟 변수(Y) 분리
X = train[var3]
y = train['Y']

# 데이터셋을 학습 세트와 테스트 세트로 분리 (80% 학습, 20% 테스트)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 데이터 정규화 (표준화)
MinMax_Scaler = MinMaxScaler()
X_train = MinMax_Scaler.fit_transform(X_train)
X_test = MinMax_Scaler.transform(X_test)

# 로지스틱 회귀 모델 정의 및 학습
model = LogisticRegression()
model.fit(X_train, y_train)

# 테스트 데이터에 대한 예측
y_pred = model.predict(X_test)

# 모델 평가
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# 결과 출력
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 0.871264367816092
Confusion Matrix:
[[ 31  38   0]
 [ 12 334   1]
 [  0   5  14]]
Classification Report:
              precision    recall  f1-score   support

           1       0.72      0.45      0.55        69
           2       0.89      0.96      0.92       347
           3       0.93      0.74      0.82        19

    accuracy                           0.87       435
   macro avg       0.85      0.72      0.77       435
weighted avg       0.86      0.87      0.86       435



In [77]:
test_data = Standard_Scaler.transform(test[['year','month','no2', 'o3', 'co', 'so2', 'pm10', 'pm2.5', 'x', 'y', 'height']])
test_pred = model.predict(test_data)

test_pred

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

In [78]:
df = pd.DataFrame(test_pred)
df.to_csv('10.26_logistic.csv')

In [80]:
# 위 결과가 var3로 standard scaler를 사용해서 얻은 회귀계수의 결과물.

# 회귀 계수
coefficients = model.coef_[0]
print("Coefficients:", coefficients)

# 오즈 비
odds_ratios = np.exp(coefficients)
print("Odds Ratios:", odds_ratios)

Coefficients: [ 0.33669286  0.12962152 -0.72415266 -1.61596406  0.06531315 -0.12525467
 -1.78615279 -2.90759414 -0.08516181  0.09103722 -0.34535773]
Odds Ratios: [1.40030891 1.13839744 0.48473513 0.19869902 1.06749326 0.88227219
 0.16760374 0.05460695 0.91836367 1.09530977 0.70796705]


In [None]:
# 특징 변수(X)와 타겟 변수(Y) 분리
X = train[var3]
y = train['Y']

# 데이터셋을 학습 세트와 테스트 세트로 분리 (80% 학습, 20% 테스트)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 데이터 정규화 (표준화)
MinMax_Scaler = MinMaxScaler()
X_train = MinMax_Scaler.fit_transform(X_train)
X_test = MinMax_Scaler.transform(X_test)

# 로지스틱 회귀 모델 정의 및 학습
model = LogisticRegression()
model.fit(X_train, y_train)

# 테스트 데이터에 대한 예측
y_pred = model.predict(X_test)

# 모델 평가
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# 결과 출력
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

In [85]:
# 월자료가 순서가 없을 것이라 생각되어 변경.
train = pd.get_dummies(data = train, columns = ['month'], dtype = int)
train

Unnamed: 0,ID,year,location,no2,o3,co,so2,pm10,pm2.5,address,Y,x,y,height,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
0,1,2019,강남구,0.037032,0.009387,0.712903,0.006323,61.774194,44.161290,서울 강남구 학동로 426 강남구청 별관 1동,3,37.517968,127.047060,29,1,0,0,0,0,0,0,0,0,0,0,0
1,2,2019,강남대로,0.054645,0.007677,0.980645,0.005484,71.677419,40.322581,서울 서초구 강남대로 201 서초구민회관 앞 중앙차로 (양재동),3,37.481828,127.035957,28,1,0,0,0,0,0,0,0,0,0,0,0
2,3,2019,강동구,0.044645,0.009516,0.709677,0.005065,65.161290,41.612903,서울 강동구 구천면로 42길 59 천호1동 주민센터,3,37.545004,127.136820,27,1,0,0,0,0,0,0,0,0,0,0,0
3,4,2019,강변북로,0.055290,0.007548,0.638710,0.004677,71.290323,40.483871,서울 성동구 강변북로 257 한강사업본부 옆,3,37.538962,127.041604,16,1,0,0,0,0,0,0,0,0,0,0,0
4,5,2019,강북구,0.030645,0.017452,0.700000,0.004000,71.387097,39.387097,서울 강북구 삼양로 139길 49 우이동 주민센터,3,37.647934,127.011870,43,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2167,2168,2022,한강대로,0.031645,0.014355,0.596774,0.003645,42.741935,16.677419,서울 용산구 한강대로 405 (서울역 앞),2,37.554838,126.971733,26,0,0,0,0,0,0,0,0,0,0,0,1
2168,2169,2022,항동,0.015645,0.017355,0.525806,0.003742,31.225806,12.935484,서울 구로구 연동로 240 (푸른수목원 내),2,37.481968,126.823669,24,0,0,0,0,0,0,0,0,0,0,0,1
2169,2170,2022,행주,0.021968,0.016548,0.429032,0.003774,37.709677,17.322581,김포시 고촌읍 은행영사정로 197 (신곡수중보 관리사무소),2,37.609087,126.786320,19,0,0,0,0,0,0,0,0,0,0,0,1
2170,2171,2022,홍릉로,0.030806,0.011226,0.529032,0.003355,38.096774,21.548387,서울 동대문구 홍릉로 1 (청량리전철역 사거리 SC제일은행 앞),2,37.580453,127.044368,14,0,0,0,0,0,0,0,0,0,0,0,1


In [87]:
# 월자료가 순서가 없을 것이라 생각되어 변경.
test = pd.get_dummies(data = test, columns = ['month'], dtype = int)
test

Unnamed: 0,ID,year,location,no2,o3,co,so2,pm10,pm2.5,address,x,y,height,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
0,1,2023,강남구,0.030742,0.016665,0.538387,0.003255,46.548387,25.032258,서울 강남구 학동로 426 강남구청 별관 1동,37.517968,127.047060,29,1,0,0,0,0,0,0,0,0,0,0,0
1,2,2023,강남대로,0.034516,0.012352,0.748387,0.003581,50.548387,28.000000,서울 서초구 강남대로 201 서초구민회관 앞 중앙차로 (양재동),37.481828,127.035957,28,1,0,0,0,0,0,0,0,0,0,0,0
2,3,2023,강동구,0.024948,0.013658,0.502903,0.002639,49.483871,28.161290,서울 강동구 구천면로 42길 59 천호1동 주민센터,37.545004,127.136820,27,1,0,0,0,0,0,0,0,0,0,0,0
3,4,2023,강변북로,0.036358,0.012339,0.710000,0.002719,48.483871,28.806452,서울 성동구 강변북로 257 한강사업본부 옆,37.538962,127.041604,16,1,0,0,0,0,0,0,0,0,0,0,0
4,5,2023,강북구,0.023607,0.019165,0.613226,0.002629,48.935484,24.580645,서울 강북구 삼양로 139길 49 우이동 주민센터,37.647934,127.011870,43,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
530,531,2023,한강대로,0.033323,0.016275,0.661935,0.003065,41.838710,23.806452,서울 용산구 한강대로 405 (서울역 앞),37.554838,126.971733,26,0,0,0,0,0,0,0,0,0,0,0,1
531,532,2023,항동,0.026090,0.020835,0.618387,0.002713,32.225806,19.774194,서울 구로구 연동로 240 (푸른수목원 내),37.481968,126.823669,24,0,0,0,0,0,0,0,0,0,0,0,1
532,533,2023,행주,0.025223,0.019752,0.631905,0.002694,34.967742,17.903226,김포시 고촌읍 은행영사정로 197 (신곡수중보 관리사무소),37.609087,126.786320,19,0,0,0,0,0,0,0,0,0,0,0,1
533,534,2023,홍릉로,0.034510,0.014148,0.651935,0.003077,36.064516,19.032258,서울 동대문구 홍릉로 1 (청량리전철역 사거리 SC제일은행 앞),37.580453,127.044368,14,0,0,0,0,0,0,0,0,0,0,0,1


In [86]:
train.columns

Index(['ID', 'year', 'location', 'no2', 'o3', 'co', 'so2', 'pm10', 'pm2.5',
       'address', 'Y', 'x', 'y', 'height', 'month_1', 'month_2', 'month_3',
       'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9',
       'month_10', 'month_11', 'month_12'],
      dtype='object')

In [88]:
var4 =  ['year', 'no2', 'o3', 'co', 'so2', 'pm10', 'pm2.5', 'x', 'y', 'height', 'month_1', 'month_2', 'month_3',
       'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12']

In [89]:
# 특징 변수(X)와 타겟 변수(Y) 분리
X = train[var4]
y = train['Y']

# 데이터셋을 학습 세트와 테스트 세트로 분리 (80% 학습, 20% 테스트)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 데이터 정규화 (표준화)
Standard_Scaler = StandardScaler()
X_train = Standard_Scaler.fit_transform(X_train)
X_test = Standard_Scaler.transform(X_test)

# 로지스틱 회귀 모델 정의 및 학습
model = LogisticRegression()
model.fit(X_train, y_train)

# 테스트 데이터에 대한 예측
y_pred = model.predict(X_test)

# 모델 평가
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# 결과 출력
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 0.8988505747126436
Confusion Matrix:
[[ 46  23   0]
 [ 18 328   1]
 [  0   2  17]]
Classification Report:
              precision    recall  f1-score   support

           1       0.72      0.67      0.69        69
           2       0.93      0.95      0.94       347
           3       0.94      0.89      0.92        19

    accuracy                           0.90       435
   macro avg       0.86      0.84      0.85       435
weighted avg       0.90      0.90      0.90       435



In [99]:
# Essentials
import numpy as np
import pandas as pd
import random
import time
import gc
import os
from datetime import datetime

# Plots
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
figure(num=None, figsize=(20, 10), dpi=80, facecolor='w', edgecolor='k')
import seaborn as sns
# matplotlib 의 기본 scheme 말고
# seaborn scheme 을 세팅하여, 일일이 graph의 font size 지정할 필요 없이
# seaborn 의 font_scale 을 사용하는 것을 추천드립니다.
plt.style.use('seaborn-v0_8') # 이 위에 코드를 실행하면 여러 테마가 있으니 마음껏 바꾸셔도 됩니다.
sns.set(font_scale=2.5)
from matplotlib.pylab import rcParams
##set up the parameters
rcParams['figure.figsize'] = 80,60

# Models
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from ngboost import NGBClassifier
import lightgbm
import xgboost as xgb
import catboost
from xgboost import plot_tree

# Stats
from scipy.stats import skew, norm
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax


# Misc
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score
from dateutil import tz

# Ignore useless warnings
import warnings
warnings.filterwarnings(action="ignore")
pd.options.display.max_seq_items = 8000
pd.options.display.max_rows = 8000
pd.options.display.max_columns = 400
pd.set_option('display.max_colwidth', None)

<Figure size 1600x800 with 0 Axes>

In [111]:
def show_time(diff):
    m, s = divmod(diff, 60)
    h, m = divmod(m, 60)
    s, m, h = int(round(s, 0)), int(round(m, 0)), int(round(h, 0))
    print("Execution Time: " + "{0:02d}:{1:02d}:{2:02d}".format(h, m, s))

def Train(clf, X_train, y_train, X_test, y_test, task_type='classification'):
    # Train
    start = time.time()
    model = clf.fit(X_train, y_train)
    end = time.time()
    training_time = end - start
    print('Training time: ')
    show_time(training_time)

    # Predict
    start = time.time()
    y_pred = model.predict(X_test)
    end = time.time()
    prediction_time = end - start
    print('\nPrediction time: ')
    show_time(prediction_time)

    # Calculate score
    if task_type == 'classification':
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')  # for multi-class
        print(f"Accuracy: {accuracy:.4f}")
        print(f"F1 Score: {f1:.4f}")
        score = accuracy
    else:  # regression
        mse = mean_squared_error(y_test, y_pred)
        print(f"Mean Squared Error: {mse:.4f}")
        score = mse

    return model#, score, training_time, prediction_time

def plot_metric(model_scores, score='Accuracy'):
    # Set figure size
    rcParams['figure.figsize'] = 20, 15
    plt.figure()
    plt.bar(model_scores['Model'], height=model_scores[score])
    
    if score != 'Prediction Times':
        for i, v in enumerate(model_scores[score]):
            plt.text(i, v, f'{v:.4f}', ha='center', va='bottom')
    
    plt.xlabel('Model')
    plt.ylabel(score)
    plt.title(f'{score} Comparison')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

In [114]:
# 특징 변수(X)와 타겟 변수(Y) 분리
X = train[var4]
y = train['Y']-1

# 데이터셋을 학습 세트와 테스트 세트로 분리 (80% 학습, 20% 테스트)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 데이터 정규화 (표준화)
Standard_Scaler = StandardScaler()
X_train = Standard_Scaler.fit_transform(X_train)
X_test = Standard_Scaler.transform(X_test)
test_data = Standard_Scaler.transform(test[var4])

In [115]:
prediction_times = []
training_times = []
scores = []


xgboost = Train(XGBClassifier(n_estimators=50, max_depth=7), X_train, y_train, X_test, y_test)
lgb = Train(LGBMClassifier(n_estimators=50, max_depth=5, num_leaves = 50), X_train, y_train, X_test, y_test)
cat = Train(CatBoostClassifier(n_estimators=50, verbose=False, max_depth=7), X_train, y_train, X_test, y_test)

Training time: 
Execution Time: 00:00:00

Prediction time: 
Execution Time: 00:00:00
Accuracy: 0.9977
F1 Score: 0.9977
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001275 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1708
[LightGBM] [Info] Number of data points in the train set: 1737, number of used features: 22
[LightGBM] [Info] Start training from score -1.966853
[LightGBM] [Info] Start training from score -0.205737
[LightGBM] [Info] Start training from score -3.077888
Training time: 
Execution Time: 00:00:00

Prediction time: 
Execution Time: 00:00:00
Accuracy: 0.9954
F1 Score: 0.9954
Training time: 
Execution Time: 00:00:00

Prediction time: 
Execution Time: 00:00:00
Accuracy: 0.9931
F1 Score: 0.9931


In [116]:
xgb_test = pd.DataFrame(xgboost.predict(test[var4]))
xgb_test.to_csv('10.27_xgb.csv')
lgb_test = pd.DataFrame(lgb.predict(test[var4]))
lgb_test.to_csv('10.27_lgb.csv')
cat_test = pd.DataFrame(cat.predict(test[var4]))
cat_test.to_csv('10.27_cat.csv')

In [105]:
# 1. 데이터 확인
print("Unique values in y_train:", np.unique(y_train))
print("Unique values in y_test:", np.unique(y_test))

# 만약 클래스 레이블이 0, 1, 2라면 다음과 같이 수정:
y_train = y_train.astype(int)
y_test = y_test.astype(int)

# 2. 모델 설정 수정
from ngboost import NGBClassifier
from ngboost.learners import default_tree_learner
from ngboost.distns import k_categorical

k = len(np.unique(y_train))  # 고유한 클래스의 수
ngb = NGBClassifier(Dist=k_categorical(k), Base=default_tree_learner)

# 3. 모델 훈련 및 예측
def Train(clf, X_train, y_train, X_test, y_test, task_type='classification'):
    # Train
    start = time.time()
    model = clf.fit(X_train, y_train)
    end = time.time()
    training_time = end - start
    print('Training time: ')
    show_time(training_time)

    # Predict
    start = time.time()
    y_pred = model.predict(X_test)
    end = time.time()
    prediction_time = end - start
    print('\nPrediction time: ')
    show_time(prediction_time)

    # Calculate score
    if task_type == 'classification':
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        print(f"Accuracy: {accuracy:.4f}")
        print(f"F1 Score: {f1:.4f}")
        score = accuracy
    else:  # regression
        mse = mean_squared_error(y_test, y_pred)
        print(f"Mean Squared Error: {mse:.4f}")
        score = mse

    return model#, score, training_time, prediction_time

# 사용 예시
models = [('NGBoost', ngb)]
results = []
for name, clf in models:
    model, score, train_time, pred_time = Train(clf, X_train, y_train, X_test, y_test)
    results.append((name, score, train_time, pred_time))

model_scores = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Training Time', 'Prediction Time'])
plot_metric(model_scores, 'Accuracy')

Unique values in y_train: [0 1 2]
Unique values in y_test: [0 1 2]
[iter 0] loss=0.5844 val_loss=0.0000 scale=8.0000 norm=27.3137
[iter 100] loss=0.0123 val_loss=0.0000 scale=1.0000 norm=1.0729
[iter 200] loss=0.0073 val_loss=0.0000 scale=0.5000 norm=0.5334
[iter 300] loss=0.0062 val_loss=0.0000 scale=0.1250 norm=0.1332
[iter 400] loss=0.0059 val_loss=0.0000 scale=0.0625 norm=0.0666
Training time: 
Execution Time: 00:00:04

Prediction time: 
Execution Time: 00:00:00
Accuracy: 0.9977
F1 Score: 0.9977


TypeError: cannot unpack non-iterable NGBClassifier object