In [1]:
# 필수 라이브러리 불러오기
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import statsmodels.api as sm

# Let pandas show up to 100 rows and 100 columns before it truncates a displayed frame.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

In [2]:
# Read the train and test tables from CSV files located in the working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '10.27_train_pop_loc_car_tax_rain.csv',
        '10.27_test_pop_loc_car_tax_rain.csv',
    )
)

In [3]:
# Print how many rows contain at least one missing value: train first, then test.
for frame in (train, test):
    print(int(frame.isna().any(axis=1).sum()))

99
11


In [4]:
# Replace missing values with 0 (the original note says the gaps are rainfall data).
# NOTE(review): fillna(0) is applied to every column, not only the rain columns —
# confirm that no other column is expected to contain NaN.
train, test = (frame.fillna(0) for frame in (train, test))

In [5]:
def create_time_dummies(train_df, test_df):
    """
    One-hot encode the ``month`` and ``year`` columns of the train and test frames.

    Each distinct value becomes a 0/1 integer column named ``month_<value>`` or
    ``year_<value>``; the original ``month`` and ``year`` columns are removed.
    The inputs are not modified.

    Parameters:
    train_df (pandas.DataFrame): training frame containing 'month' and 'year'
    test_df (pandas.DataFrame): test frame containing 'month' and 'year'

    Returns:
    tuple: (encoded training frame, encoded test frame). The test frame has
    exactly the training frame's columns, in the same order: columns that
    exist only in train are added to test filled with 0, and columns that
    exist only in test are dropped.
    """
    time_cols = ['month', 'year']

    def _encode(frame):
        # Build the month dummies first, then the year dummies, and append both
        # blocks after the remaining (non-time) columns.
        dummy_blocks = [
            pd.get_dummies(frame[col], prefix=col, dtype=int) for col in time_cols
        ]
        return pd.concat([frame.drop(columns=time_cols), *dummy_blocks], axis=1)

    train = _encode(train_df)
    test = _encode(test_df)

    # Give test the same column layout as train: absent columns are created
    # with 0, surplus columns disappear, and the order follows train.
    test = test.reindex(columns=train.columns, fill_value=0)

    return train, test

# Usage example that keeps the original frames under separate names:
# train_encoded, test_encoded = create_time_dummies(train, test)

# Actual call: rebind train/test to their dummy-encoded versions.
# NOTE: this cell is not re-runnable as-is — after the first run 'month' and 'year'
# no longer exist in train/test, so a second call raises a KeyError.
train, test = create_time_dummies(train, test)

# Checks for the usage example above (kept commented out):
# print("Train shape:", train_encoded.shape)
# print("Test shape:", test_encoded.shape)
# print("\nDummy columns:", [col for col in train_encoded.columns if 'month_' in col or 'year_' in col])

In [6]:
# Display the encoded training frame; the month_* and year_* dummy columns come last.
train

Unnamed: 0.1,Unnamed: 0,ID,location,no2,o3,co,so2,pm10,pm2.5,address,Y,pop_men<10s,pop_men10s,pop_men20s,pop_men30s,pop_men40s,pop_men50s,pop_men60s,pop_men>70s,pop_women<10s,pop_women10s,pop_women20s,pop_women30s,pop_women40s,pop_women50s,pop_women60s,pop_women>70s,x,y,height,road,tree,building,river,car_sedan,car_van,car_twowheels,car_special,car_truck,gu,tax,rain_count,total_rain,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,year_2019,year_2020,year_2021,year_2022
0,0,1,강남구,0.037032,0.009387,0.712903,0.006323,61.774194,44.161290,서울 강남구 학동로 426 강남구청 별관 1동,3,927.05,602.94,921.76,1214.63,1347.42,1085.88,798.85,1243.61,904.53,570.95,1067.53,1294.37,1278.06,993.79,998.26,1822.24,37.517968,127.047060,29,1,0,1,0,211579,6337,16766,402,16506,강남구,3457856.00,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,1,2,강남대로,0.054645,0.007677,0.980645,0.005484,71.677419,40.322581,서울 서초구 강남대로 201 서초구민회관 앞 중앙차로 (양재동),3,926.62,527.53,856.40,1037.30,1205.93,988.93,776.57,1258.34,1009.66,576.10,913.66,1123.37,1192.08,943.84,957.08,1899.12,37.481828,127.035957,28,1,1,1,0,159548,5206,13006,345,17140,서초구,2010972.00,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,2,3,강동구,0.044645,0.009516,0.709677,0.005065,65.161290,41.612903,서울 강동구 구천면로 42길 59 천호1동 주민센터,3,1050.55,554.83,786.90,913.87,1016.95,959.38,918.19,1278.12,952.07,492.74,820.04,974.87,1050.13,1076.58,1167.34,1906.92,37.545004,127.136820,27,0,0,1,0,115009,4803,16276,288,15538,강동구,703426.00,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,3,4,강변북로,0.055290,0.007548,0.638710,0.004677,71.290323,40.483871,서울 성동구 강변북로 257 한강사업본부 옆,3,715.98,308.00,939.68,887.38,800.37,683.94,600.66,837.32,639.90,285.11,1028.09,940.74,730.54,690.30,741.79,1422.25,37.538962,127.041604,16,1,1,1,1,90341,4029,20522,178,11194,성동구,582273.00,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,4,5,강북구,0.030645,0.017452,0.700000,0.004000,71.387097,39.387097,서울 강북구 삼양로 139길 49 우이동 주민센터,3,794.49,437.98,809.63,770.49,899.03,851.66,797.36,1481.04,732.07,402.66,981.70,778.18,888.32,989.71,1032.93,2411.36,37.647934,127.011870,43,0,1,1,0,63859,4032,21754,156,9371,강북구,234701.00,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2167,2167,2168,한강대로,0.031645,0.014355,0.596774,0.003645,42.741935,16.677419,서울 용산구 한강대로 405 (서울역 앞),2,468.63,228.55,500.54,721.55,721.81,608.52,425.67,576.01,448.59,217.23,626.92,792.36,666.91,584.50,485.36,980.87,37.554838,126.971733,26,1,0,1,0,66976,2176,15525,155,6198,용산구,1239146.60,7.0,14.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
2168,2168,2169,항동,0.015645,0.017355,0.525806,0.003742,31.225806,12.935484,서울 구로구 연동로 240 (푸른수목원 내),2,912.11,402.48,631.74,1154.68,1140.08,999.59,806.17,1085.31,896.17,387.86,835.56,1122.76,964.10,936.79,918.54,1604.72,37.481968,126.823669,24,1,1,1,0,126143,5130,14950,462,16924,구로구,695321.22,8.0,20.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
2169,2169,2170,행주,0.021968,0.016548,0.429032,0.003774,37.709677,17.322581,김포시 고촌읍 은행영사정로 197 (신곡수중보 관리사무소),2,1057.00,1344.00,1624.00,2079.00,2740.00,2675.00,2456.00,1536.00,1002.00,1235.00,1360.00,1782.00,2149.00,2236.00,2287.00,2002.00,37.609087,126.786320,19,0,1,0,1,0,0,0,0,0,고촌읍,0.00,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
2170,2170,2171,홍릉로,0.030806,0.011226,0.529032,0.003355,38.096774,21.548387,서울 동대문구 홍릉로 1 (청량리전철역 사거리 SC제일은행 앞),2,756.86,377.90,1163.09,794.58,896.45,817.16,676.90,1052.10,738.11,370.80,1796.19,832.40,844.57,796.97,762.11,1688.38,37.580453,127.044368,14,0,0,1,0,83882,3087,24240,265,12213,동대문구,535952.20,11.0,19.5,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1


In [8]:
# Population columns to rescale: one per sex ('men', 'women') and age band.
columns_to_modify = [
    f'pop_{sex}{age_band}'
    for sex in ('men', 'women')
    for age_band in ('<10s', '10s', '20s', '30s', '40s', '50s', '60s', '>70s')
]

# For rows whose 'location' is '행주', divide the population columns by 30 in both frames.
# NOTE(review): the reason for the factor 30 is not visible here — confirm and document it.
# NOTE: the frames are modified in place, so running this cell twice divides twice.
for frame in (train, test):
    is_haengju = frame['location'] == '행주'
    frame.loc[is_haengju, columns_to_modify] = frame.loc[is_haengju, columns_to_modify].div(30)

In [9]:
# Full predictor list, in the original order: month dummies, pollutant columns,
# population columns (men, then women, by age band), then the remaining columns.
var = (
    [f'month_{m}' for m in range(1, 13)]
    + ['no2', 'o3', 'co', 'so2', 'pm10', 'pm2.5']
    + [
        f'pop_{sex}{age_band}'
        for sex in ('men', 'women')
        for age_band in ('<10s', '10s', '20s', '30s', '40s', '50s', '60s', '>70s')
    ]
    + ['x', 'y', 'height']
    + ['road', 'tree', 'building', 'river']
    + ['car_sedan', 'car_van', 'car_twowheels', 'car_special', 'car_truck']
    + ['tax']
    + ['rain_count', 'total_rain']
)
# Target column.
tar = ['Y']

In [10]:
# Print the column names, non-null counts and dtypes of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2172 entries, 0 to 2171
Data columns (total 59 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     2172 non-null   int64  
 1   ID             2172 non-null   int64  
 2   location       2172 non-null   object 
 3   no2            2172 non-null   float64
 4   o3             2172 non-null   float64
 5   co             2172 non-null   float64
 6   so2            2172 non-null   float64
 7   pm10           2172 non-null   float64
 8   pm2.5          2172 non-null   float64
 9   address        2172 non-null   object 
 10  Y              2172 non-null   int64  
 11  pop_men<10s    2172 non-null   float64
 12  pop_men10s     2172 non-null   float64
 13  pop_men20s     2172 non-null   float64
 14  pop_men30s     2172 non-null   float64
 15  pop_men40s     2172 non-null   float64
 16  pop_men50s     2172 non-null   float64
 17  pop_men60s     2172 non-null   float64
 18  pop_men>

In [22]:
# Baseline feature set: the twelve month dummies followed by the six pollutant columns.
base_vars = [f'month_{m}' for m in range(1, 13)] + ['no2', 'o3', 'co', 'so2', 'pm10', 'pm2.5']
# base_vars = ['no2', 'o3', 'co', 'so2', 'pm10', 'pm2.5']

# Optional feature groups that are added on top of the baseline (Group 1 .. Group 6).
additional_vars = [
    # Group 1: population columns (men, then women, by age band)
    [
        f'pop_{sex}{age_band}'
        for sex in ('men', 'women')
        for age_band in ('<10s', '10s', '20s', '30s', '40s', '50s', '60s', '>70s')
    ],
    # Group 2: x, y and height
    ['x', 'y', 'height'],
    # Group 3: road / tree / building / river columns
    ['road', 'tree', 'building', 'river'],
    # Group 4: car_* columns
    ['car_sedan', 'car_van', 'car_twowheels', 'car_special', 'car_truck'],
    # Group 5: tax
    ['tax'],
    # Group 6: rain columns
    ['rain_count', 'total_rain'],
]

def evaluate_model(features):
    """
    Fit a standardized logistic regression on the chosen columns of the global
    ``train`` frame and score it on a hold-out split.

    The rows are split 80/20 with ``random_state=42``; the scaler is fitted on
    the 80% part only and then applied to both parts.

    Parameters:
    features (list): names of the ``train`` columns to use as predictors

    Returns:
    tuple: (classification report as a dict, confusion matrix), both computed
    on the 20% hold-out rows with 'Y' as the target
    """
    predictors, target = train[features], train['Y']
    fit_X, holdout_X, fit_y, holdout_y = train_test_split(
        predictors, target, test_size=0.2, random_state=42
    )

    # Learn the scaling from the fitting rows only, then reuse it for the hold-out rows.
    scaler = StandardScaler().fit(fit_X)
    fit_X = scaler.transform(fit_X)
    holdout_X = scaler.transform(holdout_X)

    classifier = LogisticRegression(max_iter=1000).fit(fit_X, fit_y)
    predicted = classifier.predict(holdout_X)

    return (
        classification_report(holdout_y, predicted, output_dict=True),
        confusion_matrix(holdout_y, predicted),
    )

# One summary row (label, accuracy, macro-averaged F1) per evaluated feature set.
results = []

# 1) Baseline: base variables only.
base_report, base_conf_matrix = evaluate_model(base_vars)
results.append({
    'features': 'Base variables',
    'accuracy': base_report['accuracy'],
    'macro_avg_f1': base_report['macro avg']['f1-score']
})

# 2) Baseline plus exactly one additional group at a time (groups are numbered from 1).
for i, add_vars in enumerate(additional_vars, start=1):
    features = [*base_vars, *add_vars]
    report, conf_matrix = evaluate_model(features)
    results.append({
        'features': f'Base + Group {i}',
        'accuracy': report['accuracy'],
        'macro_avg_f1': report['macro avg']['f1-score']
    })

# 3) Baseline plus every additional group.
all_vars = list(base_vars)
for group in additional_vars:
    all_vars.extend(group)
all_report, all_conf_matrix = evaluate_model(all_vars)
results.append({
    'features': 'All variables',
    'accuracy': all_report['accuracy'],
    'macro_avg_f1': all_report['macro avg']['f1-score']
})

# Print three lines per feature set, followed by a blank separator line.
for result in results:
    print(
        f"Features: {result['features']}",
        f"Accuracy: {result['accuracy']:.4f}",
        f"Macro Avg F1-score: {result['macro_avg_f1']:.4f}",
        "",
        sep="\n",
    )

Features: Base variables
Accuracy: 0.8851
Macro Avg F1-score: 0.8260

Features: Base + Group 1
Accuracy: 0.8920
Macro Avg F1-score: 0.8439

Features: Base + Group 2
Accuracy: 0.8897
Macro Avg F1-score: 0.8257

Features: Base + Group 3
Accuracy: 0.8851
Macro Avg F1-score: 0.8260

Features: Base + Group 4
Accuracy: 0.8897
Macro Avg F1-score: 0.8399

Features: Base + Group 5
Accuracy: 0.8874
Macro Avg F1-score: 0.8300

Features: Base + Group 6
Accuracy: 0.8920
Macro Avg F1-score: 0.8279

Features: All variables
Accuracy: 0.8943
Macro Avg F1-score: 0.8497



In [49]:
import itertools
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Reuses evaluate_model, base_vars and additional_vars defined in the earlier cells.

# 모든 가능한 그룹 조합 생성 및 평가 함수
def evaluate_group_combinations(base_vars, additional_vars):
    """
    Evaluate the base feature set together with every non-empty combination
    of the additional feature groups.

    Each feature set is scored with ``evaluate_model``. The returned list holds,
    in order: the base-only result, one result per combination (by increasing
    combination size, groups labelled with 1-based numbers), and finally an
    'All variables' result that uses every group.

    Parameters:
    base_vars (list): column names that are always included
    additional_vars (list of lists): optional groups of column names

    Returns:
    list of dict: one dict per feature set with the keys 'features' (label),
    'accuracy' and 'macro_avg_f1'
    """
    def _score(label, feature_list):
        # Only the report is needed here; the confusion matrix is discarded.
        report, _ = evaluate_model(feature_list)
        return {
            'features': label,
            'accuracy': report['accuracy'],
            'macro_avg_f1': report['macro avg']['f1-score']
        }

    n_groups = len(additional_vars)
    results = [_score('Base variables', base_vars)]

    # Walk through the combinations of group indices, smallest combinations first.
    for size in range(1, n_groups + 1):
        for picked in itertools.combinations(range(n_groups), size):
            feature_list = base_vars + [
                column for idx in picked for column in additional_vars[idx]
            ]
            group_names = [str(idx + 1) for idx in picked]
            results.append(_score(f"Base + Groups {', '.join(group_names)}", feature_list))

    # Closing entry with every group included.
    everything = base_vars + [column for group in additional_vars for column in group]
    results.append(_score('All variables', everything))

    return results

# Score the baseline, every combination of additional groups, and the full feature set.
results = evaluate_group_combinations(base_vars, additional_vars)

# Print three lines per feature set, followed by a blank separator line.
for result in results:
    print(
        f"Features: {result['features']}",
        f"Accuracy: {result['accuracy']:.4f}",
        f"Macro Avg F1-score: {result['macro_avg_f1']:.4f}",
        "",
        sep="\n",
    )

Features: Base variables
Accuracy: 0.8943
Macro Avg F1-score: 0.8463

Features: Base + Groups 1
Accuracy: 0.9057
Macro Avg F1-score: 0.8679

Features: Base + Groups 2
Accuracy: 0.8989
Macro Avg F1-score: 0.8493

Features: Base + Groups 3
Accuracy: 0.8920
Macro Avg F1-score: 0.8427

Features: Base + Groups 4
Accuracy: 0.8943
Macro Avg F1-score: 0.8494

Features: Base + Groups 5
Accuracy: 0.8989
Macro Avg F1-score: 0.8507

Features: Base + Groups 6
Accuracy: 0.9011
Macro Avg F1-score: 0.8515

Features: Base + Groups 1, 2
Accuracy: 0.8989
Macro Avg F1-score: 0.8539

Features: Base + Groups 1, 3
Accuracy: 0.8989
Macro Avg F1-score: 0.8493

Features: Base + Groups 1, 4
Accuracy: 0.8966
Macro Avg F1-score: 0.8532

Features: Base + Groups 1, 5
Accuracy: 0.9034
Macro Avg F1-score: 0.8643

Features: Base + Groups 1, 6
Accuracy: 0.9034
Macro Avg F1-score: 0.8629

Features: Base + Groups 2, 3
Accuracy: 0.8943
Macro Avg F1-score: 0.8433

Features: Base + Groups 2, 4
Accuracy: 0.9034
Macro Avg F1-s

In [23]:
# Predictors: base variables plus additional groups 2, 4 and 6 (list indices 1, 3, 5).
# Target: 'Y'.
X = train[[*base_vars, *additional_vars[1], *additional_vars[3], *additional_vars[5]]]
y = train['Y']

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the scaling from the training part only and apply it to both parts.
Standard_Scaler = StandardScaler().fit(X_train)
X_train = Standard_Scaler.transform(X_train)
X_test = Standard_Scaler.transform(X_test)

# Logistic regression with the library's default settings
# (no max_iter is passed here, unlike the other model cell that uses max_iter=1000).
model1 = LogisticRegression().fit(X_train, y_train)

# Predict the held-out rows and compute the evaluation metrics.
y_pred = model1.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Report accuracy, confusion matrix and per-class metrics.
print(
    f"Accuracy: {accuracy}",
    "Confusion Matrix:",
    conf_matrix,
    "Classification Report:",
    class_report,
    sep="\n",
)

Accuracy: 0.8919540229885058
Confusion Matrix:
[[ 41  28   0]
 [ 16 330   1]
 [  0   2  17]]
Classification Report:
              precision    recall  f1-score   support

           1       0.72      0.59      0.65        69
           2       0.92      0.95      0.93       347
           3       0.94      0.89      0.92        19

    accuracy                           0.89       435
   macro avg       0.86      0.81      0.83       435
weighted avg       0.89      0.89      0.89       435



In [17]:
# Predictors: base variables plus additional groups 2, 4, 5 and 6 (list indices 1, 3, 4, 5).
# Target: 'Y'.
X = train[[
    *base_vars,
    *additional_vars[1],
    *additional_vars[3],
    *additional_vars[4],
    *additional_vars[5],
]]
y = train['Y']

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the scaling from the training part only and apply it to both parts.
Standard_Scaler = StandardScaler().fit(X_train)
X_train = Standard_Scaler.transform(X_train)
X_test = Standard_Scaler.transform(X_test)

# Logistic regression with the iteration limit raised to 1000.
model1 = LogisticRegression(max_iter = 1000).fit(X_train, y_train)

# Predict the held-out rows and compute the evaluation metrics.
y_pred = model1.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Report accuracy, confusion matrix and per-class metrics.
print(
    f"Accuracy: {accuracy}",
    "Confusion Matrix:",
    conf_matrix,
    "Classification Report:",
    class_report,
    sep="\n",
)

Accuracy: 0.9080459770114943
Confusion Matrix:
[[ 48  21   0]
 [ 16 330   1]
 [  0   2  17]]
Classification Report:
              precision    recall  f1-score   support

           1       0.75      0.70      0.72        69
           2       0.93      0.95      0.94       347
           3       0.94      0.89      0.92        19

    accuracy                           0.91       435
   macro avg       0.88      0.85      0.86       435
weighted avg       0.91      0.91      0.91       435



In [21]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# X is the predictor DataFrame; add an intercept column before computing the VIFs.
X_with_constant = add_constant(X)

# One VIF per column of the design matrix (the added constant included).
# NOTE(review): X contains all twelve month_* dummies next to the constant, which
# presumably makes them perfectly collinear and explains the 'inf' values and the
# divide-by-zero warnings in the output — confirm, and drop one dummy if finite
# VIFs are wanted.
design_matrix = X_with_constant.values
vif_data = pd.DataFrame({
    "Variable": X_with_constant.columns,
    "VIF": [
        variance_inflation_factor(design_matrix, col_idx)
        for col_idx in range(X_with_constant.shape[1])
    ],
})

print(vif_data)

  return 1 - self.ssr/self.centered_tss
  vif = 1. / (1. - r_squared_i)


         Variable       VIF
0           const  0.000000
1         month_1       inf
2         month_2       inf
3         month_3       inf
4         month_4       inf
5         month_5       inf
6         month_6       inf
7         month_7       inf
8         month_8       inf
9         month_9       inf
10       month_10       inf
11       month_11       inf
12       month_12       inf
13            no2  3.692971
14             o3  5.509534
15             co  2.527093
16            so2  1.785822
17           pm10  7.595816
18          pm2.5  7.557395
19              x  1.941942
20              y  1.580256
21         height  1.830537
22      car_sedan  8.349622
23        car_van  6.487287
24  car_twowheels  2.027731
25    car_special  2.151096
26      car_truck  8.327805
27            tax  3.012281
28     rain_count  3.005999
29     total_rain  1.567169


In [18]:
# Regression coefficients and the corresponding odds ratios (exp of each coefficient).
# NOTE(review): only the first row of model1.coef_ is used; with the three target
# classes seen in the evaluation output this is presumably the row of the first
# class only — confirm that this is intended.
coefficients = model1.coef_[0, :]
odds_ratios = np.exp(coefficients)

print("Coefficients:", coefficients)
print("Odds Ratios:", odds_ratios)

Coefficients: [-0.34863419 -0.13690264 -0.13783866 -0.06326992 -0.18194778  0.30150983
  0.27794034  0.23247835  0.36730292  0.38997634 -0.19377334 -0.43585785
 -0.97356885 -1.59585226  0.14473139 -0.21888975 -1.24650219 -2.63316914
 -0.0564055   0.06858579 -0.4026218   0.07731212 -0.32133832  0.01231649
  0.02068237  0.08198437  0.10237255  0.59269397 -0.02136347]
Odds Ratios: [0.70565122 0.87205513 0.87123925 0.93869006 0.83364487 1.3518984
 1.32040742 1.26172313 1.44383522 1.47694585 0.82384462 0.64670965
 0.37773256 0.20273567 1.15572908 0.80341029 0.28750869 0.0718504
 0.94515579 1.0709925  0.6685649  1.08037923 0.72517787 1.01239265
 1.02089773 1.08543884 1.1077961  1.80885486 0.97886311]


In [58]:
# Score the test set with model1 and write the predictions to CSV.

# Select exactly the columns model1 was trained on. X is the training feature frame
# built in the same cell that fits Standard_Scaler and model1, so taking its columns
# keeps the feature set (and its order) in sync with the model instead of relying on
# a separately maintained list that can disagree with the last fitted model.
test_features = test[X.columns]

# Reuse the scaler fitted on the training split. Calling fit_transform here would
# re-estimate mean/scale from the test rows, so model1 would receive inputs on a
# different scale than the one it was trained with (and the scaler would be clobbered).
test_data = Standard_Scaler.transform(test_features)
test_pred = model1.predict(test_data)

# NOTE(review): the file name says "246"; it matches the model only when the last
# fitted model1 uses groups 2, 4 and 6 — rename if a different feature set was used.
temp_df = pd.DataFrame(test_pred)
temp_df.to_csv('10.27_logistic_246_var.csv')