In [1]:
import pandas as pd

# Load the three weatherAUS practice-problem CSVs in a fixed order:
# training features, test features, training labels.
X_train, X_test, y_train = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/연습문제/weatherAUS_X_train.csv",
        "../data/연습문제/weatherAUS_X_test.csv",
        "../data/연습문제/weatherAUS_y_train.csv",
    )
)

In [2]:
### Drop the unneeded column

# Keep a copy of the test-set dates before dropping the column;
# they are written out next to the predictions at the end of the notebook.
Date = X_test["Date"].copy()

# "Date" is removed from the features and from the label frame alike.
X_train, X_test, y_train = (
    frame.drop(columns="Date") for frame in (X_train, X_test, y_train)
)

In [3]:
# Per-column missing-value counts. This is not the cell's last expression,
# so the notebook does not render it.
X_train.isna().sum()
# Printed summary of dtypes and non-null counts for every column.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11714 entries, 0 to 11713
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Location       11714 non-null  object 
 1   MinTemp        11658 non-null  float64
 2   MaxTemp        11662 non-null  float64
 3   Rainfall       11630 non-null  float64
 4   WindGustDir    11073 non-null  object 
 5   WindGustSpeed  11073 non-null  float64
 6   WindDir9am     10873 non-null  object 
 7   WindDir3pm     11127 non-null  object 
 8   WindSpeed9am   11633 non-null  float64
 9   WindSpeed3pm   11178 non-null  float64
 10  Humidity9am    11650 non-null  float64
 11  Humidity3pm    10970 non-null  float64
 12  Pressure9am    10503 non-null  float64
 13  Pressure3pm    10499 non-null  float64
 14  Cloud9am       6937 non-null   float64
 15  Cloud3pm       6115 non-null   float64
 16  Temp9am        11674 non-null  float64
 17  Temp3pm        10987 non-null  float64
 18  RainTo

In [4]:
### Drop columns with 500 or more missing values

# The columns to drop are decided on the training set only;
# the same columns are then removed from the test set.
na_counts = X_train.isnull().sum()
cond_na500 = na_counts.ge(500)
col_500 = X_train.columns[cond_na500]

X_train, X_test = X_train.drop(columns=col_500), X_test.drop(columns=col_500)

In [14]:
# Impute missing values. Every imputation statistic (means, modes) is learned
# from the training data only and then applied unchanged to the test data, so
# both sets are preprocessed consistently.

# Numeric-only frames
X_train_conti = X_train.select_dtypes(exclude="object").copy()
X_test_conti = X_test.select_dtypes(exclude="object").copy()

# Mean imputation with the training-set column means
train_means = X_train_conti.mean()
X_train_conti = X_train_conti.fillna(train_means)
X_test_conti = X_test_conti.fillna(train_means)

# Categorical-only frames
X_train_category = X_train.select_dtypes(include="object").copy()
X_test_category = X_test.select_dtypes(include="object").copy()

# Most-frequent-label imputation, one mode per categorical column.
# DataFrame.mode() returns one row per tied mode; the first row is used.
category_modes = X_train_category.mode().iloc[0]
# Kept as a scalar for backward compatibility: the most frequent "RainToday" label.
mode = category_modes["RainToday"]

# Filling with a Series matches on column labels, so each column gets its own mode.
X_train_category = X_train_category.fillna(category_modes)
X_test_category = X_test_category.fillna(category_modes)

# Reassemble: numeric columns first, then categorical columns.
X_train = pd.concat([X_train_conti, X_train_category], axis=1)
X_test = pd.concat([X_test_conti, X_test_category], axis=1)

In [15]:
# Check that no missing values remain after imputation (expect all zeros).
X_train.isna().sum()


MinTemp         0
MaxTemp         0
Rainfall        0
WindSpeed9am    0
Humidity9am     0
Temp9am         0
Location        0
RainToday       0
dtype: int64

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows as a validation set; the seed is fixed
# so the split is reproducible.
X_TRAIN, X_VAL, y_TRAIN, y_VAL = train_test_split(
    X_train, y_train, test_size=0.3, random_state=2024
)

# Preview the imputed feature frame.
X_train.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindSpeed9am,Humidity9am,Temp9am,Location,RainToday
0,20.4,37.6,0.0,0.0,46.0,26.1,Albury,No
1,20.9,33.6,0.4,9.0,54.0,24.8,Albury,No
2,18.4,23.1,2.2,11.0,62.0,21.8,Albury,Yes
3,17.3,23.7,15.6,9.0,74.0,19.2,Albury,Yes
4,15.5,22.9,6.8,6.0,92.0,17.2,Albury,Yes


In [19]:
# Encoding: one-hot encode the categorical (object-dtype) columns.

from sklearn.preprocessing import OneHotEncoder

X_TRAIN_cate = X_TRAIN.select_dtypes(include="object").copy()
X_VAL_cate = X_VAL.select_dtypes(include="object").copy()
X_TEST_cate = X_test.select_dtypes(include="object").copy()

# Sparse matrix: stores only the non-zero entries of a mostly-zero matrix.
# Dense matrix: stores every value, including the many zeros one-hot encoding produces.
# sparse_output=False returns a dense array so it can be concatenated with the scaled features.
# handle_unknown="ignore": the encoder is fit on the training split only, so a
# category that first appears in the validation or test data is encoded as
# all zeros instead of raising an error at transform time.
enc = OneHotEncoder(sparse_output=False, handle_unknown="ignore").fit(X_TRAIN_cate)

X_TRAIN_OH = enc.transform(X_TRAIN_cate)
X_VAL_OH = enc.transform(X_VAL_cate)
X_TEST_OH = enc.transform(X_TEST_cate)

In [21]:
# Scaling: standardize the numeric (non-object) columns.
from sklearn.preprocessing import StandardScaler

X_TRAIN_conti = X_TRAIN.select_dtypes(exclude="object").copy()
X_VAL_conti = X_VAL.select_dtypes(exclude="object").copy()
X_TEST_conti = X_test.select_dtypes(exclude="object").copy()

# The scaler is fit on the training split only; the same fitted scaler
# transforms the validation and test data.
scale = StandardScaler()
scale.fit(X_TRAIN_conti)

X_TRAIN_STD, X_VAL_STD, X_TEST_STD = (
    scale.transform(numeric_part)
    for numeric_part in (X_TRAIN_conti, X_VAL_conti, X_TEST_conti)
)

In [22]:
import numpy as np 

# Final feature matrices: one-hot columns first, then the standardized numeric columns.
# NOTE: this cell overwrites X_TRAIN / X_VAL / y_TRAIN / y_VAL with NumPy arrays,
# so it is not idempotent; re-run the split cell before running it again.
X_TRAIN = np.hstack([X_TRAIN_OH, X_TRAIN_STD])
X_VAL = np.hstack([X_VAL_OH, X_VAL_STD])

# Label encoding: "Yes" becomes 1, so it is the positive class for roc_curve.
label_map = {"No": 0, "Yes": 1}

# Map the labels, then flatten to 1-D arrays.
y_TRAIN = y_TRAIN["RainTomorrow"].map(label_map).values.ravel()
y_VAL = y_VAL["RainTomorrow"].map(label_map).values.ravel()


In [24]:
## Random forest

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_curve, auc

rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=3,
    min_samples_leaf=10,
    max_features="sqrt",
    random_state=2024,
)
rf.fit(X_TRAIN, y_TRAIN)

# Column 1 of predict_proba is the probability of class 1 ("Yes").
score_rf = rf.predict_proba(X_VAL)[:, 1]

# Validation ROC AUC
fpr, tpr, thresholds = roc_curve(y_VAL, score_rf)
auc_rf = auc(fpr, tpr)
print(auc_rf)

0.7368425612685312


In [28]:
# XGBoost
xgb = XGBClassifier(
    n_estimators=500,
    max_depth=8,
    min_child_weight=20,
    gamma=0.5,
    objective="binary:logistic",
    nthread=5,
    random_state=2024,
)
xgb.fit(X_TRAIN, y_TRAIN)

# Column 1 of predict_proba is the probability of class 1 ("Yes").
score_xgb = xgb.predict_proba(X_VAL)[:, 1]

# Validation ROC AUC
fpr, tpr, thresholds = roc_curve(y_VAL, score_xgb)
auc_xgb = auc(fpr, tpr)
print(auc_xgb)

0.7686860244923059


In [30]:
# LightGBM

lgb = LGBMClassifier(
    n_estimators=500,
    max_depth=8,
    min_child_weight=10,
    learning_rate=0.2,
    objective="binary",
    n_jobs=30,
    random_state=2024,
)
lgb.fit(X_TRAIN, y_TRAIN)

# Column 1 of predict_proba is the probability of class 1 ("Yes").
score_lgb = lgb.predict_proba(X_VAL)[:, 1]

# Validation ROC AUC
fpr, tpr, thresholds = roc_curve(y_VAL, score_lgb)
auc_lgb = auc(fpr, tpr)
print(auc_lgb)

[LightGBM] [Info] Number of positive: 1990, number of negative: 6209
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001260 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1227
[LightGBM] [Info] Number of data points in the train set: 8199, number of used features: 57
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.242713 -> initscore=-1.137865
[LightGBM] [Info] Start training from score -1.137865
0.7597515579832814


In [31]:
# Build the test matrix with the same column layout as training:
# one-hot columns first, then the standardized numeric columns.
X_TEST = np.hstack([X_TEST_OH, X_TEST_STD])

# Score the test set with the XGBoost model (probability of class 1).
y_score = xgb.predict_proba(X_TEST)[:, 1]

# Submission table: the saved test-set dates alongside the predicted probabilities.
obj = {"Date": Date, "RainTomorrow_prob": y_score}
result = pd.DataFrame(obj)

result.to_csv("연습문제3.csv", index=False)