In [1]:
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the training set and the test set from CSV files in the working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('multi-train.csv', 'binary-test.csv')
)

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
# NOTE(review): the recorded output shows maxima of 109 for both Humidity and
# Precipitation (%) - presumably out-of-range values; confirm whether they need cleaning.
train.describe()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Atmospheric Pressure,UV Index,Season,Visibility (km),Location,Weather Type
count,2112.0,2112.0,2112.0,2112.0,2112.0,2112.0,2112.0,2112.0,2112.0,2112.0
mean,18.422348,69.244318,9.86411,54.307765,1005.887827,3.911932,2.463068,5.426136,1.11553,1.533144
std,17.320088,20.029442,6.990655,32.213802,36.385233,3.822173,1.636729,3.393991,0.787699,1.12541
min,-22.0,20.0,0.0,0.0,802.47,0.0,0.0,0.0,0.0,0.0
25%,3.0,58.0,5.0,20.0,993.96,1.0,1.0,3.0,0.0,1.0
50%,20.0,70.0,9.0,59.0,1007.275,3.0,3.0,5.0,1.0,2.0
75%,30.0,84.0,13.5,83.0,1016.575,7.0,4.0,7.5,2.0,3.0
max,97.0,109.0,46.5,109.0,1199.21,14.0,4.0,20.0,2.0,3.0


In [5]:
# Column dtypes; the recorded output shows every column is numeric (int64 / float64).
train.dtypes

Temperature               int64
Humidity                  int64
Wind Speed              float64
Precipitation (%)         int64
Atmospheric Pressure    float64
UV Index                  int64
Season                    int64
Visibility (km)         float64
Location                  int64
Weather Type              int64
dtype: object

In [6]:
# Count missing values per column; the recorded output shows zero for every column.
train.isnull().sum()

Temperature             0
Humidity                0
Wind Speed              0
Precipitation (%)       0
Atmospheric Pressure    0
UV Index                0
Season                  0
Visibility (km)         0
Location                0
Weather Type            0
dtype: int64

# 데이터 전처리

In [8]:
# Feature matrix: every column except the last two. Per the describe() output,
# those two are 'Location' and the 'Weather Type' target.
feature_columns = train.columns[:-2]
train_x = train[feature_columns]
train_x

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Atmospheric Pressure,UV Index,Season,Visibility (km)
0,11,96,6.0,69,990.04,3,0,1.0
1,-22,84,29.5,70,986.24,6,4,2.5
2,-6,70,18.5,83,999.96,0,4,3.0
3,24,38,3.5,16,1010.72,5,3,9.0
4,26,98,12.0,63,1008.53,3,4,2.5
...,...,...,...,...,...,...,...,...
2107,25,99,20.5,90,1004.14,0,4,2.5
2108,23,69,8.5,36,1015.73,1,1,9.0
2109,17,25,4.0,12,975.85,5,3,12.5
2110,1,98,2.0,81,980.59,0,4,2.0


In [9]:
# Target labels: the last column of the training frame.
train_y = train[train.columns[-1]]
train_y

0       2
1       3
2       3
3       0
4       2
       ..
2107    2
2108    1
2109    0
2110    3
2111    0
Name: Weather Type, Length: 2112, dtype: int64

In [10]:
# Test features: every column except the last two, mirroring the training selection.
test_x = test[test.columns[:-2]]
test_x

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Atmospheric Pressure,UV Index,Season,Visibility (km)
0,-7,62,16.5,51,996.48,0,4,3.5
1,-9,65,13.5,85,986.87,1,4,2.0
2,18,55,11.0,34,1003.46,1,1,7.0
3,-7,88,18.0,56,990.88,1,4,5.0
4,23,41,8.0,18,1027.83,7,3,9.5
...,...,...,...,...,...,...,...,...
523,11,67,15.0,88,1000.85,2,3,1.5
524,29,77,0.5,91,934.49,14,3,17.0
525,43,29,2.0,61,855.21,1,3,13.0
526,19,53,2.5,65,1191.95,5,3,17.5


In [11]:
# Test labels: the last column of the test frame.
test_y = test[test.columns[-1]]
test_y

0      1
1      1
2      1
3      1
4      0
      ..
523    1
524    0
525    1
526    1
527    0
Name: Weather Type, Length: 528, dtype: int64

# 이진 분류 처리

In [13]:
# Empty placeholder for the binarised labels. The dtype is given explicitly because
# pandas warns about (and has changed) the default dtype of an empty Series.
t_y = pd.Series(dtype='int64')

In [14]:
# Collapse the multi-class labels into a binary target: class 0 stays 0 and every
# other class becomes 1. The vectorised form works for any number of rows (the
# previous loop hard-coded 2112) and avoids growing a Series one element at a time.
# The result keeps train_y's index, is int64, and is left unnamed.
t_y = pd.Series((train_y != 0).to_numpy(dtype='int64'), index=train_y.index)


In [15]:
# Display the binarised labels (0 / 1) to check the conversion.
t_y

0       1
1       1
2       1
3       0
4       1
       ..
2107    1
2108    1
2109    0
2110    1
2111    0
Length: 2112, dtype: int64

In [16]:
# Display the original multi-class labels again, presumably for side-by-side
# comparison with the binarised t_y.
train_y

0       2
1       3
2       3
3       0
4       2
       ..
2107    2
2108    1
2109    0
2110    3
2111    0
Name: Weather Type, Length: 2112, dtype: int64

# 모델 구축하기

In [18]:
# Fit a random forest on the binarised training labels and predict the test set.
# RandomForestClassifier is already imported in the first cell, so it is not
# re-imported here.
# random_state pins the forest's bootstrap sampling and feature selection so the
# predictions and scores are reproducible across runs.
model = RandomForestClassifier(max_depth=100, random_state=42)
model.fit(train_x, t_y)
y_predict = model.predict(test_x)

In [19]:
# Print the predicted labels for the test rows (0 / 1 in the recorded output).
print(y_predict)

[1 1 1 1 0 0 1 1 0 1 1 1 1 0 1 1 1 0 1 1 0 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 0 1 0 1 0
 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 0 1 0 1 1 1 0 1 1 1 0 1 0 1 1 0 1 0 0 1 0 1
 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1
 1 1 1 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 0 0 0 1 0 1 1 1 1 1 1 0 1
 1 0 0 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 0 1 0
 1 1 1 1 1 1 0 1 1 1 1 1 0 0 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1
 1 1 0 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1
 0 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 0 1 0
 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 0 0 1 0 1 1 1 0 0 0 1 0 1 1 1 1 1 1 0
 1 1 0 1 0 1 0 1 0 1 1 0 1 1 1 0 1 0 0 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1
 0 1 1 1 1 1 1 0 1 0 1 1 1 1 0 0 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1
 1 1 1 1 0 1 0 1 1 1 1 1 0 0 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1
 1 1 0 1 1 1 1 1 1 1 0 1 

# f1_score

In [21]:
# Macro-averaged F1: the unweighted mean of the per-class F1 scores.
f1 = f1_score(y_true=test_y, y_pred=y_predict, average='macro')
print(f'f1 score = {f1}')

f1 score = 0.9290615957007027


# accuracy

In [23]:
# Accuracy: the fraction of test rows whose predicted label matches the true label.
acc = accuracy_score(y_true=test_y, y_pred=y_predict)
print(f'accuracy_score = {acc}')

accuracy_score = 0.9507575757575758
