In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import pandas as pd
from PIL import Image

# Read the labelled samples, then turn each raw 'keywords' string into a list
# of labels: leading/trailing parentheses are stripped and the remainder is
# split on '/'.
df = pd.read_csv('scikit_test.csv').assign(
    keywords=lambda frame: frame['keywords'].str.strip('()').str.split('/')
)
df

Unnamed: 0,도로,큰도로,숲,공원,운동장,주거지역,물,건물,학교,keywords
0,23.27,71.5,3.75,0.0,0.0,0.01,1.48,0.0,0.0,[큰도로]
1,22.29,18.73,0.0,12.25,0.0,44.06,0.0,2.16,0.51,"[도로, 큰도로, 주거지역]"
2,2.75,5.98,67.78,0.09,0.0,12.02,0.0,4.96,6.43,[숲]
3,3.18,3.32,0.0,38.61,0.0,9.27,36.89,8.73,0.0,"[공원, 물]"
4,19.87,18.78,0.0,0.0,15.29,0.0,8.76,23.44,13.87,"[도로, 큰도로, 운동장, 건물, 학교]"
5,18.17,15.84,0.0,0.83,0.0,0.0,0.0,65.16,0.0,[건물]
6,46.77,12.53,0.0,0.0,0.0,0.0,28.6,12.1,0.0,"[도로, 물]"
7,50.5,0.0,0.0,20.03,4.33,0.0,0.0,19.16,5.99,"[도로, 공원, 건물]"
8,32.15,5.43,0.0,0.0,0.0,0.0,0.0,24.17,38.25,"[도로, 건물, 학교]"
9,33.35,37.79,0.0,0.0,0.0,20.21,0.0,8.65,0.0,"[도로, 큰도로, 주거지역]"


In [3]:
# Features: every column of df except the label lists
X = df.drop(columns='keywords')

# Encode the keyword lists as a binary indicator matrix (one column per label)
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['keywords'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training split
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

In [6]:
# Lookup table from an exact (r, g, b) pixel colour to its label.
# Three different colours all map to the same "큰도로" label.
keywords = {
    (255, 255, 255): "도로",
    (249, 178, 156): "큰도로",
    (252, 214, 164): "큰도로",
    (247, 250, 191): "큰도로",
    (173, 209, 158): "숲",
    (200, 250, 204): "공원",
    (170, 224, 203): "운동장",
    (224, 223, 223): "주거지역",
    (170, 211, 223): "물",
    (217, 208, 201): "건물",
    (255, 255, 229): "학교",
}

def select_img(image_path):
    """Return the per-label percentage profile of the image at ``image_path``.

    The pixel counts produced by ``count_pixel_colors`` are handed to
    ``color_ratio_measure`` and its result is returned unchanged.
    """
    return color_ratio_measure(count_pixel_colors(image_path))

# Count pixels per label
def count_pixel_colors(image_path):
    """Count the fully opaque pixels of an image, grouped by label.

    The image is converted to RGBA. A pixel is counted only when its alpha is
    255 and its (r, g, b) value is a key of the module-level ``keywords``
    mapping; it is tallied under the label ``keywords`` assigns to that colour.

    Args:
        image_path: Passed directly to ``Image.open``.

    Returns:
        dict mapping label -> pixel count. Only labels that occurred at least
        once are present, in order of first occurrence.
    """
    color_count = {}

    # The context manager releases the underlying file handle deterministically
    # instead of leaving it to implicit cleanup.
    with Image.open(image_path) as source:
        rgba = source.convert('RGBA')  # guarantees 4-tuples (r, g, b, a)
        for r, g, b, a in rgba.getdata():
            if a != 255:
                # Ignore transparent / semi-transparent pixels
                continue
            label = keywords.get((r, g, b))
            if label is None:
                # Colour is not one of the recognised ones
                continue
            color_count[label] = color_count.get(label, 0) + 1

    return color_count

# Convert label counts into percentages
def color_ratio_measure(color_count):
    """Convert per-label pixel counts into percentages of the counted total.

    Each label in ``color_count`` is printed as ``label: xx.xx%`` and stored in
    the result rounded to two decimals. The nine known labels are always
    present in the result (0 when absent from ``color_count``); any other key
    found in ``color_count`` is appended after them.

    Args:
        color_count: Mapping of label -> pixel count.

    Returns:
        dict mapping label -> percentage.
    """
    known_labels = ("도로", "큰도로", "숲", "공원", "운동장", "주거지역", "물", "건물", "학교")
    indexes = dict.fromkeys(known_labels, 0)

    total_pixels = sum(color_count.values())
    for label, pixel_count in color_count.items():
        share = pixel_count / total_pixels
        print(f"{label}: {share * 100:.2f}%")
        indexes[label] = round(share * 100, 2)

    return indexes

In [22]:
# Build a one-row feature frame from the colour percentages of the new image
new_data = pd.DataFrame(
    data=[select_img('image.png')],
)

# Predict the binary label-indicator row for that image
y_new = clf.predict(new_data)

# Decode the indicator row back into the original keyword names
predicted_labels = mlb.inverse_transform(y_new)
predicted_labels

도로: 25.92%
건물: 9.65%
큰도로: 10.44%
주거지역: 34.44%
공원: 3.18%
학교: 15.33%
물: 1.05%


[('도로', '주거지역')]