# 🤷‍♀️Genetic Algorithm🤷‍♂️  

Genetic Algorithm은 차원 축소 기법 중 변수 선택법에 속하는 알고리즘이다.  
Genetic Algorithm은 자연 선택설을 기반으로 한 진화 알고리즘으로, 우수한 유전자가 다음 세대에서도 잘 발현될 수 있도록 학습해나간다.  
  
알고리즘은 다음 6가지 절차를 따른다.  
1. 염색체 초기화
2. 모델 학습
3. 적합도 평가
4. 부모 염색체 선택
5. 자식 염색체 생성
6. 최적 변수 집합 선택

In [12]:
# 필요한 라이브러리 불러옴

import pandas as pd
import random
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

활용 데이터 - Energy Efficiency dataset  
  
https://archive.ics.uci.edu/ml/datasets/Energy+efficiency  
  
We perform energy analysis using 12 different building shapes simulated in Ecotect. The buildings differ with respect to the glazing area, the glazing area distribution, and the orientation, amongst other parameters. We simulate various settings as functions of the afore-mentioned characteristics to obtain 768 building shapes. The dataset comprises 768 samples and 8 features, aiming to predict two real valued responses. It can also be used as a multi-class classification problem if the response is rounded to the nearest integer.  
  
X1 : Relative Compactness  
X2 : Surface Area  
X3 : Wall Area  
X4 : Roof Area  
X5 : Overall Height  
X6 : Orientation  
X7 : Glazing Area  
X8 : Glazing Area Distribution  
Y1 : Heating Load  

In [3]:
dataset = pd.read_csv('energy_efficiency.csv')
dataset = dataset.iloc[:768,:-1]
dataset

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,Y1
0,0.98,514.5,294.0,110.25,7.0,2.0,0.0,0.0,15.55
1,0.98,514.5,294.0,110.25,7.0,3.0,0.0,0.0,15.55
2,0.98,514.5,294.0,110.25,7.0,4.0,0.0,0.0,15.55
3,0.98,514.5,294.0,110.25,7.0,5.0,0.0,0.0,15.55
4,0.90,563.5,318.5,122.50,7.0,2.0,0.0,0.0,20.84
...,...,...,...,...,...,...,...,...,...
763,0.64,784.0,343.0,220.50,3.5,5.0,0.4,5.0,17.88
764,0.62,808.5,367.5,220.50,3.5,2.0,0.4,5.0,16.54
765,0.62,808.5,367.5,220.50,3.5,3.0,0.4,5.0,16.44
766,0.62,808.5,367.5,220.50,3.5,4.0,0.4,5.0,16.48


In [4]:
x = dataset.drop('Y1', axis=1)
y = dataset['Y1']

- 1단계 : 염색체 초기화

In [5]:
# 유전자마다 난수를 생성해서 이진 인코딩

generation = []

for j in range(6) : # population size = 6
    chromosome = []
    
    for i in range(8) : # 변수 개수
        a = random.random()
        a = round(a, 2)
        chromosome.append(a)
        
    generation.append(chromosome)

generation = pd.DataFrame(generation)
generation

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.86,0.11,0.72,0.9,0.81,0.96,0.61,0.93
1,0.75,0.56,0.43,0.09,0.83,0.85,0.62,0.8
2,0.96,0.36,0.3,0.2,0.13,0.8,0.97,0.29
3,0.14,0.9,0.5,0.83,0.53,0.71,0.4,0.7
4,0.98,0.58,0.4,0.08,0.37,0.67,0.03,0.41
5,0.81,0.45,0.07,0.05,0.66,0.52,0.22,0.71


In [6]:
# cut-off = 0.5

generation[generation > 0.5] = 1
generation[generation <= 0.5] = 0
generation = generation.astype('int')
generation = generation.values.tolist()
generation

[[1, 0, 1, 1, 1, 1, 1, 1],
 [1, 1, 0, 0, 1, 1, 1, 1],
 [1, 0, 0, 0, 0, 1, 1, 0],
 [0, 1, 0, 1, 1, 1, 0, 1],
 [1, 1, 0, 0, 0, 1, 0, 0],
 [1, 0, 0, 0, 1, 1, 0, 1]]

- 2단계 : 모델 학습

In [15]:
fitness = []

for k in range(6) :
    function_inputs = generation[k]

    col_data = []
    col = 0
    for i in function_inputs :
        if i == 1 :
            col_data.append(col)
        col += 1
    col_data

    new_x = x.iloc[:,col_data] # 각 염색체별로 사용하는 변수 데이터만 가져와서

    model = LinearRegression() # 다중 선형 회귀 모델로 분석하여
    model.fit(new_x, y)
    y_pred = model.predict(new_x)
    r2 = r2_score(y, y_pred)
    adj_r2 = 1-((768-1)*(1-r2))/(768-8-1) # 적합도 구함
    fitness.append(adj_r2)

In [16]:
fitness

[0.9153188366810866,
 0.9059949353984598,
 0.4543522773769625,
 0.8440887154833302,
 0.48549823117189017,
 0.8382838281900078]

- 3단계 : 적합도 평가

In [47]:
# 적합도의 weight(전체 중에서 비중) 구함

weight = []
for i in range(len(fitness)) :
    a = fitness[i]/sum(fitness)
    a = round(a, 3)
    weight.append(a)
weight

[0.206, 0.204, 0.102, 0.19, 0.109, 0.189]

In [48]:
fit_df = pd.DataFrame(zip(fitness, weight))
fit_df.columns = ['fitness', 'weight']
fit_df

Unnamed: 0,fitness,weight
0,0.915319,0.206
1,0.905995,0.204
2,0.454352,0.102
3,0.844089,0.19
4,0.485498,0.109
5,0.838284,0.189


In [49]:
# 적합도의 rank(전체 중에서 순위) 구함

fit_df['rank'] = fit_df['fitness'].rank(ascending=False)
fit_df

Unnamed: 0,fitness,weight,rank
0,0.915319,0.206,1.0
1,0.905995,0.204,2.0
2,0.454352,0.102,6.0
3,0.844089,0.19,3.0
4,0.485498,0.109,5.0
5,0.838284,0.189,4.0


- 4단계 : 부모 염색체 선택

In [50]:
# 확정적 선택
# N = 50(적합도가 상위 50% 이상인 것)
# rank 사용

parents_idx = fit_df[fit_df['rank']<=3.0].index.values.tolist()

generation_df = pd.DataFrame(generation)
parents = generation_df.iloc[parents_idx,:]
parents

Unnamed: 0,0,1,2,3,4,5,6,7
0,1,0,1,1,1,1,1,1
1,1,1,0,0,1,1,1,1
3,0,1,0,1,1,1,0,1


In [51]:
# 확률적 선택
# weight 사용

weight_sum = []
a = 0

for i in weight :
    a = a+i
    weight_sum.append(a)

weight_sum # 선택될 가능성이 가중치와 비례

[0.206, 0.41, 0.512, 0.702, 0.8109999999999999, 1.0]

In [52]:
rn = []
for i in range(2) :
    a = random.random() # 난수 생성
    a = round(a, 3)
    rn.append(a)
rn

[0.959, 0.533]

In [53]:
# 난수가 해당하는 범위의 염색체 선택

pa = []
for i in rn :
    if i>0 and i<weight_sum[0] :
        pa.append(0)
    elif i>weight_sum[0] and i<weight_sum[1] :
        pa.append(1)
    elif i>weight_sum[1] and i<weight_sum[2] :
        pa.append(2)
    elif i>weight_sum[2] and i<weight_sum[3] :
        pa.append(3)
    elif i>weight_sum[3] and i<weight_sum[4] :
        pa.append(4)
    elif i>weight_sum[4] and i<weight_sum[5] :
        pa.append(5)
pa

[5, 3]

In [55]:
generation_df = pd.DataFrame(generation)
parents = generation_df.iloc[pa,:]
parents

Unnamed: 0,0,1,2,3,4,5,6,7
5,1,0,0,0,1,1,0,1
3,0,1,0,1,1,1,0,1


- 5단계 : 자식 염색체 생성

In [57]:
# 교배
# crossover point = 1

cut = random.randint(0,7) # 교배 지점 선택
cut

6

In [80]:
child_1 = parents.iloc[0,:cut].tolist() + parents.iloc[1,cut:].tolist()
child_2 = parents.iloc[1,:cut].tolist() + parents.iloc[0,cut:].tolist()
child = pd.DataFrame([child_1, child_2])
child

Unnamed: 0,0,1,2,3,4,5,6,7
0,1,0,0,0,1,1,0,1
1,0,1,0,1,1,1,0,1


In [92]:
# 돌연변이
# mutation rate = 0.01

random_no = []

for j in range(2) :
    chromosome = []
    
    for i in range(8) :
        a = random.random() # 난수 생성
        a = round(a, 2)
        chromosome.append(a)
        
    random_no.append(chromosome)

random_no

[[0.26, 0.01, 0.5, 0.43, 0.13, 0.43, 0.61, 0.33],
 [0.8, 0.01, 0.94, 0.44, 0.19, 0.08, 0.5, 0.98]]

In [103]:
aaaa = []
for i in random_no :
    for j in i :
        aaaa.append(j)

bbbb = []
for i in child.values.tolist() :
    for j in i :
        bbbb.append(j)

m = []
n = 0
for i in aaaa :
    if i <= 0.01 : # 난수가 0.01보다 작은 위치 저장
        m.append(n)
    n += 1

for i in m : # 해당 위치 변수 반대로 바꿔줌
    if bbbb[i] == 0 :
        bbbb[i] = 1
    else :
        bbbb[i] = 0

In [104]:
final_child = [bbbb[:8],bbbb[8:]]
final_child

[[1, 1, 0, 0, 1, 1, 0, 1], [0, 0, 0, 1, 1, 1, 0, 1]]

- 6단계 : 최적 변수 집합 선택

In [106]:
# 학습 종료 후 가장 높은 적합도 보이는 조합 찾기

fitness = []

for k in range(2) :
    function_inputs = final_child[k]

    col_data = []
    col = 0
    for i in function_inputs :
        if i == 1 :
            col_data.append(col)
        col += 1
    col_data

    new_x = x.iloc[:,col_data] # 각 염색체별로 사용하는 변수 데이터만 가져와서

    model = LinearRegression() # 다중 선형 회귀 모델로 분석하여
    model.fit(new_x, y)
    y_pred = model.predict(new_x)
    r2 = r2_score(y, y_pred)
    adj_r2 = 1-((768-1)*(1-r2))/(768-8-1) # 적합도 구함
    fitness.append(adj_r2)

In [107]:
fitness

[0.839180423678257, 0.7967906519854066]

In [109]:
# 최적 변수 조합
if fitness[0] > fitness[1] :
    print(final_child[0])
else :
    print(final_child[1])

[1, 1, 0, 0, 1, 1, 0, 1]
