# 과의존 데이터로 분류 학습 정확도 확인

In [1]:
import pandas as pd
import pymysql
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


## 테스트는 성인(DM1 == 3) 인 데이터로 진행(양이 가장 많기 때문에)

#### DB 접속정보 정의

In [2]:
# DB connection settings.
# Each value is taken from an environment variable of the same name so that
# real credentials never have to be written into the notebook; when a
# variable is not set, the original placeholder value is used.
import os

DB_HOST = os.environ.get('DB_HOST', '')
DB_PORT = int(os.environ.get('DB_PORT', '3306'))  # the driver expects an int port
DB_USER = os.environ.get('DB_USER', '')
DB_PASS = os.environ.get('DB_PASS', '')
DB_NAME = os.environ.get('DB_NAME', '')

#### DB 에서 데이터 조회하여 DataFrame 으로 변환

In [4]:
# Fetch every response_fit row that belongs to an adult respondent
# (DM1 = 3) and load the rows into a DataFrame.
sql = """
SELECT b.* 
  FROM respondent a
 INNER JOIN response_fit b on a.ID = b.ID 
 WHERE a.DM1 = 3
"""

# Open the DB connection; DictCursor returns each row as a dict.
conn = pymysql.connect(
    host=DB_HOST,
    port=DB_PORT,
    user=DB_USER,
    passwd=DB_PASS,
    db=DB_NAME,
    cursorclass=pymysql.cursors.DictCursor,
)
try:
    # The cursor is closed as soon as the rows have been fetched.
    with conn.cursor() as cur:
        cur.execute(sql)
        result = cur.fetchall()
finally:
    # Close the connection even when the query fails.  No commit is issued
    # because the statement is a read-only SELECT.
    conn.close()

df = pd.DataFrame(result)

#### feature 와 label 분리

In [10]:
# Split the query result: KK1 is the label, and every remaining column
# except ID is used as a feature.
df_X = df.drop(columns=['ID', 'KK1'])
df_y = df['KK1']

#### train set 와 test set 분할

In [14]:
# Split the raw NumPy arrays into train and test parts.  Neither test_size
# nor random_state is passed, so scikit-learn's defaults apply and the split
# (and therefore the accuracy below) varies from run to run.
X_train, X_test, y_train, y_test = train_test_split(df_X.values, df_y.values)

#### 정확도 계산결과 확인

In [15]:
# Fit a logistic-regression classifier (iteration cap raised to 4000) on the
# training split, then show its accuracy on the held-out split.
lr_c = LogisticRegression(max_iter=4000).fit(X_train, y_train)
pred = lr_c.predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred)

0.9987080103359173

### 다른 그룹 정확도를 확인하기 위해 함수를 만들어서 처리

In [16]:
def accuracy(dm1):
    """Print the test accuracy of a logistic-regression model for one group.

    Rows of ``response_fit`` whose respondent has ``DM1 == dm1`` are loaded
    from the database.  ``KK1`` is used as the label and every other column
    except ``ID`` as a feature.  The data is split at random into a train and
    a test set, a ``LogisticRegression`` model is fitted on the train set and
    its accuracy on the test set is printed.

    Args:
        dm1: Value of ``respondent.DM1`` that selects the group to evaluate.

    Returns:
        None.  The score is only printed.
    """
    # The group id is passed as a bound parameter instead of being formatted
    # into the SQL text, so the driver handles quoting and escaping.
    sql = """
    SELECT b.* 
      FROM respondent a
     INNER JOIN response_fit b on a.ID = b.ID 
     WHERE a.DM1 = %s
    """

    # Open the DB connection; DictCursor returns each row as a dict.
    conn = pymysql.connect(
        host=DB_HOST,
        port=DB_PORT,
        user=DB_USER,
        passwd=DB_PASS,
        db=DB_NAME,
        cursorclass=pymysql.cursors.DictCursor,
    )
    try:
        with conn.cursor() as cur:
            cur.execute(sql, (dm1,))
            result = cur.fetchall()
    finally:
        # Close the connection even when the query raises.  No commit is
        # issued because the statement is a read-only SELECT.
        conn.close()

    df = pd.DataFrame(result)

    # Separate features and label
    df_y = df['KK1']
    df_X = df.drop(['ID', 'KK1'], axis=1)

    # Group 1 has only nine items, so its Q3_10 column has to be removed
    if dm1 == 1:
        df_X = df_X.drop(['Q3_10'], axis=1)

    # Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(df_X.values, df_y.values)

    # Fit the model and compute the accuracy on the test set
    lr_c = LogisticRegression(max_iter=4000)
    lr_c.fit(X_train, y_train)
    pred = lr_c.predict(X_test)
    score = accuracy_score(y_test, pred)

    print('{} 그룹 정확도 : {}'.format(dm1, score))
    
    

In [17]:
# Report the accuracy for each of the four DM1 groups in turn.
for group in (1, 2, 3, 4):
    accuracy(group)

1 그룹 정확도 : 1.0
2 그룹 정확도 : 0.9885433715220949
3 그룹 정확도 : 0.9987080103359173
4 그룹 정확도 : 0.998003992015968


In [None]:
# ------------------------------------- 이 아래로 최초 작업 (initial exploratory work below) -----------------------------------------------

In [2]:
# Load the data file into a DataFrame
df = pd.read_csv('./data/data.csv')

- DM1 : 대상 기준

- A 그룹 : 1 : 유아동, 9문항
- B 그룹 : 2 : 청소년, 10문항
- C 그룹 : 3 : 성인, 10문항
- D 그룹 : 4 : 60대, 10문항

In [4]:
# Extraction test: keep only the adult respondents (DM1 == 3) and show them.
is_adult = df['DM1'] == 3
df_c = df.loc[is_adult]
df_c

Unnamed: 0,ID,DM1,DM2,DM3,DM4,KK1,KK2,WT,Q1A_1,Q1A_2,...,Q20_2,Q20_3,Q20_4,Q20_5,Q20_6,Q20_7,Q20_8,Q20_9,Q20_10,Q21
0,1,3,1,,1,3,,1581.361297,1,1,...,3.0,3.0,2.0,3.0,3.0,2.0,3.0,3.0,3.0,4.0
1,2,3,2,,1,3,,1506.828478,1,1,...,3.0,3.0,2.0,3.0,2.0,3.0,3.0,2.0,2.0,1.0
4,5,3,1,,1,3,,1590.508696,1,1,...,3.0,3.0,3.0,2.0,2.0,3.0,2.0,3.0,3.0,1.0
5,6,3,2,,1,3,,1602.580332,1,1,...,2.0,3.0,3.0,2.0,2.0,3.0,2.0,3.0,3.0,3.0
8,9,3,1,,1,3,,1590.508696,1,1,...,3.0,3.0,2.0,2.0,3.0,2.0,3.0,3.0,3.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28567,28651,3,2,,2,3,,988.783217,1,1,...,3.0,3.0,2.0,3.0,2.0,3.0,3.0,3.0,3.0,1.0
28569,28653,3,1,,1,3,,1284.716981,1,0,...,3.0,3.0,2.0,3.0,2.0,3.0,3.0,4.0,4.0,2.0
28570,28654,3,2,,1,3,,1165.215686,1,0,...,3.0,3.0,3.0,4.0,4.0,3.0,3.0,3.0,3.0,1.0
28573,28657,3,1,,3,2,,1485.539561,1,0,...,,,,,,,,,,


In [5]:
# Renumber the filtered rows from 0; the previous row labels are kept in a
# new 'index' column.
df_c = df_c.reset_index()

In [6]:
# Names of the ten answer columns for group C: Q3C_1 .. Q3C_10
l = ['Q3C_{}'.format(item_no) for item_no in range(1, 11)]
l

['Q3C_1',
 'Q3C_2',
 'Q3C_3',
 'Q3C_4',
 'Q3C_5',
 'Q3C_6',
 'Q3C_7',
 'Q3C_8',
 'Q3C_9',
 'Q3C_10']

In [8]:
 # feature
# Keep only the answer columns named in `l` as the feature table, then show it
df_c_X = df_c[l]
df_c_X

Unnamed: 0,Q3C_1,Q3C_2,Q3C_3,Q3C_4,Q3C_5,Q3C_6,Q3C_7,Q3C_8,Q3C_9,Q3C_10
0,2.0,3.0,3.0,3.0,2.0,1.0,2.0,2.0,3.0,2.0
1,2.0,3.0,3.0,3.0,2.0,1.0,1.0,2.0,3.0,2.0
2,2.0,3.0,3.0,3.0,3.0,1.0,2.0,2.0,2.0,2.0
3,2.0,2.0,2.0,3.0,1.0,2.0,2.0,3.0,3.0,2.0
4,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...
18570,2.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0,2.0,1.0
18571,2.0,3.0,2.0,3.0,2.0,3.0,1.0,2.0,1.0,3.0
18572,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0
18573,3.0,2.0,3.0,4.0,3.0,3.0,2.0,3.0,3.0,2.0


In [9]:
# The KK1 column is the classification target; show it for inspection
df_c_y = df_c['KK1'] # label
df_c_y

0        3
1        3
2        3
3        3
4        3
        ..
18570    3
18571    3
18572    3
18573    2
18574    3
Name: KK1, Length: 18575, dtype: int64

In [10]:
# Check what .values returns (a NumPy ndarray) before passing it to scikit-learn
type(df_c_X.values)

numpy.ndarray

In [67]:
# Random train/test split of the group C features and labels (scikit-learn
# defaults; no random_state is passed, so the split changes on every run)
X_train, X_test, y_train, y_test = train_test_split(df_c_X.values, df_c_y.values)

In [77]:
# Fit the group C classifier on the training split and show its accuracy on
# the held-out split.
lr_c = LogisticRegression(max_iter=4000).fit(X_train, y_train)
pred = lr_c.predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred)

0.9982773471145564

In [78]:
# One hand-entered response (ten answers) to check the group C model by hand.
yjc_test = [[3, 2, 2, 2, 2, 2, 1, 3, 1, 1]]
yjc_pred = lr_c.predict(yjc_test)
yjc_pred

array([3], dtype=int64)

In [4]:
# Rebuild the adult (DM1 == 3) subset and renumber its rows in one step,
# then show the result.
df_c = df[df['DM1'] == 3].reset_index()
df_c

Unnamed: 0,index,ID,DM1,DM2,DM3,DM4,KK1,KK2,WT,Q1A_1,...,Q20_2,Q20_3,Q20_4,Q20_5,Q20_6,Q20_7,Q20_8,Q20_9,Q20_10,Q21
0,0,1,3,1,,1,3,,1581.361297,1,...,3.0,3.0,2.0,3.0,3.0,2.0,3.0,3.0,3.0,4.0
1,1,2,3,2,,1,3,,1506.828478,1,...,3.0,3.0,2.0,3.0,2.0,3.0,3.0,2.0,2.0,1.0
2,4,5,3,1,,1,3,,1590.508696,1,...,3.0,3.0,3.0,2.0,2.0,3.0,2.0,3.0,3.0,1.0
3,5,6,3,2,,1,3,,1602.580332,1,...,2.0,3.0,3.0,2.0,2.0,3.0,2.0,3.0,3.0,3.0
4,8,9,3,1,,1,3,,1590.508696,1,...,3.0,3.0,2.0,2.0,3.0,2.0,3.0,3.0,3.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18570,28567,28651,3,2,,2,3,,988.783217,1,...,3.0,3.0,2.0,3.0,2.0,3.0,3.0,3.0,3.0,1.0
18571,28569,28653,3,1,,1,3,,1284.716981,1,...,3.0,3.0,2.0,3.0,2.0,3.0,3.0,4.0,4.0,2.0
18572,28570,28654,3,2,,1,3,,1165.215686,1,...,3.0,3.0,3.0,4.0,4.0,3.0,3.0,3.0,3.0,1.0
18573,28573,28657,3,1,,3,2,,1485.539561,1,...,,,,,,,,,,


----

In [79]:
# Extraction test for the young-children group (DM1 == 1)
df_a = df[df['DM1'] == 1]

In [80]:
# Group A answer columns: Q3A_1 .. Q3A_9 (nine items)
l = ['Q3A_{}'.format(item_no) for item_no in range(1, 10)]
df_a_y = df_a['KK1']  # label
df_a_X = df_a[l]  # feature

In [89]:
# Random train/test split of the group A features and labels (scikit-learn
# defaults; no random_state is passed, so the split changes on every run)
X_train, X_test, y_train, y_test = train_test_split(df_a_X.values, df_a_y.values)

In [90]:
# Fit the group A classifier on the training split and show its accuracy on
# the held-out split.
lr_a = LogisticRegression(max_iter=4000).fit(X_train, y_train)
pred = lr_a.predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred)

0.9974293059125964

In [91]:
# One hand-entered response (nine answers) to check the group A model by hand.
yjc_test = [[3, 3, 3, 3, 4, 4, 3, 2, 3]]
yjc_pred = lr_a.predict(yjc_test)
yjc_pred

array([2], dtype=int64)