In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
data = pd.read_csv('/content/drive/MyDrive/data.csv')

In [None]:
data.head()

Unnamed: 0,learnerID,assessmentItemID,answerCode,theta,difficultyLevel,discriminationLevel,guessLevel,knowledgeTag,gender,grade
0,A060000034,A060014001,1,0.574639,-1.479204,2.693081,3.4e-09,7229,M,6
1,A060000034,A060014002,1,0.574639,-1.347461,4.063459,3.62e-08,7229,M,6
2,A060000034,A060014003,1,0.574639,0.566275,1.834214,9.699e-07,7229,M,6
3,A060000034,A060014004,1,0.574639,-0.361558,1.263308,8.77e-08,7229,M,6
4,A060000034,A060014005,0,0.574639,0.955883,2.052489,1.7e-08,7229,M,6


### **DATA**
[시간 순 정렬된 데이터]
- learnerID : 학생 ID
- assessmentItemID : 문제 ID
- answerCode : 채점결과 (0:틀림, 1:맞음)
- theta : testID 에 대한 응시자의 능력 수준 (-5~5)
- difficultyLevel : 문제 난이도 (-5~5)
- discriminationLevel : 변별도 (0~∞)
- guessLevel : 추측도 (0~1)
- knowledgeTag : 지식체계번호
- gender : 성별 (M/F)
- grade : 학년 (1~9)

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout
from keras.preprocessing.sequence import pad_sequences

data = pd.read_csv('/content/drive/MyDrive/data.csv')

# 데이터 전처리
data['learnerID'] = data['learnerID'].str[-9:]
data['assessmentItemID'] = data['assessmentItemID'].str[-9:]
data['learnerID'] = data['learnerID'].astype(int)
data['assessmentItemID'] = data['assessmentItemID'].astype(int)
data = data.drop(columns=['theta'])
data.replace({'gender':{'F' : 1, 'M' : 0}}, inplace = True)
data = data[data.groupby('learnerID')['learnerID'].transform('count') >= 20]


# 시간 순서 정렬 후, train, validation, test 셋 분리
# 각 사용자의 마지막 행을 기준으로 테스트 세트 구성
test = data.groupby('learnerID').tail(1)

# 테스트 세트를 제외한 데이터로부터 검증 세트 구성
data2 = data.drop(test.index)
val = data2.groupby('learnerID').tail(1)

# 검증 세트와 테스트 세트를 제외한 데이터가 훈련 세트가 됨
train = data2.drop(val.index)


In [None]:
test.head()

Unnamed: 0,learnerID,assessmentItemID,answerCode,difficultyLevel,discriminationLevel,guessLevel,knowledgeTag,gender,grade
3324,60000620,60139007,0,0.868008,1.229088,0.0,1450,0,6
6842,20000149,20036009,1,-0.692984,2.127067,2.953006e-46,7696,0,2
8839,50000284,50010007,1,0.653254,2.407144,2.8512960000000004e-28,2619,1,5
10349,70000196,70004009,0,1.321834,3.438357,0.003172092,3794,0,7
10432,10001375,10016005,1,-1.564088,1.963291,7.89e-08,7593,0,1


In [None]:
val.head()

Unnamed: 0,learnerID,assessmentItemID,answerCode,difficultyLevel,discriminationLevel,guessLevel,knowledgeTag,gender,grade
3323,60000620,60139006,1,-1.46928,2.323736,0.0001114099,1450,0,6
6826,20000149,20036008,1,-1.427714,1.47213,3.31e-07,7696,0,2
8836,50000284,50010006,0,1.318017,1.415418,0.3,2619,1,5
10344,70000196,70004008,0,0.613792,21.174533,0.1103776,3793,0,7
10431,10001375,10016004,1,-1.460198,2.106698,1.36e-08,7593,0,1


In [None]:
train.head(3165)

Unnamed: 0,learnerID,assessmentItemID,answerCode,difficultyLevel,discriminationLevel,guessLevel,knowledgeTag,gender,grade
0,60000034,60014001,1,-1.479204,2.693081,3.400000e-09,7229,0,6
1,60000034,60014002,1,-1.347461,4.063459,3.620000e-08,7229,0,6
2,60000034,60014003,1,0.566275,1.834214,9.699000e-07,7229,0,6
3,60000034,60014004,1,-0.361558,1.263308,8.770000e-08,7229,0,6
4,60000034,60014005,0,0.955883,2.052489,1.700000e-08,7229,0,6
...,...,...,...,...,...,...,...,...,...
3321,60000620,60139005,1,-1.574519,2.127554,2.999150e-04,1450,0,6
3322,50000579,50014004,1,-0.712541,2.968104,3.824310e-05,2617,1,5
3325,50000579,50014005,1,-0.426344,5.740237,6.347800e-06,2617,1,5
3326,20000595,20064001,1,-1.554687,1.159451,1.235000e-06,7928,0,2


In [4]:

# 데이터를 시계열(sequence) 형태로 변환
def create_sequences(df, seq_length=19):
    features = df.drop(columns=['learnerID', 'answerCode']).columns
    sequences = []
    for user_id in df['learnerID'].unique():
        user_data = df[df['learnerID'] == user_id]
        if len(user_data) >= seq_length:
            for i in range(len(user_data) - seq_length + 1):
                sequence = user_data.iloc[i:i+seq_length]
                sequences.append(sequence[features].values)
    return np.array(sequences)

# 시계열 데이터로 변환
X_train = create_sequences(train)
X_val = create_sequences(val)
X_test = create_sequences(test)


In [17]:
y_train = train.groupby('learnerID')['answerCode'].apply(lambda x: list(x.shift(-1).dropna())).values
y_val = val.groupby('learnerID')['answerCode'].apply(lambda x: list(x.shift(-1).dropna())).values
y_test = test.groupby('learnerID')['answerCode'].apply(lambda x: list(x.shift(-1).dropna())).values

In [20]:
y_train = np.array(y_train)
y_val = np.array(y_val)
y_test = np.array(y_test)

In [23]:
y_train = np.concatenate(train.groupby('learnerID')['answerCode'].apply(lambda x: list(x.shift(-1).dropna())).values)
y_val = np.concatenate(val.groupby('learnerID')['answerCode'].apply(lambda x: list(x.shift(-1).dropna())).values)
y_test = np.concatenate(test.groupby('learnerID')['answerCode'].apply(lambda x: list(x.shift(-1).dropna())).values)
y_train

array([1., 1., 1., ..., 0., 0., 1.])

In [8]:
y_train = np.array(train.groupby('learnerID')['answerCode'].apply(lambda x: list(x.shift(-1).dropna())).values)
y_val = np.array(val.groupby('learnerID')['answerCode'].apply(lambda x: list(x.shift(-1).dropna())).values)
y_test = np.array(test.groupby('learnerID')['answerCode'].apply(lambda x: list(x.shift(-1).dropna())).values)

In [24]:


# # 레이블 생성
# seq_length = 19
# y_train = train.groupby('learnerID').apply(lambda x: x['answerCode'].values[seq_length-1:])
# y_val = val.groupby('learnerID').apply(lambda x: x['answerCode'].values[seq_length-1:])
# y_test = test.groupby('learnerID').apply(lambda x: x['answerCode'].values[seq_length-1:])

# 모델 구성
model = Sequential()
model.add(LSTM(64, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(1, activation='sigmoid'))

# 모델 컴파일
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# 모델 훈련
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

# 검증 세트에서의 정확도 측정
y_pred = (model.predict(X_val) > 0.5).astype("int32")
print('검증 정확도:', accuracy_score(y_val, y_pred))

ValueError: Data cardinality is ambiguous:
  x sizes: 2140886
  y sizes: 2420570
Make sure all arrays contain the same number of samples.

In [26]:
X_train

array([[[ 6.00140010e+07, -1.47920384e+00,  2.69308069e+00, ...,
          7.22900000e+03,  0.00000000e+00,  6.00000000e+00],
        [ 6.00140020e+07, -1.34746090e+00,  4.06345870e+00, ...,
          7.22900000e+03,  0.00000000e+00,  6.00000000e+00],
        [ 6.00140030e+07,  5.66274962e-01,  1.83421353e+00, ...,
          7.22900000e+03,  0.00000000e+00,  6.00000000e+00],
        ...,
        [ 6.00270020e+07, -1.22201736e+00,  1.31858868e+00, ...,
          5.89000000e+02,  0.00000000e+00,  6.00000000e+00],
        [ 6.00270030e+07,  5.55053590e-01,  7.59456477e-01, ...,
          5.89000000e+02,  0.00000000e+00,  6.00000000e+00],
        [ 6.00270040e+07, -3.67800997e-01,  1.39170550e+00, ...,
          5.89000000e+02,  0.00000000e+00,  6.00000000e+00]],

       [[ 6.00140020e+07, -1.34746090e+00,  4.06345870e+00, ...,
          7.22900000e+03,  0.00000000e+00,  6.00000000e+00],
        [ 6.00140030e+07,  5.66274962e-01,  1.83421353e+00, ...,
          7.22900000e+03,  0.00000000e

In [7]:
y_train

array([list([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout
from keras.preprocessing.sequence import pad_sequences

data = pd.read_csv('/content/drive/MyDrive/data.csv')

# 데이터 전처리
data['learnerID'] = data['learnerID'].str[-9:]
data['assessmentItemID'] = data['assessmentItemID'].str[-9:]
data['learnerID'] = data['learnerID'].astype(int)
data['assessmentItemID'] = data['assessmentItemID'].astype(int)
data = data.drop(columns=['theta'])
data.replace({'gender':{'F' : 1, 'M' : 0}}, inplace = True)
data = data[data.groupby('learnerID')['learnerID'].transform('count') >= 20]




# 시간 순서 정렬 후, train, validation, test 셋 분리
train, test = train_test_split(data, train_size=0.80, test_size=0.20, shuffle=False)
train, val = train_test_split(train, train_size=0.80, test_size=0.20, shuffle=False)

# 데이터를 시계열(sequence) 형태로 변환
def create_sequences(df, seq_length=10):
    features = df.drop(columns=['learnerID', 'answerCode']).columns
    sequences = []
    for user_id in df['learnerID'].unique():
        user_data = df[df['learnerID'] == user_id]
        if len(user_data) >= seq_length:
            for i in range(len(user_data) - seq_length + 1):
                sequence = user_data.iloc[i:i+seq_length]
                sequences.append(sequence[features].values)
    return np.array(sequences)

In [None]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)

X_train shape: (1466815, 10, 7)
y_train shape: (12848,)
X_val shape: (306756, 10, 7)
y_val shape: (10219,)
