In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
data = pd.read_csv('/content/drive/MyDrive/data.csv')

In [None]:
data.head()

Unnamed: 0,learnerID,assessmentItemID,answerCode,theta,difficultyLevel,discriminationLevel,guessLevel,knowledgeTag,gender,grade
0,A060000034,A060014001,1,0.574639,-1.479204,2.693081,3.4e-09,7229,M,6
1,A060000034,A060014002,1,0.574639,-1.347461,4.063459,3.62e-08,7229,M,6
2,A060000034,A060014003,1,0.574639,0.566275,1.834214,9.699e-07,7229,M,6
3,A060000034,A060014004,1,0.574639,-0.361558,1.263308,8.77e-08,7229,M,6
4,A060000034,A060014005,0,0.574639,0.955883,2.052489,1.7e-08,7229,M,6


### **DATA**
[시간 순 정렬된 데이터]
- learnerID : 학생 ID
- assessmentItemID : 문제 ID
- answerCode : 채점결과 (0:틀림, 1:맞음)
- theta : testID 에 대한 응시자의 능력 수준 (-5~5)
- difficultyLevel : 문제 난이도 (-5~5)
- discriminationLevel : 변별도 (0~∞)
- guessLevel : 추측도 (0~1)
- knowledgeTag : 지식체계번호
- gender : 성별 (M/F)
- grade : 학년 (1~9)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout
from keras.preprocessing.sequence import pad_sequences

data = pd.read_csv('/content/drive/MyDrive/data.csv')

# 데이터 전처리
data['learnerID'] = data['learnerID'].str[-9:]
data['assessmentItemID'] = data['assessmentItemID'].str[-9:]
data['learnerID'] = data['learnerID'].astype(int)
data['assessmentItemID'] = data['assessmentItemID'].astype(int)
data = data.drop(columns=['theta'])
data.replace({'gender':{'F' : 1, 'M' : 0}}, inplace = True)
data = data[data.groupby('learnerID')['learnerID'].transform('count') >= 20]

data = data.applymap(lambda x: x.strip() if isinstance(x, str) else x)        # 엔터 제거


# 시간 순서 정렬 후, train, validation, test 셋 분리
# 각 사용자의 마지막 행을 기준으로 테스트 세트 구성
test = data.groupby('learnerID').tail(2)
test1 = data.groupby('learnerID').tail(1)

# 테스트 세트를 제외한 데이터로부터 검증 세트 구성
data2 = data.drop(test1.index)
val = data2.groupby('learnerID').tail(2)
val1 = data2.groupby('learnerID').tail(1)
# 검증 세트와 테스트 세트를 제외한 데이터가 훈련 세트가 됨
train = data2.drop(val1.index)


In [None]:
# 시계열(sequence) 형태로 변환하는 함수
def create_sequences(df, seq_length=3):
    features = df.drop(columns=['learnerID', 'answerCode']).columns
    sequences = []
    for user_id in df['learnerID'].unique():
        user_data = df[df['learnerID'] == user_id]
        if len(user_data) >= seq_length:
            for i in range(len(user_data) - seq_length + 1):
                sequence = user_data.iloc[i:i+seq_length]
                sequences.append(sequence[features].values)
    return np.array(sequences)

# 다음 정오답을 만드는 함수
def create_next_answer(df):
    next_answers = []
    for user_id in df['learnerID'].unique():
        user_data = df[df['learnerID'] == user_id]
        next_answers.extend(user_data['answerCode'].shift(-1).dropna().values)
    return np.array(next_answers)

In [None]:
import numpy as np

def create_sequences(df, seq_length=2):
    features = df.drop(columns=['learnerID', 'answerCode']).columns
    sequences = []
    targets = []

    for user_id in df['learnerID'].unique():
        user_data = df[df['learnerID'] == user_id]
        num_sequences = len(user_data) - seq_length + 1

        if num_sequences > 0:
            for i in range(num_sequences):
                sequence = user_data.iloc[i:i+seq_length][features].values.tolist()
                target = user_data.iloc[i+seq_length-1]['answerCode']  # 마지막 항목의 answerCode를 타겟으로 설정
                sequences.append(sequence)
                targets.append(target)

    return np.array(sequences), np.array(targets)

In [None]:
def create_X(df):
    X_train = []
    for user_id in df['learnerID'].unique():
        user_data = df[df['learnerID'] == user_id]
        sequence = user_data[['assessmentItemID', 'difficultyLevel', 'discriminationLevel',
                              'guessLevel', 'knowledgeTag', 'gender', 'grade']].values.tolist()
        X_train.append(sequence)
    return X_train

def create_y(df):
    y_train = []
    for user_id in df['learnerID'].unique():
        user_data = df[df['learnerID'] == user_id]
        target = user_data['answerCode'].tolist()[-1]  # 각 시퀀스의 마지막 항목의 'answerCode'를 다음 값으로 설정
        y_train.append(target)
    return y_train



In [None]:
X_train = create_X(train)
y_train = create_y(train)

X_val = create_X(val)
y_val = create_y(val)

X_test = create_X(test)
y_test = create_y(test)

In [None]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

AttributeError: 'list' object has no attribute 'shape'

In [None]:
import numpy as np

# 패딩된 시퀀스를 NumPy 배열로 변환
X_train_padded = np.array([np.array(seq) for seq in X_train])



ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (16452,) + inhomogeneous part.

In [None]:
import numpy as np

X_train = np.array(X_train)
y_train = np.array(y_train)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (16452,) + inhomogeneous part.

In [None]:
for i, sequence in enumerate(X_train):
    print(f"Sequence {i+1}의 길이: {len(sequence)}")

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
Sequence 11453의 길이: 22
Sequence 11454의 길이: 35
Sequence 11455의 길이: 39
Sequence 11456의 길이: 23
Sequence 11457의 길이: 29
Sequence 11458의 길이: 29
Sequence 11459의 길이: 29
Sequence 11460의 길이: 58
Sequence 11461의 길이: 78
Sequence 11462의 길이: 28
Sequence 11463의 길이: 31
Sequence 11464의 길이: 23
Sequence 11465의 길이: 63
Sequence 11466의 길이: 139
Sequence 11467의 길이: 179
Sequence 11468의 길이: 37
Sequence 11469의 길이: 20
Sequence 11470의 길이: 211
Sequence 11471의 길이: 98
Sequence 11472의 길이: 127
Sequence 11473의 길이: 108
Sequence 11474의 길이: 83
Sequence 11475의 길이: 77
Sequence 11476의 길이: 21
Sequence 11477의 길이: 92
Sequence 11478의 길이: 81
Sequence 11479의 길이: 59
Sequence 11480의 길이: 19
Sequence 11481의 길이: 49
Sequence 11482의 길이: 87
Sequence 11483의 길이: 40
Sequence 11484의 길이: 45
Sequence 11485의 길이: 88
Sequence 11486의 길이: 112
Sequence 11487의 길이: 54
Sequence 11488의 길이: 38
Sequence 11489의 길이: 18
Sequence 11490의 길이: 54
Sequence 11491의 길이: 75
Sequence 11492의 길이: 23
Sequence 11493의 길이: 22
S

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense

# RNN 모델 구성
model = Sequential([
    SimpleRNN(64, input_shape=(X_train.shape[1], X_train.shape[2])),
    Dense(1, activation='sigmoid')  # 출력층
])

# 모델 컴파일
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 모델 훈련
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))
이

AttributeError: 'list' object has no attribute 'shape'

In [None]:

# 데이터를 시계열(sequence) 형태로 변환
def create_sequences(df, seq_length=19):
    features = df.drop(columns=['learnerID', 'answerCode']).columns
    sequences = []
    for user_id in df['learnerID'].unique():
        user_data = df[df['learnerID'] == user_id]
        if len(user_data) >= seq_length:
            for i in range(len(user_data) - seq_length + 1):
                sequence = user_data.iloc[i:i+seq_length]
                sequences.append(sequence[features].values)
    return np.array(sequences)

# 시계열 데이터로 변환
X_train = create_sequences(train)
X_val = create_sequences(val)
X_test = create_sequences(test)


In [None]:
y_train = train.groupby('learnerID')['answerCode'].apply(lambda x: list(x.shift(-1).dropna())).values
y_val = val.groupby('learnerID')['answerCode'].apply(lambda x: list(x.shift(-1).dropna())).values
y_test = test.groupby('learnerID')['answerCode'].apply(lambda x: list(x.shift(-1).dropna())).values

In [None]:
y_train = np.concatenate(train.groupby('learnerID')['answerCode'].apply(lambda x: list(x.shift(-1).dropna())).values)
y_val = np.concatenate(val.groupby('learnerID')['answerCode'].apply(lambda x: list(x.shift(-1).dropna())).values)
y_test = np.concatenate(test.groupby('learnerID')['answerCode'].apply(lambda x: list(x.shift(-1).dropna())).values)
y_train

array([1., 1., 1., ..., 0., 0., 1.])

In [None]:
y_train = np.array(train.groupby('learnerID')['answerCode'].apply(lambda x: list(x.shift(-1).dropna())).values)
y_val = np.array(val.groupby('learnerID')['answerCode'].apply(lambda x: list(x.shift(-1).dropna())).values)
y_test = np.array(test.groupby('learnerID')['answerCode'].apply(lambda x: list(x.shift(-1).dropna())).values)

In [None]:
y_train = np.array(y_train)
y_val = np.array(y_val)
y_test = np.array(y_test)

In [None]:


# # 레이블 생성
# seq_length = 19
# y_train = train.groupby('learnerID').apply(lambda x: x['answerCode'].values[seq_length-1:])
# y_val = val.groupby('learnerID').apply(lambda x: x['answerCode'].values[seq_length-1:])
# y_test = test.groupby('learnerID').apply(lambda x: x['answerCode'].values[seq_length-1:])

# 모델 구성
model = Sequential()
model.add(LSTM(64, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(1, activation='sigmoid'))

# 모델 컴파일
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# 모델 훈련
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

# 검증 세트에서의 정확도 측정
y_pred = (model.predict(X_val) > 0.5).astype("int32")
print('검증 정확도:', accuracy_score(y_val, y_pred))

ValueError: Data cardinality is ambiguous:
  x sizes: 2140886
  y sizes: 2420570
Make sure all arrays contain the same number of samples.

In [None]:
X_train

array([[[ 6.00140010e+07, -1.47920384e+00,  2.69308069e+00, ...,
          7.22900000e+03,  0.00000000e+00,  6.00000000e+00],
        [ 6.00140020e+07, -1.34746090e+00,  4.06345870e+00, ...,
          7.22900000e+03,  0.00000000e+00,  6.00000000e+00],
        [ 6.00140030e+07,  5.66274962e-01,  1.83421353e+00, ...,
          7.22900000e+03,  0.00000000e+00,  6.00000000e+00],
        ...,
        [ 6.00270020e+07, -1.22201736e+00,  1.31858868e+00, ...,
          5.89000000e+02,  0.00000000e+00,  6.00000000e+00],
        [ 6.00270030e+07,  5.55053590e-01,  7.59456477e-01, ...,
          5.89000000e+02,  0.00000000e+00,  6.00000000e+00],
        [ 6.00270040e+07, -3.67800997e-01,  1.39170550e+00, ...,
          5.89000000e+02,  0.00000000e+00,  6.00000000e+00]],

       [[ 6.00140020e+07, -1.34746090e+00,  4.06345870e+00, ...,
          7.22900000e+03,  0.00000000e+00,  6.00000000e+00],
        [ 6.00140030e+07,  5.66274962e-01,  1.83421353e+00, ...,
          7.22900000e+03,  0.00000000e

In [None]:
y_train

array([list([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout
from keras.preprocessing.sequence import pad_sequences

data = pd.read_csv('/content/drive/MyDrive/data.csv')

# 데이터 전처리
data['learnerID'] = data['learnerID'].str[-9:]
data['assessmentItemID'] = data['assessmentItemID'].str[-9:]
data['learnerID'] = data['learnerID'].astype(int)
data['assessmentItemID'] = data['assessmentItemID'].astype(int)
data = data.drop(columns=['theta'])
data.replace({'gender':{'F' : 1, 'M' : 0}}, inplace = True)
data = data[data.groupby('learnerID')['learnerID'].transform('count') >= 20]




# 시간 순서 정렬 후, train, validation, test 셋 분리
train, test = train_test_split(data, train_size=0.80, test_size=0.20, shuffle=False)
train, val = train_test_split(train, train_size=0.80, test_size=0.20, shuffle=False)

# 데이터를 시계열(sequence) 형태로 변환
def create_sequences(df, seq_length=10):
    features = df.drop(columns=['learnerID', 'answerCode']).columns
    sequences = []
    for user_id in df['learnerID'].unique():
        user_data = df[df['learnerID'] == user_id]
        if len(user_data) >= seq_length:
            for i in range(len(user_data) - seq_length + 1):
                sequence = user_data.iloc[i:i+seq_length]
                sequences.append(sequence[features].values)
    return np.array(sequences)

In [None]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)

X_train shape: (1466815, 10, 7)
y_train shape: (12848,)
X_val shape: (306756, 10, 7)
y_val shape: (10219,)
