In [3]:
# Install the Libraries

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [4]:
# Load the data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
misconception_mapping_df = pd.read_csv('misconception_mapping.csv')

In [5]:
# Add context to each question-answer pair for train data
train_df['QA_A'] = train_df['ConstructName'] + " " + train_df['SubjectName'] + " " + train_df['QuestionText'] + " " + train_df['AnswerAText']
train_df['QA_B'] = train_df['ConstructName'] + " " + train_df['SubjectName'] + " " + train_df['QuestionText'] + " " + train_df['AnswerBText']
train_df['QA_C'] = train_df['ConstructName'] + " " + train_df['SubjectName'] + " " + train_df['QuestionText'] + " " + train_df['AnswerCText']
train_df['QA_D'] = train_df['ConstructName'] + " " + train_df['SubjectName'] + " " + train_df['QuestionText'] + " " + train_df['AnswerDText']

# Stack question-answer pairs into a single DataFrame with corresponding misconception IDs
qa_pairs = pd.DataFrame({
    'QA_Text': pd.concat([train_df['QA_A'], train_df['QA_B'], train_df['QA_C'], train_df['QA_D']], axis=0),
    'MisconceptionId': pd.concat([train_df['MisconceptionAId'], train_df['MisconceptionBId'], train_df['MisconceptionCId'], train_df['MisconceptionDId']], axis=0),
}).dropna()

qa_pairs['MisconceptionId'] = qa_pairs['MisconceptionId'].astype(int)


In [6]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X_text = tfidf_vectorizer.fit_transform(qa_pairs['QA_Text'])

In [7]:
# Function to calculate MAP@K (K=25 in this case)
def map_at_k(y_true, y_pred, k=25):
    """Compute Mean Average Precision at K for each sample in y_true and y_pred."""
    average_precisions = []
    for true, pred in zip(y_true, y_pred):
        relevance = np.isin(pred[:k], [true])
        precision_at_k = [np.mean(relevance[:i+1]) for i in range(len(relevance)) if relevance[i]]
        if precision_at_k:
            average_precisions.append(np.mean(precision_at_k))
        else:
            average_precisions.append(0)
    return np.mean(average_precisions)

In [8]:
# Function to perform LSA with a specified number of components and calculate MAP@25
def evaluate_lsa(n_components):
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    X_reduced = svd.fit_transform(X_text)

    # Train-validation split for parameter tuning
    X_train, X_val, y_train, y_val = train_test_split(X_reduced, qa_pairs['MisconceptionId'], test_size=0.2, random_state=42)

    # Cosine similarity for validation
    cosine_sim_matrix = cosine_similarity(X_val, X_train)
    top_25_preds = np.argsort(cosine_sim_matrix, axis=1)[:, -25:][:, ::-1]

    # Gather the top 25 predictions for each validation sample
    y_pred_top_25 = [[y_train.iloc[i] for i in indices] for indices in top_25_preds]

    # Calculate MAP@25 score for this number of components
    map25_score = map_at_k(y_val.values, y_pred_top_25)
    return map25_score

In [9]:
# Tuning n_components
components_range = [50, 100, 200, 300]
best_score = 0
best_components = 0

for n in components_range:
    map25_score = evaluate_lsa(n)
    print(f"MAP@25 Score with {n} components: {map25_score}")
    if map25_score > best_score:
        best_score = map25_score
        best_components = n

print(f"Best n_components: {best_components} with MAP@25 Score: {best_score}")

MAP@25 Score with 50 components: 0.32204467450380125
MAP@25 Score with 100 components: 0.3176076813186543
MAP@25 Score with 200 components: 0.3163163148694109
MAP@25 Score with 300 components: 0.3193932327855233
Best n_components: 50 with MAP@25 Score: 0.32204467450380125


In [10]:
# Apply best n_components on the entire dataset for final predictions
svd = TruncatedSVD(n_components=best_components, random_state=42)
X_reduced = svd.fit_transform(tfidf_vectorizer.fit_transform(qa_pairs['QA_Text']))

# Recompute the misconception LSA embeddings
misconception_tfidf = tfidf_vectorizer.transform(misconception_mapping_df['MisconceptionName'])
misconception_lsa = svd.transform(misconception_tfidf)  # Recompute misconception_lsa

In [11]:
# Prepare the test data with context
test_df['QA_A'] = test_df['ConstructName'] + " " + test_df['SubjectName'] + " " + test_df['QuestionText'] + " " + test_df['AnswerAText']
test_df['QA_B'] = test_df['ConstructName'] + " " + test_df['SubjectName'] + " " + test_df['QuestionText'] + " " + test_df['AnswerBText']
test_df['QA_C'] = test_df['ConstructName'] + " " + test_df['SubjectName'] + " " + test_df['QuestionText'] + " " + test_df['AnswerCText']
test_df['QA_D'] = test_df['ConstructName'] + " " + test_df['SubjectName'] + " " + test_df['QuestionText'] + " " + test_df['AnswerDText']

# Stack test question-answer pairs, excluding the correct answer
qa_pairs = []
for _, row in test_df.iterrows():
    correct_answer = row['CorrectAnswer']
    for answer in ['A', 'B', 'C', 'D']:
        if answer != correct_answer:
            qa_pairs.append({
                'QuestionId_Answer': f"{row['QuestionId']}_{answer}",
                'QA_Text': row[f'QA_{answer}']
            })

test_qa_pairs = pd.DataFrame(qa_pairs)

In [12]:
# Transform the test data
test_tfidf = tfidf_vectorizer.transform(test_qa_pairs['QA_Text'])
test_lsa = svd.transform(test_tfidf)

# Cosine similarity between test QA pairs and misconceptions
similarity_matrix_test = cosine_similarity(test_lsa, misconception_lsa)
top_k = 25
top_k_indices_test = np.argsort(similarity_matrix_test, axis=1)[:, -top_k:][:, ::-1]

# Format the predictions for submission
submission = pd.DataFrame({
    'QuestionId_Answer': test_qa_pairs['QuestionId_Answer'],
    'MisconceptionId': [' '.join(map(str, preds)) for preds in top_k_indices_test]
})

# Save the submission file
submission.to_csv("submission.csv", index=False)
print("Submission file created successfully!")

Submission file created successfully!


In [13]:
data  = pd.read_csv('submission.csv')
with pd.option_context('display.max_rows', None):
    print(data)

  QuestionId_Answer                                    MisconceptionId
0            1869_B  842 2532 987 657 2518 1999 1338 1929 1672 1941...
1            1869_C  842 2532 987 657 2518 1999 1338 1929 1672 1941...
2            1869_D  842 657 1005 2532 987 2488 1999 2518 1338 1929...
3            1870_A  979 1540 885 363 29 1825 623 1928 1305 112 80 ...
4            1870_B  979 1540 885 363 29 1825 623 1928 1305 112 80 ...
5            1870_C  979 1540 885 363 29 1825 623 1928 1305 112 80 ...
6            1871_A  632 549 1200 2211 1059 2551 2471 2439 2243 192...
7            1871_C  632 1200 549 2211 1059 2551 2439 2471 2243 192...
8            1871_D  632 549 1200 2211 1059 2551 2471 2439 2243 192...
