In [1]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2024-12-12 03:46:21--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-12-12 03:46:22--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-12-12 03:46:22--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [2]:
# Download FastText embeddings
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz

# Extract the compressed file
!gunzip cc.en.300.vec.gz

print("FastText embeddings downloaded and extracted successfully!")

--2024-12-12 03:49:32--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 3.169.121.107, 3.169.121.57, 3.169.121.110, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|3.169.121.107|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1325960915 (1.2G) [binary/octet-stream]
Saving to: ‘cc.en.300.vec.gz’


2024-12-12 03:49:44 (107 MB/s) - ‘cc.en.300.vec.gz’ saved [1325960915/1325960915]

FastText embeddings downloaded and extracted successfully!


In [3]:
!kaggle datasets download leadbest/googlenewsvectorsnegative300
!unzip -q googlenewsvectorsnegative300.zip

Dataset URL: https://www.kaggle.com/datasets/leadbest/googlenewsvectorsnegative300
License(s): other
Downloading googlenewsvectorsnegative300.zip to /content
100% 3.17G/3.17G [02:19<00:00, 25.7MB/s]
100% 3.17G/3.17G [02:19<00:00, 24.3MB/s]


In [1]:
# install the libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

In [None]:
# Load the data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
misconception_mapping_df = pd.read_csv('misconception_mapping.csv')

In [None]:
# Add context to each question-answer pair for train data
for answer in ['A', 'B', 'C', 'D']:
    train_df[f'QA_{answer}'] = train_df['ConstructName'] + " " + train_df['SubjectName'] + " " + train_df['QuestionText'] + " " + train_df[f'Answer{answer}Text']

# Stack question-answer pairs into a single DataFrame with corresponding misconception IDs
qa_pairs = pd.DataFrame({
    'QA_Text': pd.concat([train_df[f'QA_{answer}'] for answer in ['A', 'B', 'C', 'D']], axis=0),
    'MisconceptionId': pd.concat([train_df[f'Misconception{answer}Id'] for answer in ['A', 'B', 'C', 'D']], axis=0),
}).dropna()
qa_pairs['MisconceptionId'] = qa_pairs['MisconceptionId'].astype(int)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1, 2))
tfidf_features = tfidf_vectorizer.fit_transform(qa_pairs['QA_Text'])

In [None]:
# Load pre-trained embeddings
# Convert GloVe to Word2Vec format
glove_input_file = "glove.6B.100d.txt"
word2vec_output_file = "glove.6B.100d.word2vec"
glove2word2vec(glove_input_file, word2vec_output_file)
glove_model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)
fasttext_model = KeyedVectors.load_word2vec_format("cc.en.300.vec", binary=False)
word2vec_model = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)

In [None]:
# Function to compute sentence embeddings by averaging word vectors
def sentence_to_embedding(sentence, embedding_model, embedding_dim=100):
    words = sentence.split()
    vectors = [embedding_model[word] for word in words if word in embedding_model]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(embedding_dim)

# Generate embeddings
embeddings = {
    "GloVe": np.array([sentence_to_embedding(text, glove_model, 100) for text in qa_pairs['QA_Text']]),
    "FastText": np.array([sentence_to_embedding(text, fasttext_model, 300) for text in qa_pairs['QA_Text']]),
    "Word2Vec": np.array([sentence_to_embedding(text, word2vec_model, 300) for text in qa_pairs['QA_Text']]),
}


# Function to calculate MAP@K
def map_at_k(y_true, y_pred, k=25):
    average_precisions = []
    for true, pred in zip(y_true, y_pred):
        relevance = np.isin(pred[:k], [true])
        precision_at_k = [np.mean(relevance[:i+1]) for i in range(len(relevance)) if relevance[i]]
        if precision_at_k:
            average_precisions.append(np.mean(precision_at_k))
        else:
            average_precisions.append(0)
    return np.mean(average_precisions)

In [None]:
# Evaluate MAP@25 for each embedding
embedding_scores = {}
for name, embedding in embeddings.items():
    single_embedding_features = np.hstack([tfidf_features.toarray(), embedding])
    single_lsa_features = TruncatedSVD(n_components=100, random_state=42).fit_transform(single_embedding_features)
    X_train_single, X_val_single, y_train_single, y_val_single = train_test_split(single_lsa_features, qa_pairs['MisconceptionId'], test_size=0.2, random_state=42)
    cosine_sim_matrix_single = cosine_similarity(X_val_single, X_train_single)
    top_25_preds_single = np.argsort(cosine_sim_matrix_single, axis=1)[:, -25:][:, ::-1]
    embedding_scores[name] = map_at_k(
        y_val_single.values,
        [[y_train_single.iloc[i] for i in indices] for indices in top_25_preds_single]
    )

for name, score in embedding_scores.items():
    print(f"MAP@25 Score with {name} Embedding: {score}")

# Combine embeddings
combined_embeddings = np.hstack([embeddings["GloVe"], embeddings["FastText"], embeddings["Word2Vec"]])

# Combine with TF-IDF features
combined_features = np.hstack([tfidf_features.toarray(), combined_embeddings])

# Dimensionality reduction with SVD
svd = TruncatedSVD(n_components=100, random_state=42)
lsa_features = svd.fit_transform(combined_features)

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(lsa_features, qa_pairs['MisconceptionId'], test_size=0.2, random_state=42)

# Compute cosine similarity for validation set
cosine_sim_matrix = cosine_similarity(X_val, X_train)
top_25_preds = np.argsort(cosine_sim_matrix, axis=1)[:, -25:][:, ::-1]

In [6]:
# Evaluate MAP@25
map25_score = map_at_k(
    y_val.values,
    [[y_train.iloc[i] for i in indices] for indices in top_25_preds]
)
print(f"MAP@25 Score with Combined Embeddings: {map25_score}")

# Preprocess test data
for answer in ['A', 'B', 'C', 'D']:
    test_df[f'QA_{answer}'] = test_df['ConstructName'] + " " + test_df['SubjectName'] + " " + test_df['QuestionText'] + " " + test_df[f'Answer{answer}Text']

# Stack test question-answer pairs, excluding the correct answer
test_qa_pairs = []
for _, row in test_df.iterrows():
    correct_answer = row['CorrectAnswer']
    for answer in ['A', 'B', 'C', 'D']:
        if answer != correct_answer:
            test_qa_pairs.append({
                'QuestionId_Answer': f"{row['QuestionId']}_{answer}",
                'QA_Text': row[f'QA_{answer}']
            })

test_qa_pairs = pd.DataFrame(test_qa_pairs)

# Generate embeddings for test data
test_embeddings = {
    "GloVe": np.array([sentence_to_embedding(text, glove_model, 100) for text in test_qa_pairs['QA_Text']]),
    "FastText": np.array([sentence_to_embedding(text, fasttext_model, 300) for text in test_qa_pairs['QA_Text']]),
    "Word2Vec": np.array([sentence_to_embedding(text, word2vec_model, 300) for text in test_qa_pairs['QA_Text']]),
}
combined_test_embeddings = np.hstack([test_embeddings["GloVe"], test_embeddings["FastText"], test_embeddings["Word2Vec"]])

# Combine with TF-IDF features for test data
test_tfidf = tfidf_vectorizer.transform(test_qa_pairs['QA_Text'])
test_combined_features = np.hstack([test_tfidf.toarray(), combined_test_embeddings])

# Dimensionality reduction for test data
test_lsa_features = svd.transform(test_combined_features)

# Compute cosine similarity between test and training data
test_cosine_sim_matrix = cosine_similarity(test_lsa_features, X_train)
top_k_test_preds = np.argsort(test_cosine_sim_matrix, axis=1)[:, -25:][:, ::-1]

# Format predictions for submission
submission = pd.DataFrame({
    'QuestionId_Answer': test_qa_pairs['QuestionId_Answer'],
    'MisconceptionId': [' '.join(map(str, preds)) for preds in top_k_test_preds]
})

# Save the submission file
submission.to_csv("submission_combined_embeddings.csv", index=False)
print("Submission file created successfully as submission_combined_embeddings.csv!")

  glove2word2vec(glove_input_file, word2vec_output_file)


MAP@25 Score with GloVe Embedding: 0.3328175587464839
MAP@25 Score with FastText Embedding: 0.32924964639166243
MAP@25 Score with Word2Vec Embedding: 0.32456491861729064
MAP@25 Score with Combined Embeddings: 0.335783479062598
Submission file created successfully as submission_combined_embeddings.csv!


In [7]:
data  = pd.read_csv('submission_combined_embeddings.csv')
with pd.option_context('display.max_rows', None):
    print(data)

  QuestionId_Answer                                    MisconceptionId
0            1869_B  205 3419 1462 3044 2532 413 3246 1375 2944 235...
1            1869_C  205 3419 1462 3044 2532 413 3246 1375 2944 235...
2            1869_D  205 1462 1375 3044 413 2532 3419 2944 3246 235...
3            1870_A  533 422 585 1029 368 1668 1250 385 34 1567 277...
4            1870_B  533 422 585 1029 368 1668 1250 385 34 1567 277...
5            1870_C  1668 422 533 585 1029 368 1250 385 34 1567 226...
6            1871_A  1623 2185 3307 2313 3468 3326 1286 2896 616 14...
7            1871_C  1623 2185 3307 2313 3468 3326 1286 142 616 289...
8            1871_D  2185 1623 3307 2313 3468 3326 1286 2896 750 10...
