# Current Score - 67.03%

In [2]:
import nltk
from nltk.corpus import stopwords
set(stopwords.words('english'))
import pandas as pd
import numpy as np
import spacy
nlp = spacy.load("en_core_web_lg")

In [3]:
df = pd.read_csv(
    "sample_questions_labeled.csv",
)

In [4]:
df.head()

Unnamed: 0,student_answer,teacher_answer,label,question_id
0,By letting it sit in a dish for a day.,"The water was evaporated, leaving the salt.",0,25
1,Let the water evaporate and the salt is left b...,"The water was evaporated, leaving the salt.",1,25
2,The water evaporated and left salt crystals.,"The water was evaporated, leaving the salt.",1,25
3,I saw a pinkish grayish color that was blockin...,"The water was evaporated, leaving the salt.",0,25
4,You have to slowly tip the vial for only the w...,"The water was evaporated, leaving the salt.",0,25


# Feature Engineering
## Jaccard Similarity

In [5]:
# Word Count
df['word_count'] = df.student_answer.apply(lambda x: len(x.split(" ")))
df['string_length'] = df.student_answer.apply(lambda x: len(x))

In [6]:
def jaccard_similarity(student_answer, teacher_answer):
    a = set(student_answer.split(" "))
    b = set(teacher_answer.split(" "))
    c = a.intersection(b)
    return (len(c) / (len(a) + len(b) - len(c)))
df['jaccard_simularity'] = df.apply(lambda row: jaccard_similarity(row.student_answer, row.teacher_answer), axis = 1)

In [7]:
from spacy.lang.en.stop_words import STOP_WORDS
def remove_stopwords(doc):
    my_doc = nlp(doc)
    token_list = []
    for token in my_doc:
        token_list.append(token.text)
    
    filtered_sentence =[] 
    for word in token_list:
        lexeme = nlp.vocab[word]
        if lexeme.is_stop == False:
            filtered_sentence.append(word) 
    return " ".join(filtered_sentence)

In [8]:
def jaccard_similarity(student_answer, teacher_answer):
    student_answer = remove_stopwords(student_answer)
    teacher_answer = remove_stopwords(teacher_answer)
    a = set(student_answer.split(" "))
    b = set(teacher_answer.split(" "))
    c = a.intersection(b)
    return (len(c) / (len(a) + len(b) - len(c)))

df['jaccard_simularity_stop'] = df.apply(lambda row: jaccard_similarity(row.student_answer, row.teacher_answer), axis = 1)

In [9]:
df.head()

Unnamed: 0,student_answer,teacher_answer,label,question_id,word_count,string_length,jaccard_simularity,jaccard_simularity_stop
0,By letting it sit in a dish for a day.,"The water was evaporated, leaving the salt.",0,25,10,38,0.0,0.1
1,Let the water evaporate and the salt is left b...,"The water was evaporated, leaving the salt.",1,25,10,52,0.142857,0.333333
2,The water evaporated and left salt crystals.,"The water was evaporated, leaving the salt.",1,25,7,44,0.166667,0.5
3,I saw a pinkish grayish color that was blockin...,"The water was evaporated, leaving the salt.",0,25,11,58,0.125,0.181818
4,You have to slowly tip the vial for only the w...,"The water was evaporated, leaving the salt.",0,25,13,57,0.125,0.222222


# Spacy Similarity

In [10]:
doc1 = nlp('class cryptography')
doc2 = nlp('This is the answer to jaccard disimilarity')

In [11]:
def spacy_similarity(doc1, doc2):
    student_answer = nlp(doc1)
    teacher_answer = nlp(doc2)
    return student_answer.similarity(teacher_answer)

In [12]:
df['spacy_similarity'] = df.apply(lambda row: spacy_similarity(row.student_answer, row.teacher_answer), axis = 1)

In [13]:
def spacy_similarity(doc1, doc2):
    student_answer = remove_stopwords(doc1)
    teacher_answer = remove_stopwords(doc2)
    student_answer = nlp(student_answer)
    teacher_answer = nlp(teacher_answer)
    return student_answer.similarity(teacher_answer)

In [14]:
df['spacy_similarity_stop'] = df.apply(lambda row: spacy_similarity(row.student_answer, row.teacher_answer), axis = 1)

In [15]:
df.head(10)

Unnamed: 0,student_answer,teacher_answer,label,question_id,word_count,string_length,jaccard_simularity,jaccard_simularity_stop,spacy_similarity,spacy_similarity_stop
0,By letting it sit in a dish for a day.,"The water was evaporated, leaving the salt.",0,25,10,38,0.0,0.1,0.786277,0.677196
1,Let the water evaporate and the salt is left b...,"The water was evaporated, leaving the salt.",1,25,10,52,0.142857,0.333333,0.93386,0.91917
2,The water evaporated and left salt crystals.,"The water was evaporated, leaving the salt.",1,25,7,44,0.166667,0.5,0.928088,0.926139
3,I saw a pinkish grayish color that was blockin...,"The water was evaporated, leaving the salt.",0,25,11,58,0.125,0.181818,0.797666,0.63653
4,You have to slowly tip the vial for only the w...,"The water was evaporated, leaving the salt.",0,25,13,57,0.125,0.222222,0.831477,0.763872
5,"By pouring the water and salt into the thing, ...","The water was evaporated, leaving the salt.",1,25,14,78,0.133333,0.4,0.942,0.928416
6,By slowly pouring it in a tray.,"The water was evaporated, leaving the salt.",0,25,7,31,0.0,0.111111,0.818396,0.700706
7,The water evaporated so there was only salt left.,"The water was evaporated, leaving the salt.",1,25,9,49,0.230769,0.571429,0.957324,0.958581
8,We put the water with the salt in it and put i...,"The water was evaporated, leaving the salt.",1,25,18,83,0.111111,0.375,0.908447,0.913873
9,I separated the salt from the water because th...,"The water was evaporated, leaving the salt.",0,25,26,121,0.1,0.375,0.894988,0.902363


## Cosine Similarity

# TFIDF

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import adjusted_rand_score

In [17]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df.student_answer)

In [18]:
df1 = pd.DataFrame(X.toarray(), columns = vectorizer.get_feature_names())
df = pd.concat([df, df1], axis = 1)

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4925 entries, 0 to 4924
Columns: 2316 entries, student_answer to pred
dtypes: float64(2309), int32(1), int64(4), object(2)
memory usage: 87.0+ MB


# Clustering Code

In [20]:
from sklearn.cluster import KMeans

In [21]:
true_k = 2
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(df.iloc[:, 6:7].values)
y_pred = model.predict(df.iloc[:, 6:7].values)

In [22]:
df['pred'] = y_pred

In [23]:
df["correct"] = df.apply(lambda row: 1 if row.pred == row.label else 0, axis = 1) # TFIDF creates columns with words similar to label, which makes this fail

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
print(f"{round(df.correct.sum() / len(df) * 100,2)}%")

In [None]:
df.head()

In [None]:
df.info()