In [1]:
# Data
import pandas as pd
import numpy as np

# Math imports
import math

# spaCy imports
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
# Module-level spaCy pipeline ("en_core_web_sm"), loaded once when this cell
# runs; Create_Features.remove_stopwords reads it as a global.
nlp = spacy.load("en_core_web_sm")

# NLTK imports
from nltk.stem import PorterStemmer
# Module-level Porter stemmer; Create_Features.stem_sentence reads it as a global.
porter = PorterStemmer()

# String imports
import string
from string import punctuation
import re

# Transformers and feature selection
from sklearn.preprocessing import MinMaxScaler


In [2]:
class Create_Features:
    
    def __init__(self, loadpath, savename, savepath):
        self.loadpath = loadpath
        self.savename = savename
        self.savepath = savepath
        self.raw_data = pd.read_csv(
            loadpath
        )
    
    def clean_sentence(self, sentence):
        sentence = sentence.replace(".","").lower().strip()
        sentence = sentence.translate(sentence.maketrans("","", string.punctuation))
        sentence = sentence.replace("  "," ")
        return sentence

    def remove_stopwords(self, doc):
        doc = clean_sentence(doc)
        my_doc = nlp(doc)
        token_list = []
        for token in my_doc:
            token_list.append(token.text)

        filtered_sentence =[] 
        for word in token_list:
            lexeme = nlp.vocab[word]
            if lexeme.is_stop == False:
                filtered_sentence.append(word) 
        return " ".join(filtered_sentence)


    def stem_sentence(self, sentence):
        # Strip punctuaton
        sentence = clean_sentence(sentence)
        #Remove Stop Words
        sentence = remove_stopwords(sentence)
        #Stem or Lem
        sentence = clean_sentence(sentence)
        return " ".join([porter.stem(word) for word in sentence.split(" ")])

    def order_sentence(self, sentence):
        sentence = clean_sentence(sentence)
        sentence = sorted(sentence.split(" "))
        return " ".join(sentence)

    def word_count(self, sentence):
        sentence = sentence.replace(".", "")
        return len(sentence.split(" "))

#     ################### similarity measures ####################
#     def generic_similarity(sentence, answer):
#         tokens = set(sentence.split(" ") + answer.split(" "))
#         if '' in tokens:
#             tokens.remove('')
#         sentence_tokens = {token : "" for token in tokens}
#         count = 0
#         for word in tokens:
#             if word in sentence.split(" "):
#                 count += 1
#         return count / len(tokens)  

#     def jaccard_similarity(student_answer, teacher_answer):
#         a = set(student_answer.split(" "))
#         b = set(teacher_answer.split(" "))
#         c = a.intersection(b)
#         return (len(c) / (len(a) + len(b) - len(c)))


#     def cosine_similarity(sentence, answer):
#         tokens = set(sentence.split(" ") + answer.split(" "))
#         if '' in tokens:
#             tokens.remove('')
#         answer_tokens = {token : "" for token in tokens}
#         sentence_tokens = {token : "" for token in tokens}

#         for word in tokens:
#             if word in answer.split(" "):
#                 answer_tokens[word] = 1
#             else:
#                 answer_tokens[word] = 0

#             if word in sentence.split(" "):
#                 sentence_tokens[word] = 1
#             else:
#                 sentence_tokens[word] = 0

#         dot_prod = 0
#         mag_s = 0
#         mag_a = 0
#         for word in tokens:
#             dot_prod += answer_tokens[word] * sentence_tokens[word]
#             mag_s += sentence_tokens[word] ** 2
#             mag_a += answer_tokens[word] ** 2

#         mag_s = math.sqrt(mag_s)
#         mag_a = math.sqrt(mag_a)
#         if mag_s * mag_a == 0:
#             return 0
#         else:
#             similarity = dot_prod / (mag_s * mag_a)
#             return similarity#round(similarity,4)

#     ################### Entity Extraction ####################
#     def unigram_entity_extraction(df, sentence_col_name, new_col_name, answer):
#         """
#         This breaks the answer into words using spaces
#         and then creates one feature based on each word
#         """
#         answer = answer.replace(".","").lower().strip()
#         answer = answer.translate(answer.maketrans("","", string.punctuation))
#         answer = answer.replace("  "," ")
#         # Break sentence into list
#         answer_list = answer.split(" ")

#         for word in answer_list:
#             #Goes across each row
#             df[f'{new_col_name}_has_{word}'] = df[sentence_col_name].apply(lambda sent: int(word in sent))

#         return df

#     # Bigram and Trigram still broken
#     # Basically, they aren't treating the sentence and answer the same
#     def bigram_entity_extraction(df, sentence_col_name, new_col_name, answer):

#         answer = clean_sentence(answer)
#         # Create list of bigrams for answer
#         bigram_answer = create_list_of_bigrams(answer)

#         for bigram in bigram_answer:
#             bigram_ = bigram.replace(" ", "_")
#             df[f'{new_col_name}_has_{bigram_}'] = df[sentence_col_name].apply(lambda sent: int(bigram in sent))

#         return df

#     def create_list_of_bigrams(sentence):
#         sentence = clean_sentence(sentence)
#         sentence_list = sentence.split(" ")
#         bigram_list = []

#         #For each word in sentence, but needed the index
#         for i in range(len(sentence_list)):
#             # Guard against an index-out-of-bounds error
#             if i < len(sentence_list)-1:
#                 bigram_list.append(f"{sentence_list[i]} {sentence_list[i+1]}")
#     #     print(bigram_list)
#         return bigram_list

#     def trigram_entity_extraction(df, sentence_col_name, new_col_name, answer):
#         answer = clean_sentence(answer)
#         trigram_answer = create_list_of_trigrams(answer)

#         for trigram in trigram_answer:
#             trigram_ = trigram.replace(" ", "_")
#             df[f'{new_col_name}_has_{trigram_}'] = df[sentence_col_name].apply(lambda sent: int(trigram in sent))

#         return df

#     def create_list_of_trigrams(sentence):
#         sentence = clean_sentence(sentence)
#         sentence_list = sentence.split(" ")
#         trigram_list = []

#         #For each word in sentence, but needed the index
#         for i in range(len(sentence_list)):
#             # Guard against an index-out-of-bounds error
#             if i < len(sentence_list)-2:
#                 trigram_list.append(f"{sentence_list[i]} {sentence_list[i+1]} {sentence_list[i+2]}")

#         return trigram_list

#     ################### Transforming ####################
#     def scale_column(sc, count):
#         # Train the scaler on the old data
#         # Use the sc to transform the sentence
#         return sc.transform(np.array([[count]]))[0][0]

#     def save_feature_set(self, df, idx_start, path, filename):
#         left = df.iloc[:, :idx_start]
#         right = df.iloc[:, idx_start:]

#         left.to_csv(
#             path + filename + 'doc.csv',
#             index = False
#         )
#         print("Saved document data")
#         right.to_csv(
#             path + filename + 'data.csv',
#             index = False
#         )
#         print("Saved numerical data")

    def create_features(self, data):
        """Build text variants and similarity features for one answer and
        add them to ``self.new_answers``.

        Steps visible in this method:
          1. Derive stop-word-free, stemmed, lemmatized and ordered variants
             of ``self.teacher_answer``.
          2. Fill a ``log`` dict with the same variants of ``answer``,
             scaled word and sentence counts, and generic / Jaccard / cosine
             similarities between the stemmed variants.
          3. Append ``log`` to ``self.new_answers``.
          4. Run unigram, bigram and trigram entity extraction over
             ``self.new_answers`` for the stemmed and stemmed-ordered
             columns.

        NOTE(review): ``sf`` and ``answer`` are not defined anywhere in the
        visible source, and the ``data`` parameter is never read here.
        ``self.teacher_answer``, ``self.word_scaler``, ``self.sent_scaler``
        and ``self.new_answers`` are not set by ``__init__``. Confirm where
        these come from; presumably ``answer`` was meant to be ``data`` and
        ``sf`` a helper module with the commented-out functions above.

        Args:
            data: Unused in the visible body (see note above).

        Returns:
            None. No return statement is visible; results are stored on
            ``self.new_answers``.
        """
        a_stopwords = sf.remove_stopwords(self.teacher_answer)
        a_stemmed = sf.stem_sentence(a_stopwords)
        # NOTE(review): built from a_stemmed, so this equals a_stemmed_ordered;
        # a_stopwords was probably intended. The value is not used below.
        a_stopwords_ordered = sf.order_sentence(a_stemmed)
        a_stemmed_ordered = sf.order_sentence(a_stemmed)
        # NOTE(review): a_lem and a_lem_ordered are not used in the visible code.
        a_lem = sf.lemmatize_sentence(self.teacher_answer)
        a_lem_ordered = sf.order_sentence(a_lem)
        # Same order as types_of_sentences below; the two lists are zipped.
        teacher_answers = [
            a_stemmed,
            a_stemmed_ordered
        ]
        
        # Change sentence into multiple versions
        log = dict()
        log['student_answer'] = answer
        log['teacher_answer'] = self.teacher_answer
        log['q_answer'] = answer
        log['q_answer_ordered'] = sf.order_sentence(log['q_answer'])
        log['q_stopwords'] = sf.remove_stopwords(answer)
        log['q_stopwords_ordered'] = sf.order_sentence(log['q_stopwords'])
        log['q_stemmed'] = sf.stem_sentence(answer)
        log['q_stem_ordered'] = sf.order_sentence(log['q_stemmed'])
        log['q_lemm'] = sf.lemmatize_sentence(answer)
        log['q_lemm_ordered'] = sf.order_sentence(log['q_lemm'])
        
        # Might need to save scaling until just before modeling
        # Each raw count is stored, then overwritten by its scaled value.
        log['wordcount'] = sf.word_count(answer)
        log['wordcount'] = sf.scale_column(self.word_scaler, log['wordcount'])
        log['sentence_count'] = sf.sentence_count(answer)
        log['sentence_count'] = sf.scale_column(self.sent_scaler, log['sentence_count'])
        # Original note: "same fix as before" (presumably the scaling caveat above)


        # Stem similarity
        log['stem_g_similarity'] = sf.generic_similarity(log['q_stemmed'], a_stemmed)
        log['stem_j_similarity'] = sf.jaccard_similarity(log['q_stemmed'], a_stemmed)
        log['stem_c_similarity'] = sf.cosine_similarity(log['q_stemmed'], a_stemmed)
        # Ordered
        log['stem_ordered_g_similarity'] =  sf.generic_similarity(log['q_stem_ordered'], a_stemmed_ordered)
        log['stem_ordered_j_similarity'] =  sf.jaccard_similarity(log['q_stem_ordered'], a_stemmed_ordered)
        log['stem_ordered_c_similarity'] =  sf.cosine_similarity(log['q_stem_ordered'], a_stemmed_ordered)

  
        # Appending New Answer
        # NOTE(review): assumes self.new_answers is a pandas DataFrame (confirm).
        # DataFrame.append was removed in pandas 2.0; on current pandas use
        # pd.concat([self.new_answers, pd.DataFrame([log])], ignore_index=True).
        self.new_answers = self.new_answers.append(log, ignore_index = True)
        
        # Entity Extraction
        types_of_sentences = [
            'q_stemmed',
            'q_stem_ordered',
        ]
        
        # Pair each sentence column with the matching teacher-answer variant;
        # the column name is also used as the prefix for the new columns.
        for sent_type, teach_ans in zip(types_of_sentences, teacher_answers):
            
            self.new_answers = sf.unigram_entity_extraction(self.new_answers, sent_type, sent_type, teach_ans)
            self.new_answers = sf.bigram_entity_extraction(self.new_answers, sent_type, sent_type, teach_ans)
            self.new_answers = sf.trigram_entity_extraction(self.new_answers, sent_type, sent_type, teach_ans)