# HARSHITA MAHESH HIREMATH

In [1]:
import pandas as pd
import spacy
import numpy as np
from collections import Counter
from itertools import tee
from scipy.stats import wilcoxon
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np


from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
def tokenize(doc, lemmatized=False, remove_stopword=False, 
                   remove_punct = True, pos_tag = False):
    """Split a text into lower-cased tokens using spaCy.

    Parameters
    ----------
    doc : str
        Raw text to tokenize.
    lemmatized : bool
        When True, emit each token's lemma instead of its surface text.
    remove_stopword : bool
        When True, skip tokens spaCy flags as stop words.
    remove_punct : bool
        When True, skip tokens spaCy flags as punctuation.
    pos_tag : bool
        When True, each item is a ``(token, POS tag)`` tuple instead of a
        plain string.

    Returns
    -------
    list
        Tokens in document order; tokens that are empty after stripping
        whitespace are dropped.
    """
    # Loading the spaCy pipeline is expensive, so it is loaded on the first
    # call only and stored on the function object for later calls.
    nlp = getattr(tokenize, "_nlp", None)
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")
        tokenize._nlp = nlp

    tokens = []
    for token in nlp(doc):
        if remove_stopword and token.is_stop:
            continue
        if remove_punct and token.is_punct:
            continue

        token_text = token.lemma_ if lemmatized else token.text

        # Lower-case and drop tokens that are only whitespace
        cleaned_token = token_text.lower().strip()
        if not cleaned_token:
            continue

        tokens.append((cleaned_token, token.pos_) if pos_tag else cleaned_token)
    return tokens

In [3]:
def pairwise(iterable):
    """Return an iterator of consecutive overlapping pairs.

    For items ``s0, s1, s2, ...`` the result yields ``(s0, s1), (s1, s2), ...``.
    Inputs with fewer than two items yield nothing.
    """
    leading, trailing = tee(iterable, 2)
    # Drop the first item of the second copy so the two copies are offset by one.
    for _ in trailing:
        break
    return zip(leading, trailing)

def compute_concreteness(doc):
    """Score how "concrete" a text is from its part-of-speech tags.

    The text is tokenized with POS tags (punctuation kept, no lemmatization,
    no stop-word removal) and the score is::

        (#DET tokens + #ADP tokens + 2 * #(ADJ, NOUN) bigrams) / #non-PUNCT tokens

    Parameters
    ----------
    doc : str
        Raw text to analyse.

    Returns
    -------
    tuple
        ``(concreteness, articles, adpositions, adj_noun_bigrams)`` where
        ``articles`` and ``adpositions`` are lists of ``(token, tag)`` tuples
        and ``adj_noun_bigrams`` is a list of
        ``((adj, 'ADJ'), (noun, 'NOUN'))`` pairs. ``concreteness`` is 0.0 when
        the text has no non-punctuation tokens.
    """
    # Tokenize the document and retrieve POS tags
    tokens = tokenize(doc, lemmatized=False, remove_stopword=False,
                      remove_punct=False, pos_tag=True)

    # Only DET tokens count as articles; ADP tokens are collected separately
    # so they are not counted twice in the score.
    articles = [(text, tag) for text, tag in tokens if tag == "DET"]
    adpositions = [(text, tag) for text, tag in tokens if tag == "ADP"]

    # Adjacent token pairs where an adjective is directly followed by a noun
    adj_noun_bigrams = [
        (first, second)
        for first, second in pairwise(tokens)
        if first[1] == "ADJ" and second[1] == "NOUN"
    ]

    # Punctuation is excluded from the denominator
    non_punct_count = sum(1 for _, tag in tokens if tag != "PUNCT")
    if non_punct_count == 0:
        # Empty or punctuation-only text: avoid dividing by zero
        concreteness = 0.0
    else:
        weighted_hits = len(articles) + len(adpositions) + 2 * len(adj_noun_bigrams)
        concreteness = weighted_hits / non_punct_count

    return concreteness, articles, adpositions, adj_noun_bigrams

In [4]:
def answer_quality(gen_tokens, ref_tokens):
    """Compare generated answers with reference answers by n-gram overlap.

    For each (generated, reference) pair, precision and recall are computed
    separately for unigrams and bigrams and then averaged. Overlap uses
    clipped counts: an n-gram that occurs k times in one answer and m times
    in the other contributes min(k, m). This keeps the numerator and the
    denominator on the same footing, so two identical answers always score
    1.0 even when they contain repeated n-grams.

    Parameters
    ----------
    gen_tokens : list of list of str
        Tokenized generated answers.
    ref_tokens : list of list of str
        Tokenized reference answers, aligned with ``gen_tokens`` by position.
        If the two lists differ in length, the extra items are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per answer pair with columns ``Precision`` and ``Recall``.
    """
    def _overlap_scores(gen_items, ref_items):
        # Precision and recall from the clipped count of shared items
        shared = sum((Counter(gen_items) & Counter(ref_items)).values())
        precision = shared / len(gen_items) if gen_items else 0
        recall = shared / len(ref_items) if ref_items else 0
        return precision, recall

    rows = []
    for gen_answer, ref_answer in zip(gen_tokens, ref_tokens):
        gen_answer, ref_answer = list(gen_answer), list(ref_answer)

        precision_bigrams, recall_bigrams = _overlap_scores(
            list(zip(gen_answer, gen_answer[1:])),
            list(zip(ref_answer, ref_answer[1:])),
        )
        precision_unigrams, recall_unigrams = _overlap_scores(gen_answer, ref_answer)

        # Average the unigram and bigram scores
        rows.append({
            'Precision': (precision_bigrams + precision_unigrams) / 2,
            'Recall': (recall_bigrams + recall_unigrams) / 2,
        })

    return pd.DataFrame(rows, columns=['Precision', 'Recall'])

In [5]:
def compute_tf_idf(tokenized_docs):
    """Build an L2-normalized, IDF-smoothed TF-IDF matrix from token lists.

    The tokens are passed to the vectorizer unchanged through a callable
    analyzer. Joining them into a string and letting the vectorizer split
    the text again would discard one-character tokens and re-split tokens
    that contain punctuation, so the matrix would no longer reflect the
    tokenization the caller chose.

    Parameters
    ----------
    tokenized_docs : list of list of str
        One token list per document.

    Returns
    -------
    numpy.ndarray
        Dense array with one row per document and one column per distinct
        token. Each non-empty row has unit L2 norm; a document with no
        tokens yields an all-zero row instead of NaN.
    """
    docs = [list(tokens) for tokens in tokenized_docs]

    # A callable analyzer receives each document and returns its tokens as-is
    tfidf_vectorizer = TfidfVectorizer(analyzer=lambda doc_tokens: doc_tokens,
                                       smooth_idf=True)
    tfidf_matrix = tfidf_vectorizer.fit_transform(docs).toarray()

    # Normalize each row by its L2 norm, leaving all-zero rows untouched
    row_norms = np.linalg.norm(tfidf_matrix, axis=1, keepdims=True)
    smoothed_tf_idf = np.divide(tfidf_matrix, row_norms,
                                out=np.zeros_like(tfidf_matrix),
                                where=row_norms != 0)

    return smoothed_tf_idf

In [6]:
def assess_similarity(question_tokens, gen_tokens, ref_tokens):
    """Measure pairwise cosine similarity between questions and answers.

    Each token list is joined with spaces, run through the spaCy pipeline,
    and represented by the resulting document vector. Row ``i`` of the
    result compares question ``i``, generated answer ``i`` and human answer
    ``i`` with each other, so all three columns describe the same triple.

    Parameters
    ----------
    question_tokens, gen_tokens, ref_tokens : list of list of str
        Tokenized questions, generated answers and human answers, aligned by
        position. All three lists must have the same length.

    Returns
    -------
    pandas.DataFrame
        One row per question with columns ``Question/Generated Answer``,
        ``Question/Human Answer`` and ``Generated Answer/Human Answer``.

    Raises
    ------
    ValueError
        If the three inputs do not have the same length.
    """
    if not (len(question_tokens) == len(gen_tokens) == len(ref_tokens)):
        raise ValueError(
            "question_tokens, gen_tokens and ref_tokens must have the same length"
        )

    nlp = spacy.load("en_core_web_sm")

    def _embed(token_lists):
        # One document vector per token list
        return [nlp(' '.join(tokens)).vector for tokens in token_lists]

    def _cosine(u, v):
        # Cosine similarity; 0.0 when either vector has zero norm (avoids NaN)
        denominator = np.linalg.norm(u) * np.linalg.norm(v)
        return float(np.dot(u, v) / denominator) if denominator else 0.0

    question_embeddings = _embed(question_tokens)
    gen_answer_embeddings = _embed(gen_tokens)
    ref_answer_embeddings = _embed(ref_tokens)

    columns = ['Question/Generated Answer',
               'Question/Human Answer',
               'Generated Answer/Human Answer']
    rows = [
        {
            'Question/Generated Answer': _cosine(question_vec, gen_vec),
            'Question/Human Answer': _cosine(question_vec, ref_vec),
            'Generated Answer/Human Answer': _cosine(gen_vec, ref_vec),
        }
        for question_vec, gen_vec, ref_vec in zip(question_embeddings,
                                                  gen_answer_embeddings,
                                                  ref_answer_embeddings)
    ]
    result = pd.DataFrame(rows, columns=columns)

    return result

In [7]:
import language_tool_python
from textblob import TextBlob

In [8]:
tool = language_tool_python.LanguageTool('en-US')
data = pd.read_csv("qa.csv")


def count_grammar_errors(text):
    """Return the number of matches `tool.check` reports for `text`."""
    return len(tool.check(text))


# Limit the comparison to the first five answers of each kind
human_answers = data["human_answer"].head(5)
chatgpt_answers = data["chatgpt_answer"].head(5)

# One match count per answer
human_errors = list(map(count_grammar_errors, human_answers))
chatgpt_errors = list(map(count_grammar_errors, chatgpt_answers))

# Show the two lists of counts one after the other
print("Human Errors:", human_errors)
print("ChatGPT Errors:", chatgpt_errors)

Human Errors: [6, 78, 5, 5, 16]
ChatGPT Errors: [1, 2, 1, 2, 5]


In [9]:
nlp = spacy.load("en_core_web_sm")

def analyze_modality(text):
    """Count strong and weak modality adverbs in a text.

    Matching is case-insensitive, so a capitalised word such as a
    sentence-initial "Maybe" is counted as well.

    Parameters
    ----------
    text : str
        Text to analyse.

    Returns
    -------
    tuple of int
        ``(strong_modality_count, weak_modality_count)``.
    """
    strong_modality_words = {"definitely", "certainly", "surely"}
    weak_modality_words = {"maybe", "possibly", "probably"}

    strong_modality_count = 0
    weak_modality_count = 0
    for token in nlp(text):
        word = token.text.lower()
        if word in strong_modality_words:
            strong_modality_count += 1
        elif word in weak_modality_words:
            weak_modality_count += 1
    return strong_modality_count, weak_modality_count

# Analyze human-generated answers and ChatGPT-generated answers
human_answers = data["human_answer"][:5]
chatgpt_answers = data["chatgpt_answer"][:5]

human_modality = [analyze_modality(answer) for answer in human_answers]
chatgpt_modality = [analyze_modality(answer) for answer in chatgpt_answers]

# Compare modality
print("Human Modality:", human_modality)
print("ChatGPT Modality:", chatgpt_modality)

Human Modality: [(0, 0), (0, 0), (0, 0), (0, 0), (1, 0)]
ChatGPT Modality: [(0, 0), (0, 0), (0, 0), (0, 0), (0, 0)]


In [10]:
def analyze_subjectivity(text):
    """Return the subjectivity value of TextBlob's sentiment for `text`."""
    return TextBlob(text).sentiment.subjectivity


# Limit the comparison to the first five answers of each kind
human_answers = data["human_answer"].head(5)
chatgpt_answers = data["chatgpt_answer"].head(5)

# One subjectivity value per answer
human_subjectivity = list(map(analyze_subjectivity, human_answers))
chatgpt_subjectivity = list(map(analyze_subjectivity, chatgpt_answers))

# Show the two lists of values one after the other
print("Human Subjectivity:", human_subjectivity)
print("ChatGPT Subjectivity:", chatgpt_subjectivity)

Human Subjectivity: [0.45, 0.3566495066495067, 0.5429824561403509, 0.44000000000000006, 0.4841269841269841]
ChatGPT Subjectivity: [0.6178062678062679, 0.3941176470588235, 0.5264485514485515, 0.3969065656565656, 0.453505291005291]


## Test

In [11]:
if __name__ == "__main__":
    # Driver: runs each function defined above against qa.csv and prints the results.
    data = pd.read_csv("qa.csv")
    
    print("First Question\n")
    
    print(data["question"].iloc[0] + "\n")

    # Tokenize the first question under four option combinations
    print(f"1.lemmatized=False, remove_stopword=False, remove_punct = True,  pos_tag = False:\n \
    {tokenize(data['question'].iloc[0], lemmatized=False, remove_stopword=False, remove_punct = True, pos_tag = False)}\n")

    print(f"2.lemmatized=False, remove_stopword=False, remove_punct = True,  pos_tag = True:\n \
    {tokenize(data['question'].iloc[0], lemmatized=False, remove_stopword=False, remove_punct = True, pos_tag = True)}\n")

    print(f"3.lemmatized=True, remove_stopword=True, remove_punct = True, pos_tag = False:\n \
    {tokenize(data['question'].iloc[0], lemmatized=True, remove_stopword=True, remove_punct = True, pos_tag = False)}\n")

    print(f"4.lemmatized=True, remove_stopword=True, remove_punct = True, pos_tag = True:\n \
    {tokenize(data['question'].iloc[0], lemmatized=True, remove_stopword=True, remove_punct = True, pos_tag = True)}\n")
 
    print("Second Question\n")
    concreteness, articles, adpositions, quantifier = compute_concreteness(data["question"].iloc[1])
    print(f"Question: {data['question'].iloc[1]} \n\nConcreteness: {concreteness :.4f} \n\nArticles:  {articles} \n\nAdpositions: {adpositions} \n\n(ADJ, NOUNS): {quantifier}")
    
    print("Third Question\n")
    # Punctuation is kept here so the n-grams cover the full answers
    gen_tokens = [tokenize(answer, lemmatized=False, remove_stopword=False, remove_punct=False, pos_tag=False) for answer in data['chatgpt_answer'][:5]]
    ref_tokens = [tokenize(answer, lemmatized=False, remove_stopword=False, remove_punct=False, pos_tag=False) for answer in data['human_answer'][:5]]

    result = answer_quality(gen_tokens, ref_tokens)
    print(result.head())
    
    print("Fourth Question\n")
    # Test tfidf generation using questions
    question_tokens = [tokenize(answer, lemmatized=False, remove_stopword=False, remove_punct=True, pos_tag=False) for answer in data['question']]

    # Configuration: lemmatized=False, remove_stopword=False, remove_punct = True, pos_tag = False
    dtm = compute_tf_idf(question_tokens)
    # Report (rows, columns) to match the "Shape" label; .size would print the element count
    print(f"1.lemmatized=False, remove_stopword=False, remove_punct = True:\n \
    Shape: {dtm.shape}\n")

    # Configuration: lemmatized=True, remove_stopword=True, remove_punct = True, pos_tag = False
    question_tokens = [tokenize(answer, lemmatized=True, remove_stopword=True, remove_punct=True, pos_tag=False) for answer in data['question']]
    dtm = compute_tf_idf(question_tokens)
    print(f"2.lemmatized=True, remove_stopword=True, remove_punct = True:\n \
    Shape: {dtm.shape}\n")
    
    print("Fifth Question\n")
    # Tokenizer settings to compare, in the order their results are printed
    similarity_configs = [
        dict(lemmatized=False, remove_stopword=False, remove_punct=True, pos_tag=False),
        dict(lemmatized=False, remove_stopword=False, remove_punct=False, pos_tag=False),
        dict(lemmatized=True, remove_stopword=False, remove_punct=False, pos_tag=False),
    ]
    for options in similarity_configs:
        question_tokens = [tokenize(answer, **options) for answer in data['question'][:5]]
        gen_tokens = [tokenize(answer, **options) for answer in data['chatgpt_answer'][:5]]
        ref_tokens = [tokenize(answer, **options) for answer in data['human_answer'][:5]]

        result = assess_similarity(question_tokens, 
                                   gen_tokens, 
                                   ref_tokens)
        print(result.head())

First Question

What happens if a parking ticket is lost / destroyed before the owner is aware of the ticket , and it goes unpaid ? I 've always been curious . Please explain like I'm five.

1.lemmatized=False, remove_stopword=False, remove_punct = True,  pos_tag = False:
     ['what', 'happens', 'if', 'a', 'parking', 'ticket', 'is', 'lost', 'destroyed', 'before', 'the', 'owner', 'is', 'aware', 'of', 'the', 'ticket', 'and', 'it', 'goes', 'unpaid', 'i', 've', 'always', 'been', 'curious', 'please', 'explain', 'like', 'i', "'m", 'five']

2.lemmatized=False, remove_stopword=False, remove_punct = True,  pos_tag = True:
     [('what', 'PRON'), ('happens', 'VERB'), ('if', 'SCONJ'), ('a', 'DET'), ('parking', 'NOUN'), ('ticket', 'NOUN'), ('is', 'AUX'), ('lost', 'VERB'), ('destroyed', 'VERB'), ('before', 'SCONJ'), ('the', 'DET'), ('owner', 'NOUN'), ('is', 'AUX'), ('aware', 'ADJ'), ('of', 'ADP'), ('the', 'DET'), ('ticket', 'NOUN'), ('and', 'CCONJ'), ('it', 'PRON'), ('goes', 'VERB'), ('unpaid', 'A