In [None]:
import pandas as pd

In [None]:
# Load the generated results of each model; the 'answer' column of every frame
# is read further down in the notebook.
bloomz = pd.read_csv('bloomz_results.csv')
flan = pd.read_csv('flan_results.csv')
gptneo = pd.read_csv('neo_results.csv')
gpt2 = pd.read_csv('gpt_results.csv')
# Malformed lines in opt_results.csv are still skipped (best effort), but
# 'warn' reports every dropped line instead of discarding it silently, so the
# reader can see why opt ends up with fewer rows than the other models.
opt = pd.read_csv('opt_results.csv', on_bad_lines='warn')

In [None]:
# Take the 'label' column of the bloomz results and show its distinct values.
bloomz_labels = bloomz.loc[:, 'label']
pd.unique(bloomz_labels)

array(['public health', 'social issues', 'domestic policy',
       'environmental policy', 'foreign policy', 'economy and taxation',
       'immigration', 'education policy'], dtype=object)

In [None]:
# Row count of each result frame, in the order bloomz, flan, gptneo, gpt2, opt.
tuple(len(frame) for frame in (bloomz, flan, gptneo, gpt2, opt))

(100, 100, 100, 100, 99)

In [None]:
# Keep only the 'answer' column of each model's results.
bloomz_answers, flan_answers, gptneo_answers, gpt2_answers, opt_answers = (
    frame['answer'] for frame in (bloomz, flan, gptneo, gpt2, opt)
)

In [None]:
# Number of answers per model, in the order bloomz, flan, gptneo, gpt2, opt.
tuple(
    len(model_answers)
    for model_answers in (bloomz_answers, flan_answers, gptneo_answers, gpt2_answers, opt_answers)
)

(100, 100, 100, 100, 99)

In [None]:
# Read train.csv and keep a handle on its 'text' column.
train_data = pd.read_csv(filepath_or_buffer='train.csv')
train_answers = train_data.loc[:, 'text']

In [None]:
# How many rows of train.csv fall under each value of the 'party' column.
train_data.loc[:, 'party'].value_counts()

party,count
democrat,95677
republican,94223
neutral,3429
independent,142


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Fetch the NLTK resource named 'punkt_tab'.
# NOTE(review): presumably the tokenizer data word_tokenize needs on newer NLTK releases — confirm.
nltk.download('punkt_tab')

In [None]:
# NOTE(review): 'punkt' looks like the older name of the tokenizer data; it may be
# redundant next to 'punkt_tab' — confirm against the installed NLTK version.
nltk.download('punkt')
# Stopword corpus read by stopwords.words('english') when the word-cloud stop list is built.
nltk.download('stopwords')

In [None]:
def avg_answer_length(answers):
    """Return the mean character length of ``answers``.

    Args:
        answers: Sized iterable of answers (e.g. a list or a pandas Series).
            Entries that are ``None`` or NaN count as empty answers
            (length 0) instead of raising ``TypeError``.

    Returns:
        The mean of ``len(answer)`` over all entries as a float, or ``0.0``
        when ``answers`` is empty (instead of raising ``ZeroDivisionError``).
    """
    answer_count = len(answers)
    if answer_count == 0:
        return 0.0

    total_length = 0
    for answer in answers:
        # NaN is the only float that differs from itself; a missing cell in a
        # CSV read by pandas shows up as such a float.
        is_missing = answer is None or (isinstance(answer, float) and answer != answer)
        if not is_missing:
            total_length += len(answer)
    return total_length / answer_count

# Mean answer length per model, keyed by model name (insertion order is the plotting order).
avg_lengths = {
    model_name: avg_answer_length(model_answers)
    for model_name, model_answers in (
        ('bloomz', bloomz_answers),
        ('flan', flan_answers),
        ('gptneo', gptneo_answers),
        ('gpt2', gpt2_answers),
        ('opt', opt_answers),
    )
}

In [None]:
def avg_tokens_per_answer(answers, tokenizer=None):
    """Return the mean number of tokens per entry of ``answers``.

    Args:
        answers: Sized iterable of answers (e.g. a list or a pandas Series).
            Entries that are ``None`` or NaN contribute zero tokens instead
            of making the tokenizer raise.
        tokenizer: Optional callable mapping one answer to a sequence of
            tokens. Defaults to NLTK's ``word_tokenize``, which is what this
            function always used before the parameter existed.

    Returns:
        Total token count divided by ``len(answers)`` as a float, or ``0.0``
        when ``answers`` is empty (instead of raising ``ZeroDivisionError``).
    """
    answer_count = len(answers)
    if answer_count == 0:
        return 0.0

    # Resolve the default lazily so word_tokenize is only looked up when it is
    # actually needed.
    tokenize = word_tokenize if tokenizer is None else tokenizer

    total_tokens = 0
    for answer in answers:
        # NaN is the only float that differs from itself; a missing cell in a
        # CSV read by pandas shows up as such a float.
        if answer is None or (isinstance(answer, float) and answer != answer):
            continue
        total_tokens += len(tokenize(answer))
    return total_tokens / answer_count

# Mean token count per answer for every model, keyed by model name.
avg_tokens = {
    model_name: avg_tokens_per_answer(model_answers)
    for model_name, model_answers in (
        ('bloomz', bloomz_answers),
        ('flan', flan_answers),
        ('gptneo', gptneo_answers),
        ('gpt2', gpt2_answers),
        ('opt', opt_answers),
    )
}

In [None]:
# Fix the model order once, then look both metrics up by model name. Reading
# avg_tokens.values() directly would silently mislabel bars if the two dicts
# were ever built in a different key order; a lookup raises KeyError instead.
models = list(avg_lengths)
lengths = [avg_lengths[model_name] for model_name in models]
tokens = [avg_tokens[model_name] for model_name in models]

In [None]:
def _draw_model_bars(values, y_label, chart_title):
    """Show one bar chart with a bar per entry of ``models``.

    Args:
        values: Bar heights, aligned with ``models``.
        y_label: Text for the y axis.
        chart_title: Title drawn above the chart.
    """
    plt.figure(figsize=(10, 5))
    plt.bar(models, values, color=['skyblue', 'salmon', 'lightgreen', 'lightcoral', 'khaki'])
    plt.xlabel("Models")
    plt.ylabel(y_label)
    plt.title(chart_title)
    plt.show()


# First chart: characters per answer; second chart: tokens per answer.
_draw_model_bars(lengths, "Average Answer Length", "Comparison of Average Answer Lengths")
_draw_model_bars(tokens, "Average Tokens per Answer", "Comparison of Average Tokens per Answer")

In [None]:
# Pool the answers of all models into one flat list (order: bloomz, flan, gptneo, gpt2, opt).
all_answers = (
    bloomz_answers.tolist()
    + flan_answers.tolist()
    + gptneo_answers.tolist()
    + gpt2_answers.tolist()
    + opt_answers.tolist()
)
# Join the answers into the word-cloud corpus. Missing answers are left out:
# str() would turn each of them into the literal word "nan", which would then
# be counted as a real term in the cloud.
text = " ".join(str(answer) for answer in all_answers if pd.notna(answer))

In [None]:
# Extra filler words to exclude from the word cloud, on top of NLTK's English stop list.
additional_stopwords = {
    "like", "said", "would", "could", "also", "get", "one", "two", "three",
    "many", "much", "use", "using", "know", "people", "thing", "things",
    "way", "ways", "make", "made", "even", "still", "may", "might", "every",
    "really", "lot", "go", "going", "see", "seen", "think", "thought", "say",
    "says", "saying", "something", "someone", "time", "times", "look",
    "looked", "new", "give", "given", "right", "wrong", "take", "taken",
    "another", "different", "example", "question", "answer", "answers",
    "comprehensive",
}
# Final stop list: NLTK's English stopwords plus the extras above.
stop_words = set(stopwords.words('english')).union(additional_stopwords)

In [None]:
# Build the word cloud from the pooled answer text. Words on the stop list or
# shorter than three characters are ignored. random_state pins the otherwise
# random word placement so the figure is identical on every run.
wordcloud = WordCloud(width=800, height=400,
                      background_color='white',
                      stopwords=stop_words,
                      min_word_length=3,
                      min_font_size=10,
                      random_state=42).generate(text)

In [None]:
# Display the word cloud as an image without axes.
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
# Set the title before tight_layout: the layout pass only accounts for artists
# that already exist, so a title added afterwards is not given any room.
plt.title("Wordcloud for All Models")
plt.tight_layout(pad=0)
plt.show()