In [12]:
import pathlib
from collections import Counter

import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [13]:
# Load the tweet dataset and add the two columns the model cells use:
# 'text' mirrors 'tweets', and 'label' renames good/bad/neutral to
# positive/negative/neutral.
chatgpt = pd.read_csv('./file.csv').assign(
    text=lambda frame: frame['tweets'],
    label=lambda frame: frame['labels'].map(
        {'good': 'positive', 'bad':'negative', 'neutral':'neutral'}
    ),
)
chatgpt

Unnamed: 0.1,Unnamed: 0,tweets,labels,text,label
0,0,ChatGPT: Optimizing Language Models for Dialog...,neutral,ChatGPT: Optimizing Language Models for Dialog...,neutral
1,1,"Try talking with ChatGPT, our new AI system wh...",good,"Try talking with ChatGPT, our new AI system wh...",positive
2,2,ChatGPT: Optimizing Language Models for Dialog...,neutral,ChatGPT: Optimizing Language Models for Dialog...,neutral
3,3,"THRILLED to share that ChatGPT, our new model ...",good,"THRILLED to share that ChatGPT, our new model ...",positive
4,4,"As of 2 minutes ago, @OpenAI released their ne...",bad,"As of 2 minutes ago, @OpenAI released their ne...",negative
...,...,...,...,...,...
219289,219289,Other Software Projects Are Now Trying to Repl...,bad,Other Software Projects Are Now Trying to Repl...,negative
219290,219290,I asked #ChatGPT to write a #NYE Joke for SEOs...,good,I asked #ChatGPT to write a #NYE Joke for SEOs...,positive
219291,219291,chatgpt is being disassembled until it can onl...,bad,chatgpt is being disassembled until it can onl...,negative
219292,219292,2023 predictions by #chatGPT. Nothing really s...,bad,2023 predictions by #chatGPT. Nothing really s...,negative


In [15]:
# TF-IDF representation of the ChatGPT tweets used as training data.
vectorizer = TfidfVectorizer(
    min_df=2,  # ignore tokens that occur in fewer than 2 documents
    tokenizer=nltk.word_tokenize,  # tokenize with NLTK rather than the default regex
    token_pattern=None,  # not used once a tokenizer is given; None avoids scikit-learn's "unused parameter" warning
    stop_words=stopwords.words('english'),  # drop NLTK's English stopwords
)

# Learn the vocabulary and IDF weights, and vectorize the training tweets in one pass.
train_vectors = vectorizer.fit_transform(chatgpt['text'])

# Fit a multinomial Naive Bayes classifier on the TF-IDF features and sentiment labels.
clf = MultinomialNB().fit(train_vectors, chatgpt['label'])



In [16]:
# Load the evaluation set (tab-separated; the 'text' and 'sentiment' columns are used below).
evaluation = pd.read_csv(
    './test-datasets/sentiment-topic-final-test.tsv',
    sep='\t',
)

# Vectorize with the already-fitted vectorizer (transform, not fit_transform),
# then predict a label for each sentence.
eval_vectors = vectorizer.transform(evaluation['text'])
y_pred = clf.predict(eval_vectors)

# Per-class precision / recall / F1 against the gold labels, three decimals.
print(
    classification_report(
        y_true=evaluation['sentiment'],
        y_pred=y_pred,
        digits=3,
    )
)

              precision    recall  f1-score   support

    negative      0.400     0.667     0.500         3
     neutral      0.500     0.333     0.400         3
    positive      0.667     0.500     0.571         4

    accuracy                          0.500        10
   macro avg      0.522     0.500     0.490        10
weighted avg      0.537     0.500     0.499        10



In [17]:
# Put gold labels, predictions and the source sentences side by side
# so individual misclassifications can be inspected.
df = pd.DataFrame({
    'gold': evaluation['sentiment'],
    'pred': y_pred,
    'sentence': evaluation['text'],
})
df

Unnamed: 0,gold,pred,sentence
0,negative,negative,It took eight years for Warner Brothers to rec...
1,positive,positive,All the New York University students love this...
2,negative,negative,This Italian place is really trendy but they h...
3,positive,neutral,"In conclusion, my review of this book would be..."
4,neutral,negative,The story of this movie is focused on Carl Bra...
5,neutral,negative,Chris O'Donnell stated that while filming for ...
6,positive,positive,My husband and I moved to Amsterdam 6 years ag...
7,positive,negative,Dame Maggie Smith performed her role excellent...
8,neutral,neutral,The new movie by Mr. Kruno was shot in New Yor...
9,negative,positive,"I always have loved English novels, but I just..."
