In [1]:
# Import the IMDb and AG News datasets from Hugging Face and storing them as pandas dataframes
from datasets import load_dataset # IMDB and AG News datasets
import pandas as pd # pandas dataframes
import numpy as np # obtaining accuracy
from sklearn.metrics import confusion_matrix
imdb_dataset = load_dataset("imdb")
imdb_train = pd.DataFrame(imdb_dataset['train'])
imdb_test = pd.DataFrame(imdb_dataset['test'])
ag_news_dataset = load_dataset("ag_news")
ag_news_train = pd.DataFrame(ag_news_dataset['train'])
ag_news_test = pd.DataFrame(ag_news_dataset['test'])

# Illustrating the first 5 observations in the training and test sets
print(imdb_train.head())
print(imdb_test.head())
print(ag_news_train.head())
print(ag_news_test.head())

# Generating a sample of observations from each dataset
# Can remove later
imdb_train = imdb_train.groupby('label').apply(lambda x: x.sample(1000, random_state=123))
imdb_test = imdb_test.groupby('label').apply(lambda x: x.sample(1000, random_state=123))
ag_news_train = ag_news_train.groupby('label').apply(lambda x: x.sample(1000, random_state=123))
ag_news_test = ag_news_test.groupby('label').apply(lambda x: x.sample(1000, random_state=123))

Reusing dataset imdb (C:\Users\Jean-Pierre\.cache\huggingface\datasets\imdb\plain_text\1.0.0\2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

Using custom data configuration default
Reusing dataset ag_news (C:\Users\Jean-Pierre\.cache\huggingface\datasets\ag_news\default\0.0.0\bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


  0%|          | 0/2 [00:00<?, ?it/s]

                                                text  label
0  I rented I AM CURIOUS-YELLOW from my video sto...      0
1  "I Am Curious: Yellow" is a risible and preten...      0
2  If only to avoid making this type of film in t...      0
3  This film was probably inspired by Godard's Ma...      0
4  Oh, brother...after hearing about this ridicul...      0
                                                text  label
0  I love sci-fi and am willing to put up with a ...      0
1  Worth the entertainment value of a rental, esp...      0
2  its a totally average film with a few semi-alr...      0
3  STAR RATING: ***** Saturday Night **** Friday ...      0
4  First off let me say, If you haven't enjoyed a...      0
                                                text  label
0  Wall St. Bears Claw Back Into the Black (Reute...      2
1  Carlyle Looks Toward Commercial Aerospace (Reu...      2
2  Oil and Economy Cloud Stocks' Outlook (Reuters...      2
3  Iraq Halts Oil Exports from Main Sout

In [2]:
print(imdb_train.head())
print(imdb_test.head())
print(ag_news_train.head())
print(ag_news_test.head())

                                                          text  label
label                                                                
0     9271   Half Past Dead, starring Steven Seagal in the ...      0
      7395   Being a Film studies graduate I would like to ...      0
      793    Towards the end of this thriller Ally Sheedy's...      0
      9374   this movie was banned in england? why? tom sav...      0
      10712  The movie was not a waste except for some bori...      0
                                                          text  label
label                                                                
0     9271   It is not uncommon for a celebrity to be faced...      0
      7395   I like Kevin Bacon and Cathy Moriarty, and I l...      0
      793    Every great romantic comedy needs conflict bet...      0
      9374   This film has absolutely no redeeming features...      0
      10712  As a nice anecdote to one of the above comment...      0
                    

In [3]:
# Creating the document-term matrices
from sklearn.feature_extraction.text import CountVectorizer
imdb_train_docs = imdb_train['text'].tolist()
vectoriser = CountVectorizer()
imdb_train_dtm = vectoriser.fit_transform(imdb_train_docs)
imdb_train_dtm = pd.DataFrame(imdb_train_dtm.toarray(), columns=vectoriser.get_feature_names())
imdb_test_docs = imdb_test['text'].tolist()
imdb_test_dtm = vectoriser.transform(imdb_test_docs)
imdb_test_dtm = pd.DataFrame(imdb_test_dtm.toarray(), columns=vectoriser.get_feature_names())

In [4]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
imdb_nb = nb.fit(imdb_train_dtm, imdb_train['label'].tolist())
imdb_nb_preds = imdb_nb.predict(imdb_test_dtm)
print(np.mean(imdb_nb_preds == imdb_test['label'].tolist()))
confusion_matrix(imdb_test['label'].tolist(), imdb_nb_preds)

0.5875


array([[603, 397],
       [428, 572]], dtype=int64)

In [5]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver = 'liblinear')
imdb_logreg = logreg.fit(imdb_train_dtm, imdb_train['label'].tolist())
imdb_logreg_preds = imdb_logreg.predict(imdb_test_dtm)
print(np.mean(imdb_logreg_preds == imdb_test['label'].tolist()))
confusion_matrix(imdb_test['label'].tolist(), imdb_logreg_preds)

0.827


array([[827, 173],
       [173, 827]], dtype=int64)

In [6]:
from sklearn.linear_model import SGDClassifier
svm = SGDClassifier(random_state=123)
imdb_svm = svm.fit(imdb_train_dtm, imdb_train['label'].tolist())
imdb_svm_preds = imdb_svm.predict(imdb_test_dtm)
print(np.mean(imdb_svm_preds == imdb_test['label'].tolist()))
confusion_matrix(imdb_test['label'].tolist(), imdb_svm_preds)

0.8245


array([[846, 154],
       [197, 803]], dtype=int64)

In [7]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=123)
imdb_rf = rf.fit(imdb_train_dtm, imdb_train['label'].tolist())
imdb_rf_preds = imdb_rf.predict(imdb_test_dtm)
print(np.mean(imdb_rf_preds == imdb_test['label'].tolist()))
confusion_matrix(imdb_test['label'].tolist(), imdb_rf_preds)

0.8145


array([[810, 190],
       [181, 819]], dtype=int64)