In [1]:
import nltk
import pandas as pd
import spacy
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
# prepare training dataset
print("Compiling training datasets...")
movie_df = pd.read_csv('./topic-datasets/movie_reviews.csv')
movie_df['topic'] = 'movie'
movie_df

restaurant_df = pd.read_csv(
    './topic-datasets/restaurant-reviews.tsv', sep='\t')
restaurant_df['text'] = restaurant_df['Review']
restaurant_df['topic'] = 'restaurant'
restaurant_df

book_df = pd.read_csv('./topic-datasets/book-reviews.csv')
book_df['text'] = book_df['ReviewContent']
book_df['topic'] = 'book'
book_df

# even out the number of instances of each topic
lowest_num_rows = min([len(movie_df), len(restaurant_df), len(book_df)])
mixed_df = pd.concat([
    movie_df[['text', 'topic']].sample(lowest_num_rows, axis='index'),
    restaurant_df[['text', 'topic']].sample(lowest_num_rows, axis='index'),
    book_df[['text', 'topic']].sample(lowest_num_rows, axis='index'),
])
mixed_df.to_csv('./topic-datasets/mixed-reviews.csv')

In [2]:
dataset = pd.read_csv('./topic-datasets/mixed-reviews.csv')
dataset

Unnamed: 0.1,Unnamed: 0,text,topic
0,17933,""" well , there's mr . myagi . . . """,movie
1,42520,"for ever , i sat in anticipation for a decent ...",movie
2,62218,it even shows them in bed together .,movie
3,48179,"schwarzenegger is awful , i mean really , real...",movie
4,11546,"rose goes out onto the ship's ledge , and we t...",movie
...,...,...,...
2995,2025,A good read from the first page. It is a myst...,book
2996,2939,"It's the same thing over and over and over! ""I...",book
2997,4625,"The first third of the book I found ponderous,...",book
2998,2293,The plot was intriguing with well developed ch...,book


In [3]:
vectorizer = TfidfVectorizer(min_df=1, tokenizer=nltk.word_tokenize)
vectorizer.fit(dataset['text']) # fit vectorizer to all data

# split dataframe into train and test data
train, test = train_test_split(dataset, test_size=0.1)

# vectorize data
train_vectors = vectorizer.transform(train['text'])
test_vectors = vectorizer.transform(test['text'])



In [4]:
# train a linear classifier
lsvc = svm.LinearSVC()
lsvc.fit(train_vectors, train['topic'])

# make predictions and generate classification report
y_pred = lsvc.predict(test_vectors)
print(classification_report(test['topic'], y_pred))

              precision    recall  f1-score   support

        book       0.99      0.96      0.98       105
       movie       0.91      0.93      0.92       100
  restaurant       0.91      0.92      0.91        95

    accuracy                           0.94       300
   macro avg       0.94      0.94      0.94       300
weighted avg       0.94      0.94      0.94       300



In [6]:
official_test_df = pd.read_csv('./test-datasets/sentiment-topic-final-test.tsv', sep='\t')
official_test_vectors = vectorizer.transform(official_test_df['text'])

# make predictions and generate classification report
official_y_pred = lsvc.predict(official_test_vectors)
print(classification_report(official_test_df['topic'], official_y_pred))

              precision    recall  f1-score   support

        book       1.00      1.00      1.00         2
       movie       0.83      1.00      0.91         5
  restaurant       1.00      0.67      0.80         3

    accuracy                           0.90        10
   macro avg       0.94      0.89      0.90        10
weighted avg       0.92      0.90      0.89        10



In [7]:
official_test_df['topic_pred'] = official_y_pred
official_test_df[['text', 'topic', 'topic_pred']]

Unnamed: 0,text,topic,topic_pred
0,It took eight years for Warner Brothers to rec...,movie,movie
1,All the New York University students love this...,restaurant,movie
2,This Italian place is really trendy but they h...,restaurant,restaurant
3,"In conclusion, my review of this book would be...",book,book
4,The story of this movie is focused on Carl Bra...,movie,movie
5,Chris O'Donnell stated that while filming for ...,movie,movie
6,My husband and I moved to Amsterdam 6 years ag...,restaurant,restaurant
7,Dame Maggie Smith performed her role excellent...,movie,movie
8,The new movie by Mr. Kruno was shot in New Yor...,movie,movie
9,"I always have loved English novels, but I just...",book,book


In [8]:
test_string = """
Write anything here to classify it as one of the 3 topics 
"""

test_vector = vectorizer.transform([test_string])
prediction = lsvc.predict(test_vector)
prediction

array(['movie'], dtype=object)