# Install/Import

In [None]:
import os
import numpy as np
import pandas as pd
import string
import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import urllib.request
import zipfile

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
# The pipeline loaded below is en_core_web_sm. If it is not installed, run:
#   !python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

import nltk
# Fetch the WordNet corpus (the notebook lemmatizes with WordNetLemmatizer).
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer
# Fetch the Open Multilingual WordNet data as well
# (presumably required by the installed NLTK WordNet reader; verify for your NLTK version).
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


True

# Mount Google Drive

In [None]:
# Colab-specific: mount Google Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Import Data

In [None]:
cd /content/drive/MyDrive/NLP Module Project

/content/drive/.shortcut-targets-by-id/16NFwxd7Q9SIt5z2tPHNwMUDjYvt-NV3e/NLP Module Project


In [None]:
# Load the labelled reviews; the path is relative to the current working directory.
data = pd.read_csv("./reviews.csv")
# Show the frame as the cell output (pandas truncates the preview of large frames).
data

Unnamed: 0,Id,Review,Label
0,0,good and interesting,5
1,1,"This class is very helpful to me. Currently, I...",5
2,2,like!Prof and TAs are helpful and the discussi...,5
3,3,Easy to follow and includes a lot basic and im...,5
4,4,Really nice teacher!I could got the point eazl...,4
...,...,...,...
107013,107013,Trendy topic with talks from expertises in the...,4
107014,107014,"Wonderful! Simple and clear language, good ins...",5
107015,107015,an interesting and fun course. thanks. dr quincy,5
107016,107016,"very broad perspective, up to date information...",4


# Prepare Dataset

In [None]:
# Model input is the review text; the target is the 'Label' column.
X = data['Review']
y = data['Label']

# Split dataset into train (80%) and test (20%).
# A fixed random_state makes the split, and therefore every accuracy figure
# reported further down, reproducible after Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Preview the training reviews (pandas truncates the display of a long Series).
X_train

2840                               Very informative course.
101291    Excellent professor. I´m glad I had the chance...
75324                             Good course for startup .
7649      Very helpful. Straightforward and practical. I...
54289     I found this course to be the most enjoyable s...
                                ...                        
46088     After the first two random videos I watched I ...
79647     I love this course! First of all, the teacher ...
106686    Challenging. The big puzzle combined out of 4 ...
74766     A little time-wasting for those who already ha...
39542     Dr. Chuck is always a pleasure to listen to. A...
Name: Review, Length: 85614, dtype: object

# Pre-process text

In [None]:
def tokenize(sentence, method='spacy'):
    """Normalise text into a space-joined string of lower-cased lemmas.

    The text is tokenized, stop words and punctuation-only tokens are
    removed, and the remaining tokens are lemmatized.

    Args:
        sentence: Raw text to process. Non-string values (for example NaN
            from an empty CSV cell) are treated as empty text.
        method: 'nltk' to use NLTK's word tokenizer and WordNet lemmatizer;
            any other value (default 'spacy') uses the spaCy pipeline `nlp`.

    Returns:
        str: The surviving tokens joined by single spaces ('' if none).
    """
    if not isinstance(sentence, str):
        return ""

    punctuation_chars = set(string.punctuation)

    def _keep(token):
        # Drop stop words and any token made up solely of punctuation
        # characters ("...", "!!", "``"). all() over an empty string is True,
        # so empty tokens left behind by strip() are dropped as well.
        # STOP_WORDS is used directly: it is already a set, so lookups are O(1).
        return token not in STOP_WORDS and not all(
            ch in punctuation_chars for ch in token
        )

    if method == 'nltk':
        # Tokenize, lower-casing so that capitalised stop words ("The", "This")
        # match the lower-case stop-word list; this mirrors the spaCy branch.
        tokens = [
            word.lower()
            for word in nltk.word_tokenize(sentence, preserve_line=True)
        ]
        # Remove stopwords and punctuation
        tokens = [word for word in tokens if _keep(word)]
        # Lemmatize
        wordnet_lemmatizer = WordNetLemmatizer()
        tokens = [wordnet_lemmatizer.lemmatize(word) for word in tokens]
    else:
        # The tokenizer always runs and is not a pipeline component. The
        # lemmatizer of the English pipelines relies on the POS tags produced
        # by tagger + attribute_ruler (fed by tok2vec), so keep those enabled
        # and switch off only the remaining components (parser, ner, ...).
        # Names missing from the loaded pipeline are simply ignored here.
        with nlp.select_pipes(
            enable=['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer']
        ):
            doc = nlp(sentence)
        # Lemmatize
        tokens = [token.lemma_.lower().strip() for token in doc]
        # Remove stopwords and punctuation
        tokens = [word for word in tokens if _keep(word)]

    return " ".join(tokens)

In [None]:
# X_train / X_test are Series, so build DataFrames from them instead of
# aliasing them: `series['processed_text'] = ...` would insert a single
# element labelled 'processed_text' into X_train itself rather than add a column.
train_df = X_train.to_frame(name='Review')
test_df = X_test.to_frame(name='Review')

# Register `progress_apply` on pandas objects (once is enough).
tqdm.pandas()

# Process the training set text
train_df['processed_text'] = train_df['Review'].progress_apply(
    lambda text: tokenize(text, method='nltk')
)

# Process the test set text
test_df['processed_text'] = test_df['Review'].progress_apply(
    lambda text: tokenize(text, method='nltk')
)

100%|██████████| 85614/85614 [00:31<00:00, 2706.76it/s]
100%|██████████| 21404/21404 [00:07<00:00, 2893.41it/s]


In [None]:
def build_features(train_data, test_data, ngram_range, method='count'):
    """Turn pre-processed text into n-gram feature matrices.

    The vectorizer is fitted on `train_data` only and then applied to
    `test_data`, so no vocabulary or IDF statistics leak from the test set.

    Args:
        train_data: Iterable of training documents (strings).
        test_data: Iterable of test documents (strings).
        ngram_range: (min_n, max_n) tuple forwarded to the vectorizer.
        method: 'tfidf' for TF-IDF weights; any other value (default
            'count') produces raw term counts.

    Returns:
        tuple: (train_features, test_features) as returned by the
        vectorizer's fit_transform / transform.
    """
    # Both vectorizers share the same interface, so only the class differs.
    vectorizer_cls = TfidfVectorizer if method == 'tfidf' else CountVectorizer
    vec = vectorizer_cls(ngram_range=ngram_range)

    # Use the arguments that were passed in rather than the notebook-level
    # train_df / test_df globals, so the function works on any input.
    train_features = vec.fit_transform(train_data)
    test_features = vec.transform(test_data)

    return train_features, test_features

In [None]:
# Feature settings: TF-IDF weighting over unigrams and bigrams
method, ngram_range = 'tfidf', (1, 2)

# Vectorize the processed train / test text
X_train, X_test = build_features(
    train_df['processed_text'],
    test_df['processed_text'],
    ngram_range,
    method,
)


# Train model

In [None]:
# Train a classification model using logistic regression classifier.
# SAGA is a stochastic solver, so random_state is pinned to obtain the same
# fitted model (and the same reported accuracy) on every run.
logreg_model = LogisticRegression(solver='saga', random_state=42)
logreg_model.fit(X_train, y_train)

# Accuracy on the data the model was fitted on: fraction of matching labels.
preds = logreg_model.predict(X_train)
acc = np.mean(preds == y_train)
print('Accuracy on the training set is {:.3f}'.format(acc))

Accuracy on the training set is 0.841


# Evaluate model

In [None]:
# Score the held-out split: share of predictions that equal the true label
test_preds = logreg_model.predict(X_test)
test_acc = (test_preds == y_test).mean()
print('Accuracy on the test set is {:.3f}'.format(test_acc))

Accuracy on the test set is 0.778
