In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, f1_score

import keras
import tensorflow as tf

from nltk.corpus import stopwords
import re

from sklearn.model_selection import train_test_split

from bs4 import BeautifulSoup             

2023-11-06 15:57:04.067072: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
train = pd.read_csv("data/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
test = pd.read_csv("data/testData.tsv", header=0, delimiter="\t", quoting=3)
unlabeled_train = pd.read_csv("data/unlabeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

unlabeled_train.head()

Unnamed: 0,id,review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was..."
1,"""45057_0""","""I saw this film about 20 years ago and rememb..."
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B..."
3,"""7161_0""","""I went to see this film with a great deal of ..."
4,"""43971_0""","""Yes, I agree with everyone on this site this ..."


In [3]:
def review_to_words( raw_review ):
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review).get_text() 
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    #
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( meaningful_words ))  

In [4]:
# Get the number of reviews based on the dataframe column size
num_reviews = unlabeled_train["review"].size

print("Cleaning and parsing the training set movie reviews...\n")

clean_train_reviews = []
for i in range( 0, num_reviews ):
    # If the index is evenly divisible by 1000, print a message
    if( (i+1)%1000 == 0 ):
        print("Review %d of %d\n" % ( i+1, num_reviews ))                                                                    
    clean_train_reviews.append( review_to_words( unlabeled_train["review"][i] ))

# add clean reviews to train data frame
unlabeled_train['clean_review'] = clean_train_reviews

Cleaning and parsing the training set movie reviews...



  review_text = BeautifulSoup(raw_review).get_text()


Review 1000 of 50000

Review 2000 of 50000

Review 3000 of 50000

Review 4000 of 50000

Review 5000 of 50000

Review 6000 of 50000

Review 7000 of 50000

Review 8000 of 50000

Review 9000 of 50000

Review 10000 of 50000

Review 11000 of 50000

Review 12000 of 50000

Review 13000 of 50000

Review 14000 of 50000

Review 15000 of 50000

Review 16000 of 50000

Review 17000 of 50000

Review 18000 of 50000

Review 19000 of 50000

Review 20000 of 50000

Review 21000 of 50000

Review 22000 of 50000

Review 23000 of 50000

Review 24000 of 50000

Review 25000 of 50000

Review 26000 of 50000

Review 27000 of 50000

Review 28000 of 50000

Review 29000 of 50000

Review 30000 of 50000

Review 31000 of 50000

Review 32000 of 50000

Review 33000 of 50000

Review 34000 of 50000

Review 35000 of 50000

Review 36000 of 50000

Review 37000 of 50000

Review 38000 of 50000

Review 39000 of 50000

Review 40000 of 50000

Review 41000 of 50000

Review 42000 of 50000

Review 43000 of 50000

Review 44000 of 5000

In [5]:
unlabeled_train.head()

Unnamed: 0,id,review,clean_review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was...",watching time chasers obvious made bunch frien...
1,"""45057_0""","""I saw this film about 20 years ago and rememb...",saw film years ago remember particularly nasty...
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B...",minor spoilersin new york joan barnard elvire ...
3,"""7161_0""","""I went to see this film with a great deal of ...",went see film great deal excitement school dir...
4,"""43971_0""","""Yes, I agree with everyone on this site this ...",yes agree everyone site movie bad even call mo...


In [6]:
print("Creating the bag of words...\n")

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = CountVectorizer(analyzer = "word", 
                             max_features = 5000) 

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
train_data_features = vectorizer.fit_transform(clean_train_reviews)

# Numpy arrays are easy to work with, so convert the result to an 
# array
train_data_features = train_data_features.toarray()

Creating the bag of words...



In [7]:
from sklearn.cluster import KMeans 

# From the graph we can see that the optimal number of clusters is 3
clusterNum = 2
k_means = KMeans(init = "k-means++", n_clusters = clusterNum, n_init = 12)
k_means.fit(train_data_features)
labels = k_means.labels_
print(labels)

[1 1 1 ... 0 1 1]


In [8]:
# assign the labels to the data
unlabeled_train["sentiment"] = labels
unlabeled_train.head()

Unnamed: 0,id,review,clean_review,sentiment
0,"""9999_0""","""Watching Time Chasers, it obvious that it was...",watching time chasers obvious made bunch frien...,1
1,"""45057_0""","""I saw this film about 20 years ago and rememb...",saw film years ago remember particularly nasty...,1
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B...",minor spoilersin new york joan barnard elvire ...,1
3,"""7161_0""","""I went to see this film with a great deal of ...",went see film great deal excitement school dir...,1
4,"""43971_0""","""Yes, I agree with everyone on this site this ...",yes agree everyone site movie bad even call mo...,1


In [10]:
X_train,X_test,y_train,y_test=train_test_split(train_data_features, unlabeled_train["sentiment"], stratify=unlabeled_train['sentiment'], test_size=0.3, random_state=42)

In [11]:
print("Training the random forest...")
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators = 100) 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest = forest.fit( X_train, y_train )

ypred = forest.predict(X_test)

print("Accuracy: ", accuracy_score(y_test, ypred))

Training the random forest...
Accuracy:  0.9063333333333333


In [12]:
# Verify that there are 25,000 rows and 2 columns
print(test.shape)

# Create an empty list and append the clean reviews one by one
num_reviews = len(test["review"])
clean_test_reviews = [] 

print("Cleaning and parsing the test set movie reviews...\n")
for i in range(0, num_reviews):
    if( (i+1) % 1000 == 0 ):
        print("Review %d of %d\n" % (i+1, num_reviews))
    clean_review = review_to_words( test["review"][i] )
    clean_test_reviews.append( clean_review )

# Get a bag of words for the test set, and convert to a numpy array
test_data_features = vectorizer.transform(clean_test_reviews)
test_data_features = test_data_features.toarray()

(25000, 2)
Cleaning and parsing the test set movie reviews...



  review_text = BeautifulSoup(raw_review).get_text()


Review 1000 of 25000

Review 2000 of 25000

Review 3000 of 25000

Review 4000 of 25000

Review 5000 of 25000

Review 6000 of 25000

Review 7000 of 25000

Review 8000 of 25000

Review 9000 of 25000

Review 10000 of 25000

Review 11000 of 25000

Review 12000 of 25000

Review 13000 of 25000

Review 14000 of 25000

Review 15000 of 25000

Review 16000 of 25000

Review 17000 of 25000

Review 18000 of 25000

Review 19000 of 25000

Review 20000 of 25000

Review 21000 of 25000

Review 22000 of 25000

Review 23000 of 25000

Review 24000 of 25000

Review 25000 of 25000



In [13]:
# Use the random forest to make sentiment label predictions
forest_final = RandomForestClassifier(n_estimators = 100) 
forest_final = forest_final.fit( train_data_features, unlabeled_train["sentiment"] )
result = forest_final.predict(test_data_features)

# Copy the results to a pandas dataframe with an "id" column and
# a "sentiment" column
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )

# Use pandas to write the comma-separated output file
output.to_csv( "data/BoWunlabeledData.csv", index=False, quoting=3 )