In [6]:
# Importing the libraries
import pandas as pd #Library for dealing with the dataset
import re  # Library to clean the data
import nltk # Natural Language Tool Kit
from nltk.corpus import stopwords  # To Remove Stopwords
from nltk.stem.porter import PorterStemmer # For Stemming Purpose

In [7]:
# Load the movie-review dataset from a CSV file located in the working directory
data = pd.read_csv("movie.csv")

In [8]:
# Preview the first five rows to sanity-check the columns that were loaded
data.head()

Unnamed: 0,class,text
0,Pos,films adapted from comic books have had plent...
1,Pos,every now and then a movie comes along from a...
2,Pos,you ve got mail works alot better than it des...
3,Pos,jaws is a rare film that grabs your atte...
4,Pos,moviemaking is a lot like being the general m...


In [9]:
# Inspect the (rows, columns) dimensions of the dataset
data.shape

(2000, 2)

In [10]:
# Build `corpus`: one cleaned, stemmed string per review in data["text"]

# The stop-word set and the stemmer are identical for every review, so build
# them once up front instead of once per word / once per row
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []
# Iterate over the column itself rather than a hard-coded row count, so the
# cell keeps working if the dataset grows, shrinks or is re-indexed
for text in data["text"]:
    # Replace every character that is not an ASCII letter with a space
    review = re.sub('[^a-zA-Z]', " ", text)

    # Normalise to lower case and split on whitespace into individual words
    words = review.lower().split()

    # Drop stop words, then reduce each remaining word to its stem
    stems = [ps.stem(word) for word in words if word not in stop_words]

    # Re-join the stems into a single space-separated string
    review = ' '.join(stems)

    # Collect the cleaned review
    corpus.append(review)

In [11]:
# Creating the bag-of-words model
from sklearn.feature_extraction.text import CountVectorizer

# max_features is a constructor parameter that caps the vocabulary size:
# at most 2500 terms are kept as features (per the scikit-learn docs, the
# most frequent ones across the corpus)
cv = CountVectorizer(max_features = 2500)

In [34]:
# Splitting the dataset into features and label

# Features are the independent variables: the bag-of-words count matrix
# learned from the cleaned corpus, converted to a dense array
features = cv.fit_transform(corpus).toarray()

# label holds the answer (whether the review is positive or negative).
# Select the column by name rather than by position so the cell does not
# silently pick up a different column if the CSV layout changes.
label = data["class"].values

In [26]:
# Splitting the dataset into training set and test set
# NOTE: sklearn.cross_validation was removed from scikit-learn;
# train_test_split now lives in sklearn.model_selection
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible between runs
features_train, features_test, label_train, label_test = train_test_split(
    features, label, test_size=0.20, random_state=0
)

In [25]:
# Train a random-forest classifier on the training split
from sklearn.ensemble import RandomForestClassifier

# n_estimators is the number of trees in the forest; experiment with it to
# get a better result
clf = RandomForestClassifier(
    n_estimators=150,
    criterion='entropy',
    random_state=0,
)

# Fit the classifier on the training features and labels
clf.fit(features_train, label_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [27]:
# Predicting the labels of the held-out test set
label_pred = clf.predict(features_test)

In [28]:
# The confusion matrix breaks the predictions down per class, which
# complements the single accuracy figure computed afterwards

# Making the confusion matrix (true labels first, predictions second).
# NOTE: `cm` is only assigned here; it is not displayed by this cell.
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(label_test, label_pred)

In [29]:
# Fraction of test reviews whose predicted label matches the true label.
# Arguments follow scikit-learn's (y_true, y_pred) order, the same order
# used for the confusion_matrix call.
score = accuracy_score(label_test, label_pred)

In [30]:
# Model accuracy on the test set, expressed as a percentage
print(score*100)

84.5


In [31]:
# A new, unseen review to classify with the trained model
new = "I don't like this movie"

In [32]:
# Apply the same cleaning steps that were used to build the training corpus

# Build the stop-word set once rather than once per word
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

# Keep letters only, lower-case, split into words
review = re.sub('[^a-zA-Z]', ' ', new)
review = review.lower().split()

# Drop stop words, stem what remains and re-join into one string
review = ' '.join(ps.stem(word) for word in review if word not in stop_words)

# Vectorise with the already-fitted CountVectorizer (transform, not
# fit_transform). The raw text stays in `new` and the vector gets its own
# name, so this cell can be re-run without re.sub receiving an array.
new_features = cv.transform([review]).toarray()

# Predict the class of the new review
pred = clf.predict(new_features)


In [33]:
# Show the class predicted for the new review
print(pred)

['Neg']
