In [1]:
import pandas as pd
import math
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk 
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swnet
stop = stopwords.words('english')

[nltk_data] Downloading package wordnet to C:\Users\Abdul
[nltk_data]     Hannan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
df = pd.read_csv('Movies_TV.txt', delimiter = '\t')
df.head()

Unnamed: 0,Domain,Label,Rating,Review
0,Movies_TV,POS,5,my boy love this film . sometime my youngest g...
1,Movies_TV,NEU,3,on my disk the last scene of episode 2 : New E...
2,Movies_TV,POS,4,I have a 4yr old son and he love this cartoon ...
3,Movies_TV,POS,5,this sequal be wonderful . the animation be ex...
4,Movies_TV,POS,5,I really hope sci-fi never take off the doctor...


In [3]:
# df['Review']

In [4]:
#filtering the data-set
df = df.replace('[""-:!",?.\n()]', '', regex=True)
df = df.replace("""["`']""", '', regex=True)
df['Review'] = df['Review'].str.lower()
df['Review'] = df['Review'].str.strip()

In [5]:
#removing the stopwords from the data-set so that we only get useful associations among words
df['Review'] = df['Review'].apply(lambda words: ' '.join([word for word in words.split() if word not in stop]))

In [6]:
df['Review'].head()

0    boy love film sometime youngest get scared cap...
1    disk last scene episode new earth miss instead...
2         yr old son love cartoon buy story intresting
3    sequal wonderful animation excellent though ma...
4    really hope scifi never take doctor best show ...
Name: Review, dtype: object

In [7]:
x_train, x_test, y_train, y_test = train_test_split(df['Review'], df['Rating'], test_size=0.33, random_state=42)

In [8]:
x_train.size

670

In [9]:
y_train.size

670

In [10]:
x_train.head()

703    relevance film strikingly frightening today un...
311    fly fly fly imagination take far away land nev...
722    well box claim get magic wonder suspense origi...
629    watch dr since tom baker show david tennent ar...
0      boy love film sometime youngest get scared cap...
Name: Review, dtype: object

In [None]:
#using sentiwordnet to classify 

In [11]:
#make a function that takes reviews as input and for every word in review
#it finds its positive and negative score. Keep count of all the positive, negative
#and neutral words so we can compute the rating using them.

In [25]:
total_reviews = x_train.size
synset1 = wn.synsets('positive')[0]
synset2 = wn.synsets('negative')[0]
def classifier():
    correctClassified = 0
    false_pos = 0
    for i in range(total_reviews):
        pos = neg = neut = 0
        review = df.iloc[i]['Review']
        review_size = 0
        tokenList = word_tokenize(''.join(review))
#         print(review)
        
        for token in tokenList:
            rating = 5 
            if not(wn.synsets(token)):
                continue
                
            word = wn.synsets(token)[0]
            synset = swnet.senti_synset(word.name())
            posDistance = synset.pos_score()
            negDistance = synset.neg_score()
            review_size = review_size + 1 
            
            if(posDistance > negDistance):
                pos = pos + 1
            elif(negDistance > posDistance):
                neg = neg + 1
            elif(posDistance == 0 and negDistance == 0):
                pos = pos + 1 # for neutral
                
#         print('Size', review_size)
#         print('+ve', pos)
#         print('-ve', neg)
        
        if(pos > neg):
            percentage = math.floor((neg / 100) * review_size)
#             percentage = math.ceil(neg / review_size)
            if(percentage > rating):
                percentage = percentage / pos
            deduct = math.floor(rating - percentage)
            updated_rating = deduct
           
            
        if(neg == 0):
            updated_rating = rating
        
        if(updated_rating == y_train.iloc[i]):
            correctClassified = correctClassified + 1
        else:
            false_pos = false_pos + 1
            
#         print('Actual Rating', y_train.iloc[i])
#         print('Rating', updated_rating)
        tokenList = ''
#         print("..........")
#     print('Corrected',correctClassified)
    print("Accuracy : ", (correctClassified / total_reviews * 1.0))
  

In [26]:
classifier()

Accuracy :  0.4059701492537313


In [None]:
#using different classifiers

In [38]:
#Reading data
data = open("Movies_TV.txt").read()
data = data.split('\n')
data.remove(data[0])
data.remove(data[-1])
data.remove(data[-1])

In [39]:
reviews = []
y = []
for item in data:
    _, _, rating, text = item.split('\t')
    reviews.append(text)
    y.append(rating)

In [75]:
from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer(max_df = 600, min_df = 5, ngram_range = (1,3), max_features = 80)
X = vec.fit_transform(reviews)
X = X.toarray()

In [76]:
# vec.vocabulary_

In [77]:
from sklearn.model_selection import train_test_split
trainX, testX, trainY, testY = train_test_split(X, y, shuffle = True, train_size=0.7)

In [78]:
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [79]:
lc = SGDClassifier()
nbc = GaussianNB()
dtc = DecisionTreeClassifier()
knnc = KNeighborsClassifier()

In [80]:
lc.fit(trainX, trainY)
nbc.fit(trainX, trainY)
dtc.fit(trainX, trainY)
knnc.fit(trainX, trainY)

KNeighborsClassifier()

In [81]:
pred_y_lc = lc.predict(testX)
pred_y_nbc = nbc.predict(testX)
pred_y_dtc = dtc.predict(testX)
pred_y_knnc = knnc.predict(testX)

In [82]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
lc_acc = accuracy_score(testY, pred_y_lc)
nbc_acc = accuracy_score(testY, pred_y_nbc)
dtc_acc = accuracy_score(testY, pred_y_dtc)
knnc_acc = accuracy_score(testY, pred_y_knnc)

In [83]:
print("Linear Classifier: ", lc_acc)
print("Naive Bayes Classifier: ", nbc_acc)
print("Decision Tree Classifier: ", dtc_acc)
print("KNN Classifier: ", knnc_acc)

Linear Classifier:  0.64
Naive Bayes Classifier:  0.43333333333333335
Decision Tree Classifier:  0.53
KNN Classifier:  0.62


In [84]:
from sklearn.metrics import precision_recall_fscore_support
Macro=precision_recall_fscore_support(testY, pred_y_lc, average='macro')
Micro=precision_recall_fscore_support(testY,pred_y_lc, average='micro')
Weighted=precision_recall_fscore_support(testY,pred_y_lc, average='weighted')

print("Classification Report for Linear Classifier")
print('.....................')
print('Macro')
print('Percision, recall, f1-score', Macro)
print('Micro')
print('Percision, recall, f1-score', Micro)
print('Weighted')
print('Percision, recall, f1-score', Weighted)

Classification Report for Linear Classifier
.....................
Macro
Percision, recall, f1-score (0.35210112710112706, 0.2900998199967272, 0.30221033868092695, None)
Micro
Percision, recall, f1-score (0.64, 0.64, 0.64, None)
Weighted
Percision, recall, f1-score (0.6109778284778284, 0.64, 0.611306357694593, None)


In [85]:
from sklearn.metrics import precision_recall_fscore_support
Macro=precision_recall_fscore_support(testY, pred_y_nbc, average='macro')
Micro=precision_recall_fscore_support(testY,pred_y_nbc, average='micro')
Weighted=precision_recall_fscore_support(testY,pred_y_nbc, average='weighted')

print("Classification Report for Naive Bayes")
print('.....................')
print('Macro')
print('Percision, recall, f1-score', Macro)
print('Micro')
print('Percision, recall, f1-score', Micro)
print('Weighted')
print('Percision, recall, f1-score', Weighted)

Classification Report for Naive Bayes
.....................
Macro
Percision, recall, f1-score (0.3040588130405064, 0.38309687448862706, 0.2967875381708436, None)
Micro
Percision, recall, f1-score (0.43333333333333335, 0.43333333333333335, 0.43333333333333335, None)
Weighted
Percision, recall, f1-score (0.6175245549644863, 0.43333333333333335, 0.48711419968693936, None)


In [86]:
from sklearn.metrics import precision_recall_fscore_support
Macro=precision_recall_fscore_support(testY, pred_y_dtc, average='macro')
Micro=precision_recall_fscore_support(testY,pred_y_dtc, average='micro')
Weighted=precision_recall_fscore_support(testY,pred_y_dtc, average='weighted')

print("Classification Report for Decision Tree")
print('.....................')
print('Macro')
print('Percision, recall, f1-score', Macro)
print('Micro')
print('Percision, recall, f1-score', Micro)
print('Weighted')
print('Percision, recall, f1-score', Weighted)

Classification Report for Decision Tree
.....................
Macro
Percision, recall, f1-score (0.2789665398091835, 0.27409998363606614, 0.27546768581909253, None)
Micro
Percision, recall, f1-score (0.53, 0.53, 0.53, None)
Weighted
Percision, recall, f1-score (0.5430537165979352, 0.53, 0.5359209946843667, None)


In [88]:
from sklearn.metrics import precision_recall_fscore_support
Macro=precision_recall_fscore_support(testY, pred_y_knnc, average='macro')
Micro=precision_recall_fscore_support(testY,pred_y_knnc, average='micro')
Weighted=precision_recall_fscore_support(testY,pred_y_knnc, average='weighted')

print("Classification Report for KNN")
print('.....................')
print('Macro')
print('Percision, recall, f1-score', Macro)
print('Micro')
print('Percision, recall, f1-score', Micro)
print('Weighted')
print('Percision, recall, f1-score', Weighted)

Classification Report for KNN
.....................
Macro
Percision, recall, f1-score (0.3140146381133098, 0.2808991981672394, 0.2865009882198486, None)
Micro
Percision, recall, f1-score (0.62, 0.62, 0.62, None)
Weighted
Percision, recall, f1-score (0.5931571338212704, 0.62, 0.6003479818196088, None)
