In [1]:
import pandas as pd
from nltk.corpus import stopwords

In [2]:
# Load the tab-separated review file; row 0 supplies the column names
df = pd.read_csv('Movies_TV.txt', sep = '\t', header = 0)

In [3]:
# function to sanitize the data (works for only one column, but is faster)
def sanitize_data(df, column_name, stop_words = None):
    """Return a copy of ``df`` with the text in ``column_name`` cleaned up.

    The column is trimmed, lower-cased, cleared of punctuation and digits,
    has newlines turned into spaces and has stop words removed. Only
    ``column_name`` is modified, and the frame passed in is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column.
    column_name : str
        Name of the text column to sanitize.
    stop_words : iterable of str, optional
        Words to drop from the text. When omitted, NLTK's English
        stop-word list is used.

    Returns
    -------
    pandas.DataFrame
        A new frame whose ``column_name`` holds the sanitized text.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_words = set(stop_words) # set lookup instead of scanning a list for every word

    text = df[column_name].str.strip().str.lower() # trim the text, then lowercase it

    # raw string so that \d and \. reach the regex engine as written
    text = text.str.replace(r'[!`:?";\d\.,()–-]', '', regex = True) # remove non-textual characters
    text = text.str.replace('\n', ' ', regex = False) # replace \n with a space

    # remove stop words; non-string entries (e.g. missing values) pass through unchanged
    text = text.apply(
        lambda x: ' '.join(word for word in x.split() if word not in stop_words)
        if isinstance(x, str) else x
    )

    # work on a copy so the caller's frame is never partially modified
    df = df.copy()
    df[column_name] = text
    return df

# Sanitize the review text and add a zero-filled column for the predictions
df = sanitize_data(df, 'Review').assign(**{'Predicted Rating': 0})
df

Unnamed: 0,Domain,Label,Rating,Review,Predicted Rating
0,Movies_TV,POS,5,boy love film sometime youngest get scared cap...,0
1,Movies_TV,NEU,3,disk last scene episode new earth miss instead...,0
2,Movies_TV,POS,4,yr old son love cartoon buy story intresting,0
3,Movies_TV,POS,5,sequal wonderful animation excellent though ma...,0
4,Movies_TV,POS,5,really hope scifi never take doctor best show ...,0
...,...,...,...,...,...
995,Movies_TV,POS,5,perfect way get insipred get workout let go te...,0
996,Movies_TV,POS,4,smith wrong prove movie make one wonder eye ''...,0
997,Movies_TV,POS,4,smith watch movie want shut blind turn compute...,0
998,Movies_TV,POS,5,really think quality could improve older film ...,0


In [4]:
from nltk.corpus import wordnet as wnet
from nltk.corpus import sentiwordnet as swnet
from math import ceil
from math import floor

In [5]:
threshold = 0.01 # threshold to ceil or floor the rating
objective_weight = 5 # 0-5 weight to give to objective scores in affecting the rating
num_of_reviews = len(df)

def predict_rating(review):
    """Estimate an integer rating for one review from SentiWordNet scores.

    The review is split on single spaces. Each token that has a WordNet
    synset is classified by its first synset: positive when its positive
    score exceeds its negative score, negative for the reverse, objective
    when the two are equal. The rating is the share of positive tokens
    times 5 plus the share of objective tokens times ``objective_weight``;
    tokens without a synset count towards the review length only.

    Parameters
    ----------
    review : str
        Review text.

    Returns
    -------
    int
        The rating, rounded up when its fractional part exceeds
        ``threshold`` and rounded down otherwise.
    """
    words = review.split(' ')
    pos_count, obj_count = 0, 0

    for token in words:
        synsets = wnet.synsets(token) # look the token up once and reuse the result
        if not synsets: continue
        senti_word = swnet.senti_synset(synsets[0].name())
        pos_score, neg_score = senti_word.pos_score(), senti_word.neg_score()

        if pos_score > neg_score:
            pos_count += 1
        elif pos_score == neg_score:
            obj_count += 1
        # tokens that lean negative contribute nothing to the rating

    # Positive percent + objective percent to make up the rating out of 5
    float_rating = (pos_count/len(words)) * 5 + (obj_count/len(words)) * objective_weight
    return ceil(float_rating) if (float_rating % 1) > threshold else floor(float_rating)

# Assign the whole column in one step: this is positional, so it does not depend
# on the index labels, and it avoids `.at`, which only accepts a scalar row label
df['Predicted Rating'] = [predict_rating(review) for review in df['Review']]

df

Unnamed: 0,Domain,Label,Rating,Review,Predicted Rating
0,Movies_TV,POS,5,boy love film sometime youngest get scared cap...,5
1,Movies_TV,NEU,3,disk last scene episode new earth miss instead...,4
2,Movies_TV,POS,4,yr old son love cartoon buy story intresting,5
3,Movies_TV,POS,5,sequal wonderful animation excellent though ma...,4
4,Movies_TV,POS,5,really hope scifi never take doctor best show ...,4
...,...,...,...,...,...
995,Movies_TV,POS,5,perfect way get insipred get workout let go te...,5
996,Movies_TV,POS,4,smith wrong prove movie make one wonder eye ''...,4
997,Movies_TV,POS,4,smith watch movie want shut blind turn compute...,5
998,Movies_TV,POS,5,really think quality could improve older film ...,5


In [6]:
def _ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0

# Exact matches / mismatches between the predicted and the actual rating
correctly_predicted = len(df[df['Predicted Rating'] == df['Rating']])
incorrectly_predicted = len(df[df['Predicted Rating'] != df['Rating']])

accuracy = _ratio(correctly_predicted, num_of_reviews)
print('Accuracy ->', accuracy)

# NOTE(review): the ratios below use the exact-match count as the "true positive"
# term, while false positives/negatives are defined by the > 3 split, so the two
# are not measured on the same scale. Confirm this is the intended definition.

# Since there is no boolean label to predict
# False positives will be actual rating below and equal to 3 that got a rating of above 3
false_positives = len(df[(df['Rating'] <= 3) & (df['Predicted Rating'] > 3)])
precision = _ratio(correctly_predicted, correctly_predicted + false_positives)
print('Precision ->', precision)

# False negatives will be actual rating above 3 that got a rating of below or equal to 3
false_negatives = len(df[(df['Rating'] > 3) & (df['Predicted Rating'] <= 3)])
# stored under its own name so the precision value above is not overwritten
recall = _ratio(correctly_predicted, correctly_predicted + false_negatives)
print('Recall ->', recall)

f1_score = _ratio(2 * correctly_predicted, 2 * correctly_predicted + false_positives + false_negatives)
print('F1 ->', f1_score)

Accuracy -> 0.492
Precision -> 0.743202416918429
Recall -> 0.9781312127236581
F1 -> 0.8446351931330472
