In [7]:
import csv
import math
import random
import numpy as np
import pandas as pd
#load data from csv
data= pd.read_csv(open("Your own path",errors='ignore'))
#drop usless data
df=data[["type","review","label"]][data['label']!='unsup']

In [8]:
df

Unnamed: 0,type,review,label
0,test,Once again Mr. Costner has dragged out a movie...,neg
1,test,This is an example of why the majority of acti...,neg
2,test,"First of all I hate those moronic rappers, who...",neg
3,test,Not even the Beatles could write songs everyon...,neg
4,test,Brass pictures (movies is not a fitting word f...,neg
...,...,...,...
49995,train,"Seeing as the vote average was pretty low, and...",pos
49996,train,"The plot had some wretched, unbelievable twist...",pos
49997,train,I am amazed at how this movie(and most others ...,pos
49998,train,A Christmas Together actually came before my t...,pos


In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
#lemmatizer=WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def data_preprocessing(df,stop_word,stem):
    df_tmp=df.copy()
    for i in range(len(df)):
        word_tokens = nltk.wordpunct_tokenize(df.review[i].lower())
        if(stop_word==True):
            df_tmp.review[i] = [w for w in word_tokens if not w in stop_words]
        else:
            df_tmp.review[i] = word_tokens
        if(stem==True):
            for j in range(len(df_tmp.review[i])):
                df_tmp.review[i][j] =  PorterStemmer().stem(df_tmp.review[i][j])
        if(df_tmp.label[i]=='pos'):
            df_tmp.label[i]=1
        else:
            df_tmp.label[i]=0
    train_data=df_tmp[df_tmp.type=='train']
    test_data=df_tmp[df_tmp.type=='test']
    return train_data,test_data

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\i7546\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [11]:
#use neither stop words or lemmatization
train_data1,test_data1=data_preprocessing(df,stop_word=False,stem=False)
train_data1=train_data1.reset_index()

In [12]:
#use only stop words
train_data2,test_data2=data_preprocessing(df,stop_word=True,stem=False)
train_data2=train_data2.reset_index()

In [13]:
#use stop words and lemmatization
train_data3,test_data3=data_preprocessing(df,stop_word=True,stem=True)
train_data3=train_data3.reset_index()

In [14]:
def train(train_data):
    num={}#number of positive cases and negative cases
    p={}#probability of positive words and negative words
    v=set()#number of total words
    word_counts={}#number of positive words and negative words
    num['pos']=len(train_data[train_data.label==1])
    num['neg']=len(train_data[train_data.label==0])
    p['pos']=math.log(num['pos']/len(train_data))
    p['neg']=math.log(num['neg']/len(train_data))
    word_counts['pos'] = {}
    word_counts['neg'] = {}
    for i in range(len(train_data)):
        count_word={}
        for j in train_data.review[i]:
            count_word[j]=count_word.get(j,0.0)+1.0
        for x,y in count_word.items():
            if x not in v:
                v.add(x)
            if(train_data.label[i]==1):
                if x not in word_counts['pos']:
                    word_counts['pos'][x]=0.0
                else:
                    word_counts['pos'][x]+=y
            if(train_data.label[i]==0):
                if x not in word_counts['neg']:
                    word_counts['neg'][x]=0.0
                else:
                    word_counts['neg'][x]+=y
    return num,p,v,word_counts

In [15]:
num1,p1,v1,word_counts1=train(train_data1)

In [16]:
num2,p2,v2,word_counts2=train(train_data2)

In [17]:
num3,p3,v3,word_counts3=train(train_data3)

In [12]:
def predict(test_data,num,p,v,word_count):
    result=[]
    for i in range(len(test_data)):
        count_word={}
        pos_score,neg_score=0,0
        for j in test_data.review[i]:#count test review's words
            count_word[j]=count_word.get(j,0.0)+1.0
        for m,n in count_word.items(): 
            pos_score+=math.log((word_count['pos'].get(m,0.0)+1)/(num['pos']+len(v)))
            neg_score+=math.log((word_count['neg'].get(m,0.0)+1)/(num['neg']+len(v)))
        pos_score+=p['pos']
        neg_score+=p['neg']
        if pos_score>neg_score:
            result.append(1)
        else:
            result.append(0)
    return result

In [13]:
pred1=predict(test_data1,num1,p1,v1,word_counts1)

In [14]:
pred2=predict(test_data2,num2,p2,v2,word_counts2)

In [15]:
pred3=predict(test_data3,num3,p3,v3,word_counts3)

In [16]:
def f1(pred,labels):
    TP,FP,FN=0,0,0
    for i in range(len(pred)):
        if(pred[i]==labels[i]):
            TP=TP+1
        elif(pred[i]!=labels[i] and pred[i]==1):
            FP=FP+1
        elif(pred[i]!=labels[i] and pred[i]==0):
            FN=FN+1
    precision=TP/(TP+FP)
    recall=TP/(TP+FN)
    result=2*precision*recall/(precision+recall)
    return result

In [17]:
print("F1-Score:"+str(f1(pred1, test_data1.label)))

F1-Score:0.9077484326190007


In [18]:
print("F1-Score:"+str(f1(pred2, test_data2.label)))

F1-Score:0.9111498257839722


In [19]:
print("F1-Score:"+str(f1(pred3, test_data3.label)))

F1-Score:0.9081777486625179
