In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [2]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


In [4]:
data = pd.read_csv('IMDB Dataset.csv')

In [5]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [34]:
len(data)

35000

In [6]:
data['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [7]:
data = data.sample(35000, random_state=21)

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35000 entries, 26733 to 25758
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     35000 non-null  object
 1   sentiment  35000 non-null  object
dtypes: object(2)
memory usage: 820.3+ KB


In [9]:
data['sentiment'] = data['sentiment'].replace({'positive': 1, 'negative': 0})

In [10]:
import re
# Function to clean html tags
def clean_html(text):
    clean = re.compile('<.*?>')
    return re.sub(clean,'', text)

In [11]:
data['review'] = data['review'].apply(clean_html)

In [12]:
def convert_lower(text):
    return text.lower()

In [14]:
data['review'] = data['review'].apply(convert_lower)

In [15]:
def remove_special(text):
    x=''
    
    for i in text:
        if i.isalnum():
            x=x+i
        else:
            x = x+''
    return x        

In [16]:
data['review'] = data['review'].apply(remove_special)

In [17]:
import nltk

from nltk.corpus import stopwords

In [18]:
def remove_stopwords(text):
    x=[]
    for i in text.split():
        
        if i not in stopwords.words('english'):
            x.append(i)
            
    y=x[:]
    x.clear()
    return y

In [19]:
data['review'] = data['review'].apply(remove_stopwords)

In [20]:
data.head()

Unnamed: 0,review,sentiment
26733,[iwonderhowmanyminicooperautomobilesweresoldth...,1
33996,[aparnasens15parkavenueisafilmaboutnatureofrea...,1
20423,[althoughitsdefinitelyanenjoyablewaytospendaco...,1
17716,[unlikewhatonereviewersaidthisisnotaripoffofma...,0
49103,[whileitstruethatthemovieissomewhatinteresting...,0


In [39]:
data['sentiment'].value_counts()

1    17591
0    17409
Name: sentiment, dtype: int64

In [38]:
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [22]:
y=[]
def stem_words(text):
    for i in text:
        y.append(ps.stem(i))
    z=y[:]
    y.clear()
    return z

In [23]:
data['review'] = data['review'].apply(stem_words)

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
data['review'] = data['review'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)

# Apply TfidfVectorizer
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(data['review']).toarray()

In [25]:
y = data['sentiment'].values

In [26]:
X.shape

(35000, 5000)

In [27]:
y.shape

(35000,)

In [28]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

In [50]:
train_data = pd.DataFrame({'review': x_train.tolist(), 'sentiment': y_train.tolist()})

# Combine the test features and target into a single DataFrame
test_data = pd.DataFrame({'review': x_test.tolist(), 'sentiment': y_test.tolist()})

# Save the training data to a CSV file
train_data.to_csv('imdb_train.csv', index=False)

# Save the test data to a CSV file
test_data.to_csv('imdb_test.csv', index=False)



In [29]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

model1 = GaussianNB()
model2 = MultinomialNB()
model3 = BernoulliNB()
model4 = LogisticRegression()

In [30]:
model1.fit(x_train, y_train)
prediction1 = model1.predict(x_test)
accuracy1 = accuracy_score(y_test, prediction1)
print(accuracy1)

0.5087142857142857


In [31]:
model2.fit(x_train, y_train)
prediction2 = model2.predict(x_test)
accuracy2 = accuracy_score(y_test, prediction2)
print(accuracy2)

0.5087142857142857


In [32]:
model3.fit(x_train, y_train)
prediction3 = model3.predict(x_test)
accuracy3 = accuracy_score(y_test, prediction3)
print(accuracy3)

0.5087142857142857


In [33]:
model4.fit(x_train, y_train)
prediction4 = model4.predict(x_test)
accuracy4 = accuracy_score(y_test, prediction3)
print(accuracy4)

0.5087142857142857
