In [1]:
import os
import pandas as pd

In [2]:
import os
import pandas as pd

# Load the dataset
def load_data(directory):
    data = {'review': [], 'sentiment': []}
    for sentiment in ['pos', 'neg']:
        path = os.path.join(directory, sentiment)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as f:
                data['review'].append(f.read())
                data['sentiment'].append(1 if sentiment == 'pos' else 0)
    return pd.DataFrame(data)

train_data = load_data("train")

In [3]:
train_data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     25000 non-null  object
 1   sentiment  25000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 390.8+ KB


In [5]:
# i. Convert feature variable to lower case

train_data['text'] = train_data['review'].str.lower()

In [6]:
# ii. Tokenization

from nltk.tokenize import word_tokenize
train_data['text'] = train_data['text'].apply(word_tokenize)

In [7]:
# iii. Punctuation removal

import string

def remove_punctuation(token):
    return [i for i in token if i not in string.punctuation]
train_data['text'] = train_data['text'].apply(remove_punctuation)

In [8]:
"""
text = "I love %^Python & in a way, but. / is not ? rewmet @# all in all  !"

clean = text.translate(str.maketrans('','',string.punctuation))
print(clean)

the first str ('') specifies characters that need to be replaced
the second str ('') specifies characters with which they are to be replaced
the third str (string.punctuation) specifies xters that are to be deleted

"""

'\ntext = "I love %^Python & in a way, but. / is not ? rewmet @# all in all  !"\n\nclean = text.translate(str.maketrans(\'\',\'\',string.punctuation))\nprint(clean)\n\nthe first str (\'\') specifies characters that need to be replaced\nthe second str (\'\') specifies characters with which they are to be replaced\nthe third str (string.punctuation) specifies xters that are to be deleted\n\n'

In [9]:
# v. Remove stopwords

from nltk.corpus import stopwords
remove_stopwords = stopwords.words('english')

def remove_words(tokens):
    return [i for i in tokens if i not in remove_stopwords]
train_data['text'] = train_data['text'].apply(remove_words)

In [10]:
# iv. Lemmatization   - I opted for Lemmatization in place of Stemming

from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()

def lemmatize_data(token):
    return [lemma.lemmatize(i) for i in token]
train_data['text'] = train_data['text'].apply(lemmatize_data)

In [11]:
# vi. Feature Extraction

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vectorizer = CountVectorizer()
train_data['text'] = train_data['text'].apply(lambda x: ' '.join(x))
X = vectorizer.fit_transform(train_data['text'])

print(vectorizer.vocabulary_)



In [12]:
# i. Split dataset into train and test sets

from sklearn.model_selection import train_test_split
y = train_data.sentiment
x_train, x_val, y_train, y_val = train_test_split(X, y, train_size = 0.9, random_state = 42)

In [13]:
# ii. Apply Naive Bayes Algorithm

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report

model = MultinomialNB()
model.fit(x_train, y_train)

In [14]:
test_data = load_data("test")

In [15]:
test_data['sentiment'].value_counts()

sentiment
1    12500
0    12500
Name: count, dtype: int64

In [16]:
test_data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [17]:
test_data['test'] = test_data['review'].str.lower()

test_data['test'] = test_data['test'].apply(word_tokenize)

In [18]:
test_data['test'] = test_data['test'].apply(remove_punctuation)

In [19]:
test_data['test'] = test_data['test'].apply(remove_words)

In [20]:
test_data['test'] = test_data['test'].apply(lemmatize_data)

In [21]:
test_data['test'] = test_data['test'].apply(lambda x: ' '.join(x))
x_test = vectorizer.fit_transform(test_data['test'])
y_test = test_data.sentiment

In [22]:
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))

ValueError: X has 68253 features, but MultinomialNB is expecting 69303 features as input.