In [1]:
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import accuracy_score, recall_score, precision_score

In [2]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\raman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\raman\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\raman\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
def load_data(filename):
    return pd.read_csv(filename)

train_data = load_data('Train.csv')
test_data = load_data('Test.csv')
valid_data = load_data('Valid.csv')

In [4]:
def analyze_data(data):
    print("Data Analysis:")
    print("Number of rows:", len(data))
    print("Positive reviews:", len(data[data['label'] == 1]))
    print("Negative reviews:", len(data[data['label'] == 0]))
    print("Null values in 'text' column:", data['text'].isnull().sum())
    print("Null values in 'label' column:", data['label'].isnull().sum())

print("Train Data:")
analyze_data(train_data)

print("\nTest Data:")
analyze_data(test_data)

print("\nValidation Data:")
analyze_data(valid_data)

Train Data:
Data Analysis:
Number of rows: 40000
Positive reviews: 19981
Negative reviews: 20019
Null values in 'text' column: 0
Null values in 'label' column: 0

Test Data:
Data Analysis:
Number of rows: 5000
Positive reviews: 2505
Negative reviews: 2495
Null values in 'text' column: 0
Null values in 'label' column: 0

Validation Data:
Data Analysis:
Number of rows: 5000
Positive reviews: 2514
Negative reviews: 2486
Null values in 'text' column: 0
Null values in 'label' column: 0


In [5]:
def preprocess_text(text):
    # Remove punctuation
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)

    # Tokenize
    words = nltk.word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word.lower() not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)

train_data['text'] = train_data['text'].apply(preprocess_text)
test_data['text'] = test_data['text'].apply(preprocess_text)
valid_data['text'] = valid_data['text'].apply(preprocess_text)

In [6]:
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(train_data['text'])
y_train = train_data['label']

X_test = tfidf_vectorizer.transform(test_data['text'])
y_test = test_data['label']


In [7]:
n_jobs = -1
k_fold_splits = 5
random_forest_classifier = RandomForestClassifier(n_jobs=n_jobs)
random_forest_classifier.fit(X_train, y_train)

In [8]:
y_pred_test = random_forest_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_test)
recall = recall_score(y_test, y_pred_test)
precision = precision_score(y_test, y_pred_test)

print("\nTest Set Metrics:")
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)


Test Set Metrics:
Accuracy: 0.8578
Recall: 0.8530938123752495
Precision: 0.8616935483870968


In [9]:
X_valid = tfidf_vectorizer.transform(valid_data['text'])
y_valid = valid_data['label']
y_pred_valid = random_forest_classifier.predict(X_valid)

accuracy_valid = accuracy_score(y_valid, y_pred_valid)
recall_valid = recall_score(y_valid, y_pred_valid)
precision_valid = precision_score(y_valid, y_pred_valid)

print("\nValidation Set Metrics:")
print("Accuracy:", accuracy_valid)
print("Recall:", recall_valid)
print("Precision:", precision_valid)


Validation Set Metrics:
Accuracy: 0.8438
Recall: 0.8448687350835322
Precision: 0.8445328031809145


In [10]:
def predict_sentiment(review):
    # Preprocess the user input review
    preprocessed_review = preprocess_text(review)

    # Vectorize the review using TF-IDF
    X_review = tfidf_vectorizer.transform([preprocessed_review])

    # Make a prediction using the trained classifier
    prediction = random_forest_classifier.predict(X_review)

    # Return the result as a string
    if prediction[0] == 1:
        return "Positive"
    else:
        return "Negative"


In [1]:
# Test the function with user input
user_review = input("Enter your review: ")
print("Entered review:",user_review)
sentiment = predict_sentiment(user_review)
print("The review is:", sentiment)

Entered review: first half was good, seconf half was cringe, overall one time watch


NameError: name 'predict_sentiment' is not defined