In [21]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler


In [22]:
import pandas as pd

# Load the dataset
data = pd.read_csv('/content/IMDB Dataset.csv')
data.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [23]:
import re

def normalize_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

data['review'] = data['review'].apply(normalize_text)


In [24]:
def remove_noise(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

data['review'] = data['review'].apply(remove_noise)


In [25]:
data.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


In [18]:
import nltk
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [26]:
# Text lemmatization
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

data['review'] = data['review'].apply(lemmatize_text)



In [27]:
data.head()

Unnamed: 0,review,sentiment
0,one of the other reviewer ha mentioned that af...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this wa a wonderful way to spend tim...,positive
3,basically there a family where a little boy ja...,negative
4,petter matteis love in the time of money is a ...,positive


In [12]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [28]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in stop_words])

data['review'] = data['review'].apply(remove_stopwords)


In [29]:
data.head()

Unnamed: 0,review,sentiment
0,one reviewer ha mentioned watching oz episode ...,positive
1,wonderful little production filming technique ...,positive
2,thought wa wonderful way spend time hot summer...,positive
3,basically family little boy jake think zombie ...,negative
4,petter matteis love time money visually stunni...,positive


In [30]:
data.to_csv('cleaned_imdb_reviews.csv', index=False)

In [31]:
df = pd.read_csv('/content/cleaned_imdb_reviews.csv')

In [32]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer()
X_count = count_vectorizer.fit_transform(df['review'])


In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['review'])


In [39]:
df.dropna(inplace=True)

In [40]:
df.head(5)

Unnamed: 0,review,sentiment
0,one reviewer ha mentioned watching oz episode ...,positive
1,wonderful little production filming technique ...,positive
2,thought wa wonderful way spend time hot summer...,positive
3,basically family little boy jake think zombie ...,negative
4,petter matteis love time money visually stunni...,positive


In [41]:
df.to_csv('processed_imdb_reviews.csv', index=False)

In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the pre-processed dataset
data = pd.read_csv('processed_imdb_reviews.csv')

# Split the dataset into training and testing sets
X = data['review']
y = data['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)  # Convert sentiments to binary labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the text data using TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limiting features for performance reasons
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)



In [43]:
# Train an SVM model on the training data
svm_model = SVC(kernel='linear', C=1.0, random_state=42)
svm_model.fit(X_train_tfidf, y_train)

In [44]:
# Evaluate the model on the test data
y_pred = svm_model.predict(X_test_tfidf)

# Print accuracy score and classification report
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8848
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.87      0.88      4961
           1       0.88      0.90      0.89      5039

    accuracy                           0.88     10000
   macro avg       0.89      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



In [45]:
# Save the model and vectorizer for future use
import joblib
joblib.dump(svm_model, 'svm_sentiment_model.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']