## Sentiment Analysis Model

In [None]:
import pandas as pd
import numpy as np
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import ast
import joblib

import nltk
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
def group_rating(rating):
    """Bucket a numeric rating into one of three classes.

    Returns 1 for ratings up to and including 4, 2 for ratings above 4 and
    up to and including 7, and 3 for everything else (which includes values
    that compare false against both thresholds, such as NaN).
    """
    if rating <= 4:
        return 1
    # Reaching here means the rating is not <= 4, so only the upper bound
    # of the middle bucket still needs checking.
    if rating <= 7:
        return 2
    return 3

def text_preprocessing_pipeline(df, text_column):
    """Clean, tokenize and stem the text in ``df[text_column]``.

    The DataFrame is modified in place and also returned. After the call it
    has these additional/updated columns:

    * ``text_column`` -- same text with a few HTML entities decoded.
    * ``'tokens'``    -- list of lowercase alphabetic tokens with English
      stop words removed.
    * ``'stemmed'``   -- Porter-stemmed tokens joined by single spaces
      (the string form expected by a text vectorizer).
    * ``'class'``     -- ``group_rating`` applied to ``df['rating']``; only
      written when a ``'rating'`` column is present, so unlabeled data can
      be preprocessed too.

    Args:
        df: DataFrame holding the text (and optionally a ``'rating'`` column).
        text_column: Name of the column containing the raw text.

    Returns:
        The same DataFrame object, with the columns described above.
    """
    # Decode the HTML entities handled by this pipeline into literal characters.
    replacements = {
        '&#039;': "'",
        '&rsquo;': "'",
        '&amp;': "&",
        '&quot;': '"'
    }
    text = df[text_column]
    for entity, char in replacements.items():
        text = text.str.replace(entity, char, regex=False)
    df[text_column] = text

    stop_words = set(stopwords.words('english'))
    porter = PorterStemmer()

    def _clean_tokens(raw_text):
        # Keep purely alphabetic tokens, lowercased, minus stop words.
        # Alphabetic tokens cannot contain punctuation, so no separate
        # punctuation-stripping pass is needed.
        lowered = (token.lower() for token in word_tokenize(raw_text) if token.isalpha())
        return [word for word in lowered if word not in stop_words]

    # Missing text (NaN) is tokenized as an empty string instead of making
    # word_tokenize raise on a non-string value.
    df['tokens'] = text.fillna('').apply(_clean_tokens)

    # Stem each token and join back into one string for vectorization.
    df['stemmed'] = df['tokens'].apply(
        lambda words: ' '.join(porter.stem(word) for word in words)
    )

    # Ratings are only available for labeled data; skip the label column
    # otherwise rather than failing with a KeyError.
    if 'rating' in df.columns:
        df['class'] = df['rating'].apply(group_rating)

    return df

# Load the trained model and vectorizer once at module level; classify_text()
# reads both names as globals.
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code,
# so these .pkl files must come from a trusted source.
svm_model = joblib.load('svm_model.pkl')
tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')

def classify_text(df, text_column='review'):
    """Predict a class for every row of ``df`` using the loaded SVM model.

    The text is run through ``text_preprocessing_pipeline``, vectorized with
    the module-level ``tfidf_vectorizer`` and classified with the
    module-level ``svm_model``. Predictions are stored in ``df['class']``.

    Args:
        df: DataFrame containing the text to classify. It is modified in
            place by the preprocessing step.
        text_column: Name of the column holding the raw text. Defaults to
            ``'review'``.

    Returns:
        The preprocessed DataFrame with predictions in the ``'class'`` column.
    """
    # preprocess text
    df = text_preprocessing_pipeline(df, text_column)

    # vectorization
    features = tfidf_vectorizer.transform(df['stemmed'])

    # classify via SVM
    df['class'] = svm_model.predict(features)
    # Shift labels from 0-2 to 1-3.
    # NOTE(review): assumes the model was trained on 0-based labels -- confirm
    # against the training code.
    df['class'] = df['class'] + 1

    return df

# Read the input CSV into a DataFrame
file_path = 'csv file'
df = pd.read_csv(file_path)

# Run the classifier over the loaded rows
classified_df = classify_text(df)

# Remove, in place, every column whose name contains 'Unnamed'
classified_df.drop(
    columns=[name for name in classified_df.columns if 'Unnamed' in name],
    inplace=True,
)

In [None]:
# Display the classified DataFrame: as the last expression of a notebook
# cell it is rendered as the cell's output.
classified_df