In [16]:
from pathlib import Path
# Project root is the directory one level above the notebook's working
# directory; all datasets live in its "data" sub-folder.
ROOT = Path(".").resolve().parents[0]
DATA = ROOT.joinpath("data")


In [17]:
import pandas as pd

# Load the IMDB reviews CSV from the data directory and preview the
# first five rows (columns: review, sentiment).
df = pd.read_csv(DATA.joinpath("imdb_reviews.csv"))
print(df.head(5))

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


## Data Preprocessing

In [18]:
import nltk

# Normalise the review text in one pass:
#   1. cast to str FIRST, so non-string entries are lowercased like every
#      other row instead of being turned into NaN by the .str accessor
#      (and then stringified to the literal "nan");
#   2. lowercase;
#   3. replace HTML tags such as "<br />" with a single space.
df['review'] = (
    df['review']
    .astype(str)
    .str.lower()
    .str.replace(r'<[^<>]*>', ' ', regex=True)
)
print(df['review'])

# Optional NLTK tokenisation / stop-word removal (currently disabled; the
# reviews are vectorised as raw text further down). Note the filtered
# tokens must be assigned back, otherwise the result is discarded.
# df['tokens'] = df['review'].apply(nltk.word_tokenize)
# stopwords = set(nltk.corpus.stopwords.words('english'))
# df['tokens'] = df['tokens'].apply(lambda x: [word for word in x if word not in stopwords])

0        one of the other reviewers has mentioned that ...
1        a wonderful little production.   the filming t...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object


## Split Dataset

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
# Features are the cleaned review strings; labels are the sentiment column.
X, y = df['review'], df['sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the TF-IDF vectorizer on the training reviews only, then apply the
# already-fitted vectorizer to the held-out reviews.
vectorizer = TfidfVectorizer()
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

# Scaler is created (centering switched off via with_mean=False) but is not
# applied: the scaling steps below are disabled. If re-enabled, the test set
# should reuse the scaler fitted on the training set rather than refit.
sc = StandardScaler(with_mean=False)
# X_train_vectors = sc.fit_transform(X_train_vectors)
# X_test_vectors = sc.transform(X_test_vectors)




## Model Training

In [20]:
from sklearn.svm import LinearSVC
# Train a linear SVM on the TF-IDF training features. The seed is pinned
# (same value as the train/test split) so repeated runs produce the same
# fit and therefore the same reported metrics. verbose=1 keeps the
# LibLinear optimisation log in the cell output.
model = LinearSVC(verbose=1, random_state=42)
model.fit(X_train_vectors, y_train)

# Score the held-out reviews and report per-class and overall metrics.
y_pred = model.predict(X_test_vectors)

print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

[LibLinear]....*
optimization finished, #iter = 43
Objective value = -7189.552097
nSV = 20674
Classification Report:
              precision    recall  f1-score   support

    negative       0.91      0.89      0.90      4961
    positive       0.90      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

Accuracy Score: 0.903
