In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.pipeline import Pipeline  # Import the Pipeline class

In [11]:
# Task #1: Read the tab-separated movie review file into a pandas DataFrame.
# The path is kept in its own variable so it is easy to point at another copy.
dataset_path = '/content/moviereviews2.tsv'
df = pd.read_csv(
    dataset_path,
    sep='\t',  # the file is TSV, not comma-separated
)

In [12]:
# Task #2: Check for missing values
# Number of NaN cells in each column.
nan_values = df.isnull().sum()
# Number of "blank" string cells in each column. A cell counts as blank when it
# is a string that is empty or consists only of whitespace (spaces, tabs,
# newlines), not just the single-space string ' '. NaN and other non-string
# cells are never counted here.
whitespace_strings = df.apply(
    lambda column: sum(isinstance(cell, str) and not cell.strip() for cell in column)
)

In [13]:
# Task #3: Discard every row that has a NaN in any column.
# `df` is rebound to the filtered frame, so later cells only see complete rows.
df = df.dropna()

In [14]:
# Task #4: Count how many reviews carry each label value (most frequent first).
label_counts = df.loc[:, 'label'].value_counts()

In [15]:
# Task #5: Separate the review text (features) from the labels (targets), then
# hold out 33% of the rows for testing. The fixed random_state keeps the split
# the same on every run.
X, y = df['review'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)


In [16]:
# Task #6: Build a pipeline to vectorize the data and train a model
# Step 'tfidf' converts the raw review strings into TF-IDF feature vectors;
# step 'clf' fits a linear support vector classifier on those vectors.
# random_state is pinned on the classifier so that, together with the seeded
# train/test split, re-running the notebook produces the same model and metrics
# (LinearSVC's solver may use a pseudo-random number generator).
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC(random_state=42)),
])
# Fit both steps on the training reviews only; the test set stays unseen.
text_clf.fit(X_train, y_train)

In [17]:
# Task #7: Predict labels for the held-out reviews, then score the predictions
# against the true test labels.
predictions = text_clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
confusion_mat = confusion_matrix(y_true=y_test, y_pred=predictions)
classification_rep = classification_report(y_true=y_test, y_pred=predictions)


In [18]:
# Print results
# Each entry pairs a heading with the value computed in an earlier cell; they
# are printed in task order, heading first and value after it.
for section_title, section_value in [
    ("Task #2: NaN Values\n", nan_values),
    ("\nTask #2: Whitespace Strings\n", whitespace_strings),
    ("\nTask #4: Label Counts\n", label_counts),
    ("\nTask #7: Confusion Matrix\n", confusion_mat),
    ("\nTask #7: Classification Report\n", classification_rep),
    ("\nTask #7: Accuracy\n", accuracy),
]:
    print(section_title, section_value)

Task #2: NaN Values
 label      0
review    20
dtype: int64

Task #2: Whitespace Strings
 label     0
review    0
dtype: int64

Task #4: Label Counts
 pos    2990
neg    2990
Name: label, dtype: int64

Task #7: Confusion Matrix
 [[900  91]
 [ 63 920]]

Task #7: Classification Report
               precision    recall  f1-score   support

         neg       0.93      0.91      0.92       991
         pos       0.91      0.94      0.92       983

    accuracy                           0.92      1974
   macro avg       0.92      0.92      0.92      1974
weighted avg       0.92      0.92      0.92      1974


Task #7: Accuracy
 0.9219858156028369
