In [4]:
import pandas as pd

# Trying to  read the CSV with ISO-8859-1 encoding
df = pd.read_csv('/content/corpus.csv', encoding='ISO-8859-1')

# Display the first few rows
print(df.head())

                                                text        label
0   Stuning even for the non-gamer: This sound tr...  __label__2 
1   The best soundtrack ever to anything.: I'm re...  __label__2 
2   Amazing!: This soundtrack is my favorite musi...  __label__2 
3   Excellent Soundtrack: I truly like this sound...  __label__2 
4   Remember, Pull Your Jaw Off The Floor After H...  __label__2 


In [6]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Join tokens back into string
    return ' '.join(tokens)

# Apply preprocessing to the text column
df['text'] = df['text'].apply(preprocess_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:

from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the text data
X = vectorizer.fit_transform(df['text'])

In [8]:
from sklearn.model_selection import train_test_split

# Extract labels
y = df['label']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [9]:
from sklearn.svm import SVC

# Initialize the SVM classifier
svm_classifier = SVC(kernel='linear')  # You can choose other kernels like 'rbf'

# Train the classifier
svm_classifier.fit(X_train, y_train)

In [10]:
from sklearn.metrics import accuracy_score, classification_report

# Make predictions
y_pred = svm_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(report)


Accuracy: 0.8553
Classification Report:
              precision    recall  f1-score   support

 __label__1        0.86      0.86      0.86      1518
 __label__2        0.86      0.85      0.85      1482

    accuracy                           0.86      3000
   macro avg       0.86      0.86      0.86      3000
weighted avg       0.86      0.86      0.86      3000

