In [3]:
import pandas as pd
# Note: the cells below do no text cleaning; the 'text' column is assumed to be cleaned already.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [4]:
# Read the labelled dataset from the project's data folder and display it.
dataset_path = '../data/interest-location.csv'
df = pd.read_csv(dataset_path)
df

Unnamed: 0,text,location,interest
0,attempt hari tu act like smart deep tech entre...,,Technology
1,mosti cradle ni clueless whats going malaysias...,,Technology
2,hightech exports country china 942b hong kong ...,,Technology
3,hightech exports country china 942b hong kong ...,,Technology
4,tbh malaysia many tech talents dont actually n...,,Technology
...,...,...,...
28835,smartwatch one better sport ea garmin amazfit ...,Terengganu,Sports
28836,abearfromsea heaven football maracana,Terengganu,Sports
28837,king pele former youth sport minister khairykj...,Terengganu,Sports
28838,love football,Terengganu,Sports


In [5]:
# Count the missing values in each column.
df.isnull().sum()

text            1
location    12700
interest        0
dtype: int64

In [6]:
# Report how many rows have no value in the 'text' column.
text_column = df['text']
missing_values = text_column.isnull().sum()
print("Number of missing values:", missing_values)

Number of missing values: 1


In [7]:
# Remove rows whose 'text' is missing (df is modified in place),
# then re-check the per-column null counts.
df.dropna(axis='index', subset=['text'], inplace=True)
df.isnull().sum()

text            0
location    12699
interest        0
dtype: int64

In [8]:
# Defensive fill for any missing text. The previous cell already drops rows with
# missing 'text', so this is normally a no-op.
# The result is assigned back to the column instead of calling
# `df['text'].fillna(..., inplace=True)`: that chained form mutates an intermediate
# Series, triggers a warning in pandas 2.x, and has no effect on df under copy-on-write.
df['text'] = df['text'].fillna("No text available")

In [9]:
# Features come from the 'text' column, labels from the 'interest' column.
X, y = df['text'], df['interest']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Resulting variables:
#   X_train / y_train - training text and its labels
#   X_test  / y_test  - held-out text and its labels

In [10]:
# Build TF-IDF features: at most 29,000 terms, with English stop words removed.
# The vectorizer is fitted on the training text only and then reused for the test text.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=29000,
)
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

<23071x29000 sparse matrix of type '<class 'numpy.float64'>'
	with 341077 stored elements in Compressed Sparse Row format>

In [11]:
# Fit a support vector classifier with a linear kernel on the training TF-IDF matrix.
# Other kernels (e.g. 'rbf') can be tried through the `kernel` argument.
svm_classifier = SVC(kernel='linear').fit(tfidf_train, y_train)
svm_classifier

In [14]:
# Display the held-out labels produced by the train/test split.
y_test

23920           Sports
25159           Sports
12011    Entertainment
25378           Sports
18407        Education
             ...      
28480           Sports
3739        Technology
25273           Sports
12517    Entertainment
25960           Sports
Name: interest, Length: 5768, dtype: object

In [20]:
# Predict on the test set.
# The predictions are computed here rather than relying on a `y_pred` assigned by a
# cell further down; otherwise this cell raises NameError when the notebook is run
# top to bottom on a fresh kernel.
y_pred = svm_classifier.predict(tfidf_test)
y_pred

array(['Sports', 'Sports', 'Entertainment', ..., 'Sports',
       'Entertainment', 'Sports'], dtype=object)

In [15]:
# Score the held-out TF-IDF features with the trained SVM.
y_pred = svm_classifier.predict(tfidf_test)

# Per-class precision / recall / F1 against the true test labels.
report = classification_report(y_test, y_pred)
print(report)

               precision    recall  f1-score   support

    Education       0.95      0.95      0.95      1003
Entertainment       0.96      0.97      0.97      1151
   FoodnDrink       0.97      0.96      0.97       817
     Politics       0.97      0.97      0.97       755
       Sports       0.98      0.98      0.98      1116
   Technology       0.96      0.96      0.96       926

     accuracy                           0.96      5768
    macro avg       0.97      0.96      0.96      5768
 weighted avg       0.96      0.96      0.96      5768



In [16]:
# Evaluate the classifier
# print(classification_report(y_test, y_pred))

In [18]:
# Second experiment: same split and classifier, but a TF-IDF vocabulary capped at 1,000 terms.
# Note: this cell rebinds tfidf_vectorizer, svm_classifier and y_pred.

# Same 80/20 split with the same seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF limited to 1,000 features; adjust max_features as needed.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Linear-kernel SVM trained on the reduced feature set.
# Other kernels (e.g. 'rbf') can be tried through the `kernel` argument.
svm_classifier = SVC(kernel='linear').fit(X_train_tfidf, y_train)

# Predict on the held-out rows and print the per-class report.
y_pred = svm_classifier.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

               precision    recall  f1-score   support

    Education       0.95      0.94      0.95      1003
Entertainment       0.93      0.96      0.94      1151
   FoodnDrink       0.95      0.95      0.95       817
     Politics       0.96      0.96      0.96       755
       Sports       0.97      0.96      0.96      1116
   Technology       0.95      0.94      0.94       926

     accuracy                           0.95      5768
    macro avg       0.95      0.95      0.95      5768
 weighted avg       0.95      0.95      0.95      5768

