# SVM Baseline Model

In [1]:
import pandas as pd
import numpy as np

#Data preprocessing
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

#Naive Bayes
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

#Seed for reproducibility
import random

seed_value=42
random.seed(seed_value)
np.random.seed(seed_value)

In [2]:
%pip install google

Note: you may need to restart the kernel to use updated packages.


In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

ModuleNotFoundError: No module named 'google'

## Read data 

In [2]:
# Import processed data
df = pd.read_csv("../data/processed/cleaned_data_1.csv", on_bad_lines='skip')
num_nan = df.isna().sum()
df = df.dropna()

Tweets        0
AuthorID      0
CreatedAt     0
text_clean    1
text_len      0
dtype: int64


In [3]:
possible_labels = df['AuthorID'].unique()
#convert labels into numeric values
label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index

df['label'] = df.AuthorID.replace(label_dict)

labels = list(label_dict.values())

# Get the max len of tweets
max_len = np.max(df['text_len'])

## Data split

In [4]:
# Split data into train, validation, and test datasets
X = df['text_clean']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=seed_value)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, stratify=y_train, random_state=seed_value)

In [5]:
(unique, counts) = np.unique(y_train, return_counts=True)
print(np.asarray((unique, counts)).T)

[[   0 4412]
 [   1 5806]
 [   2 6532]
 [   3 3688]
 [   4 1009]
 [   5 4457]
 [   6  652]
 [   7 4394]
 [   8 1370]]


We can conclude from the above cell that the classes are unbalanced. We oversample the training set to enforce the same count of tweets as the most active player on Twitter.

In [16]:
# Re-sample data 

ros = RandomOverSampler()
X_train, y_train = ros.fit_resample(np.array(X_train).reshape(-1, 1), np.array(y_train).reshape(-1, 1))
train_os = pd.DataFrame(list(zip([x[0] for x in X_train], y_train)), columns = ['text_clean', 'label'])

X_train = train_os['text_clean'].values
y_train = train_os['label'].values

In [17]:
(unique, counts) = np.unique(y_train, return_counts=True)
print(np.asarray((unique, counts)).T)

[[   0 6532]
 [   1 6532]
 [   2 6532]
 [   3 6532]
 [   4 6532]
 [   5 6532]
 [   6 6532]
 [   7 6532]
 [   8 6532]]


## SVM Model

In [18]:
# CountVectorizer to create bag of words.

clf = CountVectorizer()
X_train_cv =  clf.fit_transform(X_train)
X_test_cv = clf.transform(X_test)

In [19]:
# Apply TF-IDF to attribute weights to words based on their frequency.

tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_cv)
X_train_tf = tf_transformer.transform(X_train_cv)
X_test_tf = tf_transformer.transform(X_test_cv)

### Train

In [22]:
# Initialize Support Vector Machine Model
model = svm.SVC()

# Train model
model.fit(X_train_tf, y_train)

## Test

In [13]:
# Get model predictions for test data
svm_pred = model.predict(X_test_tf)

In [14]:
print('Validation set predictions:', svm_pred)

Validation set predictions: [5 0 8 ... 2 1 3]


In [23]:
# Print classification report for model performance against the test dataset

labels_string = map(str, labels)
labels_string = list(labels_string)
print('Classification Report for SVM:\n',classification_report(y_test, svm_pred, target_names=labels_string))

Classification Report for SVM:
               precision    recall  f1-score   support

           0       0.54      0.60      0.57      1225
           1       0.57      0.61      0.59      1613
           2       0.51      0.55      0.53      1815
           3       0.51      0.48      0.50      1024
           4       0.88      0.65      0.75       280
           5       0.56      0.61      0.58      1238
           6       0.39      0.18      0.24       181
           7       0.61      0.56      0.58      1221
           8       0.59      0.34      0.43       381

    accuracy                           0.56      8978
   macro avg       0.57      0.51      0.53      8978
weighted avg       0.56      0.56      0.55      8978



In [25]:
# Save model

import pickle
f = open('../models/svm_model.pickle', 'wb')
pickle.dump(nb_clf, f)
f.close()