In [160]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import matplotlib as plt
from scipy import sparse
import numpy as np
from sklearn.utils import class_weight


In [161]:
def binarize(num):
    return 0 if num <= 0 else 1

In [162]:
# load data
df = pd.read_csv('C:\\Users\\Jerem\\Documents\\Spring 2023\\HourlyWork\\CF-dataset.csv', encoding='utf8')
print('CF:', df['Label'].tolist().count(1))
print('Not CF', df['Label'].tolist().count(0))

CF: 653
Not CF 8794


In [163]:
n_samples = 1200
df_label_0 = df[df['Label'] == 0]  # select only rows with label == 0
df_label_0_sample = df_label_0.sample(n=n_samples, random_state=42)  # randomly sample N rows

df_label_1 = df[df['Label'] == 1]  # select only rows with label == 1
sample_df = pd.concat([df_label_0_sample, df_label_1])

# # Use the sample df
X = sample_df['Text'].to_list()
Y = sample_df['Label'].to_list()

moods = sample_df['Moods'].tolist()
moods_binarized = [mood.count('Sub') for mood in moods]
si_presence = [binarize(text.count('si')) for text in X]


sub_count = []
for i in range(len(Y)):
    if Y[i] == 1:
        if 'Sub' in moods[i]:
            count = int(moods[i][moods[i].index('Sub'):moods[i].index('Sub')+7][-1])
        else:
            count = 0

    if Y[i] == 0:
        if 'Sub' in moods[i]:
            count = int(moods[i][moods[i].index('Sub'):moods[i].index('Sub')+7][-1])
        else:
            count = 0
    sub_count.append(count)


In [164]:
# Use sub presence
# split data into training and testing sets
X_train, X_test, y_train, y_test, mood_train, mood_test, si_train, si_test = train_test_split(X, 
                                                                                              Y, 
                                                                                              moods_binarized, 
                                                                                              si_presence, 
                                                                                              test_size=0.3, 
                                                                                              random_state=42, 
                                                                                              stratify=Y, 
                                                                                              shuffle=True)
print('All CF:',Y.count(1))
print('All not CF:', Y.count(0))
print('Training data: {}'.format(len(X_train)))
print('Testing data: {}'.format(len(X_test)))
print('Training distribution:', 'Not-CF:', y_train.count(0), 'CF', y_train.count(1))
print('Testing distribution:', 'Not-CF:', y_test.count(0), 'CF', y_test.count(1))


class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=[0,1], y=Y)
print(class_weights)


# create TF-IDF vectorizer with n-grams
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(2, 3), encoding='utf-8')
tfidf_char = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 3), encoding='utf-8')

# fit and transform the training data
X_train_tfidf = tfidf.fit_transform(X_train)
char_train = tfidf_char.fit_transform(X_train)

print(X_train_tfidf.shape, char_train.shape)
X_train_tfidf = sparse.hstack((X_train_tfidf, np.array(mood_train)[:,None], np.array(si_train)[:,None]))
X_train_tfidf = sparse.hstack([X_train_tfidf, char_train])
print(X_train_tfidf.shape)


# transform the testing data using the same vectorizer
X_test_tfidf = tfidf.transform(X_test)
char_test = tfidf_char.transform(X_test)
print(char_test, X_test_tfidf.shape)

X_test_tfidf = sparse.hstack((X_test_tfidf, np.array(mood_test)[:,None], np.array(si_test)[:,None]))
X_test_tfidf = sparse.hstack([X_test_tfidf, char_test])
print(X_test_tfidf.shape)



All CF: 653
All not CF: 1200
Training data: 1297
Testing data: 556
Training distribution: Not-CF: 840 CF 457
Testing distribution: Not-CF: 360 CF 196
[0.77208333 1.41883614]
(1297, 82043) (1297, 4279)
(1297, 86324)
  (0, 4223)	0.0645143765149309
  (0, 4222)	0.06320447449831673
  (0, 4093)	0.05008706661039327
  (0, 4092)	0.04642467652660271
  (0, 4013)	0.0617158939822226
  (0, 4012)	0.06092468749080152
  (0, 3862)	0.03195984896846458
  (0, 3733)	0.1686498531896496
  (0, 3730)	0.06646813999282902
  (0, 3554)	0.05277908179998496
  (0, 3550)	0.06533359497350706
  (0, 3542)	0.15399666343947896
  (0, 3541)	0.19990639771464444
  (0, 3523)	0.07665300537954782
  (0, 3520)	0.06580025355607075
  (0, 3505)	0.07953822051679377
  (0, 3499)	0.05559556763870498
  (0, 3440)	0.12435376469803837
  (0, 3435)	0.041369992338215006
  (0, 3421)	0.09119176608819284
  (0, 3420)	0.0710034151538282
  (0, 3390)	0.08085708304303289
  (0, 3380)	0.03406426889243867
  (0, 3375)	0.07329154412979208
  (0, 3361)	0.035328

In [165]:
# #Use Sub Count

# # split data into training and testing sets
# X_train, X_test, y_train, y_test, sub_count_train, sub_count_test, si_train, si_test = train_test_split(X, 
#                                                                                               Y, 
#                                                                                               sub_count, 
#                                                                                               si_presence, 
#                                                                                               test_size=0.2, 
#                                                                                               random_state=42, 
#                                                                                               stratify=Y, 
#                                                                                               shuffle=True)
# print('All CF:',Y.count(1))
# print('All not CF:', Y.count(0))
# print('Training data: {}'.format(len(X_train)))
# print('Testing data: {}'.format(len(X_test)))
# print('Training distribution:', 'Not-CF:', y_train.count(0), 'CF', y_train.count(1))
# print('Testing distribution:', 'Not-CF:', y_test.count(0), 'CF', y_test.count(1))

# class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=[0,1], y=Y)
# print(class_weights)


# # create TF-IDF vectorizer with n-grams
# tfidf = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 3), encoding='utf-8')

# # fit and transform the training data
# X_train_tfidf = tfidf.fit_transform(X_train)
# print(X_train_tfidf.shape)

# X_train_tfidf = sparse.hstack((X_train_tfidf, np.array(sub_count_train)[:,None], np.array(si_train)[:,None]))
# print(X_train_tfidf.shape)


# # transform the testing data using the same vectorizer
# X_test_tfidf = tfidf.transform(X_test)
# print(X_test_tfidf.shape)

# X_test_tfidf = sparse.hstack((X_test_tfidf, np.array(sub_count_test)[:,None], np.array(si_test)[:,None]))
# print(X_test_tfidf.shape)

In [166]:
from sklearn.model_selection import GridSearchCV

# define SVM classifier
svm = SVC(class_weight={0: class_weights[0], 1: class_weights[1]})

# define hyperparameter grid
param_grid = {
    'C': [0.1, 1, 5, 10],
    'gamma': ['auto', 'scale'],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'decision_function_shape' : ['ovo']
}

# perform grid search
grid_search = GridSearchCV(svm, param_grid, scoring='f1', verbose=3, n_jobs=3)
                           
grid_search.fit(X_train_tfidf, y_train)

# get best hyperparameters and accuracy
best_params = grid_search.best_params_
best_accuracy = grid_search.best_score_
print(best_params, best_accuracy)


Fitting 5 folds for each of 32 candidates, totalling 160 fits
{'C': 5, 'decision_function_shape': 'ovo', 'gamma': 'auto', 'kernel': 'linear'} 0.8189960801341476


In [167]:
# train SVM classifier with best hyperparameters
svm = SVC(class_weight={0: class_weights[0], 1: class_weights[1]}, **best_params)
svm.fit(X_train_tfidf, y_train)

# predict on the testing data and calculate accuracy
y_pred = svm.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy*100)

# print classification report
print(classification_report(y_test, y_pred))

Accuracy: 87.41007194244604
              precision    recall  f1-score   support

           0       0.93      0.88      0.90       360
           1       0.79      0.87      0.83       196

    accuracy                           0.87       556
   macro avg       0.86      0.87      0.87       556
weighted avg       0.88      0.87      0.88       556

