In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import matplotlib as plt
from scipy import sparse
import numpy as np
from sklearn.utils import class_weight


In [12]:
def binarize(num):
    return 0 if num <= 0 else 1

In [30]:
# load data
df = pd.read_csv('C:\\Users\\Jerem\\Documents\\Spring 2023\\HourlyWork\\CF-dataset.csv', encoding='utf8')
print(df['Label'].tolist().count(1))
print(df['Label'].tolist().count(0))

653
8794


In [32]:
df_label_0 = df[df['Label'] == 0]  # select only rows with label == 0
df_label_0_sample = df_label_0.sample(n=653, random_state=42)  # randomly sample 653 rows

df_label_1 = df[df['Label'] == 1]  # select only rows with label == 1

print(df_label_1)

sample_df = pd.concat([df_label_0_sample, df_label_1])

# # Use the sample df
# X = sample_df['Text'].to_list()
# Y = sample_df['Label'].to_list()


                                                  Text  \
0    sólo quisiera dártela monda y desnuda sin el o...   
1    a oh quién de tus deseos y librea alma y cuerp...   
2    a oh quién tan castamente se escapara del seño...   
3    con estas razones perdía el pobre caballero el...   
4    pero con todo alababa en su autor aquel acabar...   
..                                                 ...   
648  capítulo lxxiv de cómo don quijote cayó malo y...   
649  oyolo don quijote con ánimo sosegado pero no l...   
650  yo me siento sobrina a punto de muerte querría...   
651  entró el escribano con los demás y después de ...   
652  viendo lo cual el cura pidió al escribano le d...   

                              Moods  \
0              {'Ind': 1, 'Sub': 1}   
1              {'Ind': 1, 'Sub': 1}   
2              {'Ind': 1, 'Sub': 1}   
3              {'Ind': 1, 'Sub': 4}   
4              {'Ind': 3, 'Sub': 3}   
..                              ...   
648            {'Ind': 9, 'Sub

In [24]:
X = df['Text'].to_list()
Y = df['Label'].to_list()
moods = df['Moods'].tolist()
moods_binarized = [mood.count('Sub') for mood in moods]
si_presence = [binarize(text.count('si')) for text in X]
print(moods_binarized)

[1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 

In [22]:
# split data into training and testing sets
X_train, X_test, y_train, y_test, mood_train, mood_test, si_train, si_test = train_test_split(X, 
                                                                                              Y, 
                                                                                              moods_binarized, 
                                                                                              si_presence, 
                                                                                              test_size=0.3, 
                                                                                              random_state=42, 
                                                                                              stratify=Y, 
                                                                                              shuffle=True)
print('All CF:',Y.count(1))
print('All not CF:', Y.count(0))
print('Training data: {}'.format(len(X_train)))
print('Testing data: {}'.format(len(X_test)))
print('Training distribution:', 'Not-CF:', y_train.count(0), 'CF', y_train.count(1))
print('Testing distribution:', 'Not-CF:', y_test.count(0), 'CF', y_test.count(1))

All CF: 653
All not CF: 8794
Training data: 6612
Testing data: 2835
Training distribution: Not-CF: 6155 CF 457
Testing distribution: Not-CF: 2639 CF 196
All CF: 653
All not CF: 8794


In [17]:
class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=[0,1], y=Y)
print(class_weights)


[0.53712759 7.23353752]


In [18]:
# create TF-IDF vectorizer with n-grams
tfidf = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 3), encoding='utf-8')

# fit and transform the training data
X_train_tfidf = tfidf.fit_transform(X_train)
print(X_train_tfidf.shape)

X_train_tfidf = sparse.hstack((X_train_tfidf, np.array(mood_train)[:,None], np.array(si_train)[:,None]))
print(X_train_tfidf.shape)


# transform the testing data using the same vectorizer
X_test_tfidf = tfidf.transform(X_test)
print(X_test_tfidf.shape)

X_test_tfidf = sparse.hstack((X_test_tfidf, np.array(mood_test)[:,None], np.array(si_test)[:,None]))
print(X_test_tfidf.shape)



(6612, 6000)
(6612, 6002)
(2835, 6000)
(2835, 6002)


In [19]:
# # create SVM classifier and fit the training data
# svm = SVC(kernel='linear', class_weight={0: class_weights[0], 1: class_weights[1]})
# svm.fit(X_train_tfidf, y_train)

# # predict on the testing data and calculate accuracy
# y_pred = svm.predict(X_test_tfidf)
# accuracy = accuracy_score(y_test, y_pred)
# print('Accuracy:', accuracy*100)

# # print classification report
# print(classification_report(y_test, y_pred))


In [20]:
from sklearn.model_selection import GridSearchCV

# define SVM classifier
svm = SVC(class_weight={0: class_weights[0], 1: class_weights[1]})

# define hyperparameter grid
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [0.1, 1, 10],
    'kernel': ['linear', 'rbf']
}

# perform grid search
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='f1', verbose=3, n_jobs=3)
                           
grid_search.fit(X_train_tfidf, y_train)

# get best hyperparameters and accuracy
best_params = grid_search.best_params_
best_accuracy = grid_search.best_score_
print(best_params, best_accuracy)


Fitting 5 folds for each of 18 candidates, totalling 90 fits
{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'} 0.5582961607410754


In [21]:
# train SVM classifier with best hyperparameters
svm = SVC(class_weight={0: class_weights[0], 1: class_weights[1]}, **best_params)
svm.fit(X_train_tfidf, y_train)

# predict on the testing data and calculate accuracy
y_pred = svm.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy*100)

# print classification report
print(classification_report(y_test, y_pred))

Accuracy: 92.7689594356261
              precision    recall  f1-score   support

           0       0.98      0.95      0.96      2639
           1       0.48      0.68      0.57       196

    accuracy                           0.93      2835
   macro avg       0.73      0.81      0.76      2835
weighted avg       0.94      0.93      0.93      2835

