In [92]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import pickle

In [83]:
df = pd.read_csv("./training_csvs/feb-mar-apr.csv")
df.columns = ["transaction_description", "category"]
df["transaction_description"]

0               DISCOVERY COFFEE
1           FLW MADE IN COOKWARE
2         AMZN Mktp CA*HF5J38GN1
3         AMZN Mktp CA*HF5US1E72
4           CHATGPT SUBSCRIPTION
                 ...            
171               MEUNDIES, INC.
172    PHARMASAVE JAMES BAY #101
173          THRIFTY FOODS #9454
174    WAL-MART SUPERCENTER#1214
175          THRIFTY FOODS #9465
Name: transaction_description, Length: 176, dtype: object

In [88]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    df['transaction_description'], df['category'], test_size=0.33, random_state=42
)

# Feature extraction using TfidfVectorizer
# vectorizer = TfidfVectorizer()
vectorizer = TfidfVectorizer(min_df=2, max_df=0.95, ngram_range=(1, 2), max_features=1000)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Define hyperparameter distribution
param_dist = {
    'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
    'penalty': ['l2', 'l1', 'elasticnet'],
    'alpha': [1e-4, 1e-3, 1e-2, 1e-1],
    'max_iter': [1000, 2000, 3000],
    'tol': [1e-3, 1e-4, 1e-5],
}

param_dist['learning_rate'] = ['optimal', 'constant', 'invscaling', 'adaptive']
param_dist['eta0'] = [1e-4, 1e-3, 1e-2, 1e-1, 1]


# Create an SGDClassifier instance
sgd_clf = SGDClassifier(random_state=42)

# Create a RandomizedSearchCV instance
random_search = RandomizedSearchCV(
    sgd_clf, param_dist, scoring='accuracy', cv=5, n_iter=500, n_jobs=-1, verbose=1, random_state=42
)


In [89]:
# Train the RandomizedSearchCV instance on the preprocessed training data
random_search.fit(X_train_vec, y_train)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits




RandomizedSearchCV(cv=5, estimator=SGDClassifier(random_state=42), n_iter=500,
                   n_jobs=-1,
                   param_distributions={'alpha': [0.0001, 0.001, 0.01, 0.1],
                                        'eta0': [0.0001, 0.001, 0.01, 0.1, 1],
                                        'learning_rate': ['optimal', 'constant',
                                                          'invscaling',
                                                          'adaptive'],
                                        'loss': ['hinge', 'log',
                                                 'modified_huber',
                                                 'squared_hinge',
                                                 'perceptron'],
                                        'max_iter': [1000, 2000, 3000],
                                        'penalty': ['l2', 'l1', 'elasticnet'],
                                        'tol': [0.001, 0.0001, 1e-05]},
                   random_

In [90]:
# Check the best combination of hyperparameters and the corresponding score
print("Best hyperparameters: ", random_search.best_params_)
print("Best accuracy score: ", random_search.best_score_)

Best hyperparameters:  {'tol': 0.001, 'penalty': 'l2', 'max_iter': 3000, 'loss': 'squared_hinge', 'learning_rate': 'optimal', 'eta0': 0.1, 'alpha': 0.1}
Best accuracy score:  0.7351449275362318


In [93]:
# Save the model
with open('sgd_classifier.pkl', 'wb') as f:
    pickle.dump(random_search.best_estimator_, f)

# Save the vectorizer
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [41]:
X_train, X_test, y_train, y_test = train_test_split(
    df['transaction_description'], 
    df['category'], 
    test_size=0.33, 
    random_state=42
)

In [42]:
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [43]:
svm_clf = SVC(kernel='linear', C=1, random_state=42)
svm_clf.fit(X_train_vec, y_train)

y_pred_svm = svm_clf.predict(X_test_vec)
print(classification_report(y_test, y_pred_svm, zero_division=1))
print("Accuracy:", accuracy_score(y_test, y_pred_svm))

                precision    recall  f1-score   support

   coffee shop       1.00      1.00      1.00         2
     going out       0.35      1.00      0.52        11
     groceries       0.67      0.40      0.50         5
        health       1.00      0.00      0.00         2
       payment       1.00      1.00      1.00         3
      shopping       0.60      0.50      0.55         6
 subscriptions       0.83      0.71      0.77        14
transportation       1.00      0.14      0.25         7
        travel       1.00      0.29      0.44         7
     utilities       1.00      0.00      0.00         2

      accuracy                           0.58        59
     macro avg       0.85      0.50      0.50        59
  weighted avg       0.77      0.58      0.55        59

Accuracy: 0.576271186440678


In [76]:
param_grid = {
    'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
    'penalty': ['l2', 'l1', 'elasticnet'],
    'alpha': [1e-4, 1e-3, 1e-2, 1e-1],
    'max_iter': [1000, 2000, 3000],
    'tol': [1e-3, 1e-4, 1e-5],
}

sgd_clf = SGDClassifier(random_state=42)

grid_search = GridSearchCV(sgd_clf, param_grid, scoring='accuracy', cv=5, n_jobs=-1, verbose=1)

In [77]:
grid_search.fit(X_train_vec, y_train)

Fitting 5 folds for each of 540 candidates, totalling 2700 fits




GridSearchCV(cv=5, estimator=SGDClassifier(random_state=42), n_jobs=-1,
             param_grid={'alpha': [0.0001, 0.001, 0.01, 0.1],
                         'loss': ['hinge', 'log', 'modified_huber',
                                  'squared_hinge', 'perceptron'],
                         'max_iter': [1000, 2000, 3000],
                         'penalty': ['l2', 'l1', 'elasticnet'],
                         'tol': [0.001, 0.0001, 1e-05]},
             scoring='accuracy', verbose=1)

In [78]:
print("Best hyperparameters: ", grid_search.best_params_)
print("Best accuracy score: ", grid_search.best_score_)

y_pred_gs = grid_search.best_estimator_.predict(X_test_vec)
print(classification_report(y_test, y_pred_gs, zero_division=1))
print("Accuracy:", accuracy_score(y_test, y_pred_gs))

Best hyperparameters:  {'alpha': 0.001, 'loss': 'log', 'max_iter': 1000, 'penalty': 'l2', 'tol': 1e-05}
Best accuracy score:  0.7351449275362318
                precision    recall  f1-score   support

   coffee shop       1.00      1.00      1.00         2
     going out       0.37      0.91      0.53        11
     groceries       0.75      0.60      0.67         5
        health       1.00      0.00      0.00         2
       payment       1.00      1.00      1.00         3
      shopping       0.50      0.50      0.50         6
 subscriptions       0.83      0.71      0.77        14
transportation       0.67      0.29      0.40         7
        travel       1.00      0.29      0.44         7
     utilities       1.00      0.00      0.00         2

      accuracy                           0.59        59
     macro avg       0.81      0.53      0.53        59
  weighted avg       0.73      0.59      0.57        59

Accuracy: 0.5932203389830508


In [None]:
random_search = RandomizedSearchCV(
    sgd_clf, param_grid, scoring='accuracy', cv=5, n_iter=50, n_jobs=-1, verbose=1, random_state=42
)

In [None]:
random_search.fit(X_train_vec, y_train)

In [None]:
print("Best hyperparameters: ", random_search.best_params_)
print("Best accuracy score: ", random_search.best_score_)
y_pred_rs = random_search.best_estimator_.predict(X_test_vec)
print(classification_report

In [74]:
sgd_clf = SGDClassifier(loss='hinge', penalty='l2', alpha=5e-3, random_state=42, max_iter=10_000, tol=1e-3)
sgd_clf.fit(X_train_vec, y_train)

y_pred_sgd = sgd_clf.predict(X_test_vec)
print(classification_report(y_test, y_pred_sgd, zero_division=1))
print("Accuracy:", accuracy_score(y_test, y_pred_sgd))

                precision    recall  f1-score   support

   coffee shop       1.00      1.00      1.00         2
     going out       0.37      0.91      0.53        11
     groceries       0.75      0.60      0.67         5
        health       1.00      0.00      0.00         2
       payment       1.00      1.00      1.00         3
      shopping       0.50      0.50      0.50         6
 subscriptions       1.00      0.71      0.83        14
transportation       0.67      0.29      0.40         7
        travel       1.00      0.43      0.60         7
     utilities       1.00      0.50      0.67         2

      accuracy                           0.63        59
     macro avg       0.83      0.59      0.62        59
  weighted avg       0.77      0.63      0.63        59

Accuracy: 0.6271186440677966
