In [1]:
# Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.cluster import KMeans
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import accuracy_score
from datetime import datetime
import pytz
import gc


In [2]:
# Define list of dataset
dir = '../dataset/'
datasets = [file for file in Path(dir).glob('*.csv') if file.name == 'best_dataset.csv']

datasets


[WindowsPath('../dataset/best_dataset.csv')]

In [3]:
# Helper function to load data
def load_data(filename):
    print(filename)

    df = pd.read_csv(filename, header=None, encoding='utf-8').dropna()
    df.columns = ['label', 'data']

    global X, y

    X = df['data']
    y = df['label']


In [4]:
# Helper function to create train val test split
def split_dataset(X, y):
    global X_train, y_train, X_val, y_val, X_test, y_test

    # train 7 : val 2 : test 1
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=7)
    X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.33, random_state=7)


In [5]:
# Helper function to extract feature
def extract_feature(vectorizer, X):
    vectorizer.fit(X)

    global X_train, X_val, X_test

    X_train = vectorizer.transform(X_train).toarray()
    X_val = vectorizer.transform(X_val).toarray()
    X_test = vectorizer.transform(X_test).toarray()

    print(X_train.shape)
    print(X_val.shape)
    print(X_test.shape)


In [6]:
def useClassifier(classifier):
    try:
        print(classifier)

        classifier.fit(X_train, y_train)

        y_pred = classifier.predict(X_val)
        val_accuracy = accuracy_score(y_val, y_pred)

        print(val_accuracy)

        y_pred = classifier.predict(X_test)
        test_accuracy = accuracy_score(y_test, y_pred)

        print(test_accuracy)

        del classifier

        gc.collect()
    except Exception as inst:
        print('Something went wrong...')
        print(type(inst))  # the exception type
        print(inst.args)  # arguments stored in .args
        print(inst)  # __str__ allows args to be printed directly


def useAllClassifier():
    useClassifier(AdaBoostClassifier(n_estimators=50, random_state=0))
    useClassifier(AdaBoostClassifier(n_estimators=100, random_state=0))
    useClassifier(AdaBoostClassifier(n_estimators=150, random_state=0))
    useClassifier(DecisionTreeClassifier(random_state=0))
    useClassifier(DecisionTreeRegressor(random_state=0))
    useClassifier(GaussianNB())
    useClassifier(GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=0))
    useClassifier(GradientBoostingClassifier(n_estimators=100, learning_rate=0.25, random_state=0))
    useClassifier(GradientBoostingClassifier(n_estimators=100, learning_rate=0.5, random_state=0))
    useClassifier(GradientBoostingClassifier(n_estimators=100, learning_rate=0.75, random_state=0))
    useClassifier(GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, random_state=0))
    useClassifier(KMeans(n_clusters=2, random_state=0, n_init='auto'))
    useClassifier(KNeighborsClassifier(n_neighbors=3))
    useClassifier(KNeighborsClassifier(n_neighbors=5))
    useClassifier(LogisticRegression(random_state=0))
    useClassifier(MultinomialNB())
    useClassifier(RandomForestClassifier(n_estimators=10, random_state=0))
    useClassifier(RandomForestClassifier(n_estimators=50, random_state=0))
    useClassifier(RandomForestClassifier(n_estimators=100, random_state=0))
    useClassifier(SGDClassifier(max_iter=1000, tol=1e-3))


In [7]:
X = None
y = None
X_train = None
y_train = None
X_val = None
y_val = None
X_test = None
y_test = None


In [8]:
for i, dataset in enumerate(datasets):
    print(i)
    print(datetime.now(pytz.timezone('Asia/Hong_Kong')).strftime('%Y-%m-%d %H:%M:%S'))

    load_data(dataset)

    print('CountVectorizer()')

    split_dataset(X, y)
    extract_feature(CountVectorizer(), X)
    useAllClassifier()


0
2023-06-27 11:53:03
..\dataset\best_dataset.csv
CountVectorizer()
(23082, 42852)
(6628, 42852)
(3265, 42852)
AdaBoostClassifier(random_state=0)
0.9381412190706095
0.9359877488514549
AdaBoostClassifier(n_estimators=100, random_state=0)
0.9508147254073627
0.9522205206738131
AdaBoostClassifier(n_estimators=150, random_state=0)
0.9577549788774894
0.957427258805513
DecisionTreeClassifier(random_state=0)
0.9378394689197345
0.9491577335375191
DecisionTreeRegressor(random_state=0)
Something went wrong...
<class 'ValueError'>
("Classification metrics can't handle a mix of binary and continuous targets",)
Classification metrics can't handle a mix of binary and continuous targets
GaussianNB()
0.9488533494266748
0.9464012251148545
GradientBoostingClassifier(random_state=0)
0.9260712130356065
0.9228177641653905
GradientBoostingClassifier(learning_rate=0.25, random_state=0)
0.943874471937236
0.9457886676875957
GradientBoostingClassifier(learning_rate=0.5, random_state=0)
0.9550392275196138
0.95405

In [9]:
for i, dataset in enumerate(datasets):
    print(i)
    print(datetime.now(pytz.timezone('Asia/Hong_Kong')).strftime('%Y-%m-%d %H:%M:%S'))

    load_data(dataset)

    print('TfidfVectorizer()')

    split_dataset(X, y)
    extract_feature(TfidfVectorizer(), X)
    useAllClassifier()


0
2023-06-27 20:50:27
..\dataset\best_dataset.csv
TfidfVectorizer()
(23082, 42852)
(6628, 42852)
(3265, 42852)
AdaBoostClassifier(random_state=0)
0.9367833433916717
0.9359877488514549
AdaBoostClassifier(n_estimators=100, random_state=0)
0.9518708509354254
0.9503828483920368
AdaBoostClassifier(n_estimators=150, random_state=0)
0.9580567290283645
0.9595712098009188
DecisionTreeClassifier(random_state=0)
0.944176222088111
0.9491577335375191
DecisionTreeRegressor(random_state=0)
Something went wrong...
<class 'ValueError'>
("Classification metrics can't handle a mix of binary and continuous targets",)
Classification metrics can't handle a mix of binary and continuous targets
GaussianNB()
0.9429692214846107
0.9399693721286371
GradientBoostingClassifier(random_state=0)
0.9266747133373566
0.9252679938744257
GradientBoostingClassifier(learning_rate=0.25, random_state=0)
0.9447797223898612
0.9430321592649311
GradientBoostingClassifier(learning_rate=0.5, random_state=0)
0.9532287266143633
0.9506

In [10]:
for i, dataset in enumerate(datasets):
    print(i)
    print(datetime.now(pytz.timezone('Asia/Hong_Kong')).strftime('%Y-%m-%d %H:%M:%S'))

    load_data(dataset)

    print('HashingVectorizer(n_features=50000)')

    split_dataset(X, y)
    extract_feature(HashingVectorizer(n_features=50000), X)
    useAllClassifier()


0
2023-06-28 05:38:24
..\dataset\best_dataset.csv
HashingVectorizer(n_features=50000)
(23082, 50000)
(6628, 50000)
(3265, 50000)
AdaBoostClassifier(random_state=0)
0.9363307181653591
0.9326186830015314
AdaBoostClassifier(n_estimators=100, random_state=0)
0.9494568497284248
0.9488514548238898
AdaBoostClassifier(n_estimators=150, random_state=0)
0.9568497284248643
0.9586523736600306
DecisionTreeClassifier(random_state=0)
0.9419130959565479
0.9482388973966309
DecisionTreeRegressor(random_state=0)
Something went wrong...
<class 'ValueError'>
("Classification metrics can't handle a mix of binary and continuous targets",)
Classification metrics can't handle a mix of binary and continuous targets
GaussianNB()
0.9171695835847918
0.911791730474732
GradientBoostingClassifier(random_state=0)
0.9254677127338564
0.9218989280245024
GradientBoostingClassifier(learning_rate=0.25, random_state=0)
0.9459867229933615
0.9448698315467075
GradientBoostingClassifier(learning_rate=0.5, random_state=0)
0.95337