In [None]:
# Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.cluster import KMeans
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import accuracy_score
from datetime import datetime
import pytz
import gc


In [None]:
# Collect the CSV files to benchmark: only `best_dataset.csv` inside the dataset folder.
# NOTE(review): `dir` shadows the builtin of the same name; the name is kept here because
# other cells may read it — confirm before renaming.
dir = '../dataset/'
datasets = list(filter(lambda csv_path: csv_path.name == 'best_dataset.csv', Path(dir).glob('*.csv')))

datasets


In [None]:
# Helper function to load data
def load_data(filename):
    """Read a header-less two-column CSV and publish it as the globals ``X`` and ``y``.

    The file name is printed first. Rows containing any missing value are
    dropped, the two columns are named ``label`` and ``data`` (in that order),
    and the module-level ``X`` (the ``data`` column) and ``y`` (the ``label``
    column) are overwritten.

    Args:
        filename: Path of the UTF-8 encoded CSV file to read.
    """
    global X, y

    print(filename)

    frame = pd.read_csv(filename, header=None, encoding='utf-8')
    frame = frame.dropna()
    frame.columns = ['label', 'data']

    X, y = frame['data'], frame['label']


In [None]:
# Helper function to create train val test split
def split_dataset(X, y):
    """Partition ``X``/``y`` into train, validation and test sets (roughly 7 : 2 : 1).

    The result is stored in the module-level ``X_train``/``y_train``,
    ``X_val``/``y_val`` and ``X_test``/``y_test``. Both splits use
    ``random_state=7``, so the partition is repeatable for the same input.

    Args:
        X: Samples to split.
        y: Labels aligned with ``X``.
    """
    global X_train, y_train, X_val, y_val, X_test, y_test

    # Step 1: keep 70% for training and hold out the remaining 30%.
    X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.3, random_state=7)

    # Step 2: 33% of the hold-out (about 10% overall) becomes the test set,
    # the rest (about 20% overall) becomes the validation set.
    X_val, X_test, y_val, y_test = train_test_split(X_holdout, y_holdout, test_size=0.33, random_state=7)


In [None]:
# Helper function to extract feature
def extract_feature(vectorizer, X):
    """Vectorize the train/validation/test splits in place as dense arrays.

    The vectorizer is fitted on the training split only and then applied to all
    three splits, so nothing learned during fitting (vocabulary, document
    frequencies, ...) comes from validation or test documents. The module-level
    ``X_train``, ``X_val`` and ``X_test`` are replaced by the dense result of
    ``transform(...).toarray()`` and their shapes are printed.

    The globals must still hold the raw documents when this is called, i.e.
    ``split_dataset`` has to run before every call.

    Args:
        vectorizer: Object exposing ``fit(docs)`` and ``transform(docs)``; the
            value returned by ``transform`` must provide ``toarray()``.
        X: Full corpus. Accepted so existing callers keep working, but no longer
            used: fitting on it would leak the held-out documents into the
            features.
    """
    global X_train, X_val, X_test

    # Fit on the training documents only; the held-out splits must stay unseen.
    vectorizer.fit(X_train)

    X_train = vectorizer.transform(X_train).toarray()
    X_val = vectorizer.transform(X_val).toarray()
    X_test = vectorizer.transform(X_test).toarray()

    print(X_train.shape)
    print(X_val.shape)
    print(X_test.shape)


In [None]:
def useClassifier(classifier):
    """Fit one estimator on the training split and report its accuracy.

    Reads the module-level ``X_train``/``y_train``, ``X_val``/``y_val`` and
    ``X_test``/``y_test``. Prints the estimator, then its validation accuracy,
    then its test accuracy.

    Any exception raised while fitting or scoring is caught and printed, so a
    single failing estimator does not abort a whole benchmark run.

    Args:
        classifier: Object exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        ``(val_accuracy, test_accuracy)`` on success, or ``None`` when an
        exception was caught.
    """
    try:
        print(classifier)

        classifier.fit(X_train, y_train)

        val_accuracy = accuracy_score(y_val, classifier.predict(X_val))
        print(val_accuracy)

        test_accuracy = accuracy_score(y_test, classifier.predict(X_test))
        print(test_accuracy)

        # Hand the scores back so callers can tabulate them instead of re-reading the log.
        return val_accuracy, test_accuracy
    except Exception as inst:
        print('Something went wrong...')
        print(type(inst))  # the exception type
        print(inst.args)  # arguments stored in .args
        print(inst)  # __str__ allows args to be printed directly
        return None
    finally:
        # Run the collector on the failure path as well, not only after a successful fit.
        gc.collect()


def useAllClassifier():
    """Run ``useClassifier`` over the fixed benchmark line-up, in a fixed order.

    Every estimator that accepts a seed is created with ``random_state=0`` so
    that repeated runs on the same split are comparable. Takes no arguments and
    returns nothing; all reporting is done by ``useClassifier``.
    """
    for n_estimators in (50, 100, 150):
        useClassifier(AdaBoostClassifier(n_estimators=n_estimators, random_state=0))

    useClassifier(DecisionTreeClassifier(random_state=0))
    # NOTE(review): this is a regressor scored with accuracy_score inside
    # useClassifier — confirm it is meant to be part of a classifier benchmark.
    useClassifier(DecisionTreeRegressor(random_state=0))
    useClassifier(GaussianNB())

    for learning_rate in (0.1, 0.25, 0.5, 0.75, 1.0):
        useClassifier(GradientBoostingClassifier(n_estimators=100, learning_rate=learning_rate, random_state=0))

    # NOTE(review): KMeans is a clustering model; the cluster ids it predicts are
    # presumably not aligned with the dataset's labels, so its accuracy may be
    # arbitrary — confirm this entry is intended.
    useClassifier(KMeans(n_clusters=2, random_state=0, n_init='auto'))

    for n_neighbors in (3, 5):
        useClassifier(KNeighborsClassifier(n_neighbors=n_neighbors))

    useClassifier(LogisticRegression(random_state=0))
    useClassifier(MultinomialNB())

    for n_estimators in (10, 50, 100):
        useClassifier(RandomForestClassifier(n_estimators=n_estimators, random_state=0))

    # Seeded like the other estimators; without it the SGD scores vary from run to run.
    useClassifier(SGDClassifier(max_iter=1000, tol=1e-3, random_state=0))


In [None]:
# Placeholders for the notebook-wide state that the helper functions overwrite via `global`:
# the full dataset (X, y) and its train / validation / test partitions.
X, y = None, None
X_train, y_train = None, None
X_val, y_val = None, None
X_test, y_test = None, None


In [None]:
# Benchmark every estimator on CountVectorizer features, once per dataset file.
for i, dataset in enumerate(datasets):
    # Log the run index and the current Hong Kong time so long runs can be timed from the output.
    print(i)
    print(f"{datetime.now(tz=pytz.timezone('Asia/Hong_Kong')):%Y-%m-%d %H:%M:%S}")

    load_data(dataset)

    print('CountVectorizer()')

    # Re-split before vectorizing: extract_feature overwrites the split globals in place.
    split_dataset(X, y)
    extract_feature(CountVectorizer(), X)
    useAllClassifier()


In [None]:
# Benchmark every estimator on TfidfVectorizer features, once per dataset file.
for i, dataset in enumerate(datasets):
    # Log the run index and the current Hong Kong time so long runs can be timed from the output.
    print(i)
    print(f"{datetime.now(tz=pytz.timezone('Asia/Hong_Kong')):%Y-%m-%d %H:%M:%S}")

    load_data(dataset)

    print('TfidfVectorizer()')

    # Re-split before vectorizing: extract_feature overwrites the split globals in place.
    split_dataset(X, y)
    extract_feature(TfidfVectorizer(), X)
    useAllClassifier()


In [None]:
# Benchmark every estimator on HashingVectorizer features (50000 columns), once per dataset file.
# NOTE(review): extract_feature converts the features to dense arrays, so each split becomes
# (n_samples, 50000) — confirm this fits in memory for the dataset in use.
# NOTE(review): HashingVectorizer presumably yields signed values by default, which
# MultinomialNB is expected to reject (useClassifier would log the error) — confirm.
for i, dataset in enumerate(datasets):
    # Log the run index and the current Hong Kong time so long runs can be timed from the output.
    print(i)
    print(f"{datetime.now(tz=pytz.timezone('Asia/Hong_Kong')):%Y-%m-%d %H:%M:%S}")

    load_data(dataset)

    print('HashingVectorizer(n_features=50000)')

    # Re-split before vectorizing: extract_feature overwrites the split globals in place.
    split_dataset(X, y)
    extract_feature(HashingVectorizer(n_features=50000), X)
    useAllClassifier()
