In [1]:
import numpy as np
import pandas as pd
import random
import copy
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext import data
import torch.utils.data as d
import tokenization_dim_reduction as tdr
import ngrams as ng

data_dir = r'D:\Researching Data\Youtube data\USvideos.csv'
torch.backends.cudnn.deterministic = True
TEXT = data.Field(tokenize = 'spacy')
LABEL = data.LabelField(dtype = torch.float)

In [2]:
_, dtext, dlabel = tdr.select_col(data_dir, tdr.cols_t4)
new_TEXT = tdr.combine_text(dtext, 1, [0,2])
#new_label = tdr.multi_to_binary(dlabel, 25) # politics
new_label = tdr.multi_to_binary(dlabel, 24) # entertainments
new_arr = np.concatenate((new_TEXT.reshape([len(new_TEXT),1]), new_label), axis=1)

In [3]:
txt_tfidf = tdr.tfidf_tokenization(new_TEXT)
#torch_tfidf = torch.from_numpy(txt_tfidf.toarray())
new_TEXT = txt_tfidf.toarray()
new_label = tdr.multi_to_binary(dlabel, 24)
new_arr = np.concatenate((new_TEXT, new_label), axis=1)

In [4]:
# select 5000 words with highest average tfidf
top5k_indices = np.argsort(np.apply_along_axis(np.mean, 0, new_TEXT))[-5000:]
new_TEXT = new_TEXT[:, top5k_indices]
new_arr = np.concatenate((new_TEXT, new_label), axis=1)
print("the current shape of the reduced data is ", new_TEXT.shape)

the current shape of the reduced data is  (6351, 5000)


In [5]:
def dimensional_reduction(df, k, get_test_df=False, test_df=""):
    '''
    The function is designed to make dimensional reduction with SVD method
    Inputs:
        df: original matrix
        k: the number of singular values taken
        y: label vector
        allm: if True, return the components of economic SVD
    Returns: approximated df with k singular values
    '''
    U_k = np.linalg.svd(df)[0][:, :k]
    sigma_k = np.linalg.svd(df)[1][:k]
    Vt_k = np.linalg.svd(df)[2][:k, :]

    reduced_df = (Vt_k[:k, :].dot(df.T)).T

    if get_test_df:
        return reduced_df, (Vt_k[:k, :].dot(test_df.T)).T
    
    return reduced_df

In [6]:
# split train, validation, test
def split_train_test(dt_size, train_valid_test_r):
    '''
    The function randomly selects the indices for
    training, validation, and testing sets
    Inputs:
        dt_size: number of rows
        train_valid_test_r: tuple of ratios
    Return: indices for each subset
    '''
    train_size = int(dt_size * train_valid_test_r[0] // 1)
    valid_size = int(dt_size * train_valid_test_r[1] // 1)
    test_size = int(dt_size - train_size - valid_size)
    print("the size of train, valid and test data are", train_size, valid_size, test_size)
    
    full_indices = np.arange(0, dt_size, 1)
    train_indices = np.random.permutation(full_indices)[:train_size]
    
    sub_indices = set(full_indices) - set(train_indices)
    valid_indices = np.random.permutation(list(sub_indices))[:valid_size]
    
    sub_indicest = set(sub_indices) - set(valid_indices)
    test_indices = np.array(list(sub_indicest))
    
    return train_indices, valid_indices, test_indices 

In [7]:
def split_reduced_data(path, arr, y, k=500, train_valid_test_r=(0.4, 0.4, 0.2)):
    '''
    The function split the data to train, validation and test
    sets with randomly selected indices and save them to seperated
    csv files
    Inputs:
        path: directory of the saved files
        arr: the whole dataset
        train_valid_test_r: tuple of ratios
    '''
    train_indices, valid_indices, test_indices = split_train_test(arr.shape[0], train_valid_test_r)
    
    red_train, red_valid = dimensional_reduction(arr[train_indices], k, True, arr[valid_indices])
    red_train, red_test = dimensional_reduction(arr[train_indices], k, True, arr[test_indices])
    
    pd.DataFrame(red_train).to_csv(path + "\\train.csv", header=None, index=None)
    pd.DataFrame(red_valid).to_csv(path + "\\valid.csv", header=None, index=None)
    pd.DataFrame(red_test).to_csv(path + "\\test.csv", header=None, index=None)
    
    return (red_train, y[train_indices], red_valid, y[valid_indices], 
            red_test, y[test_indices])

In [8]:
path = r'D:\Researching Data\Youtube data\tfidf'
X_train, y_train, X_valid, y_valid, X_test, y_test =\
split_reduced_data(path, new_TEXT, new_label, 500, train_valid_test_r=(0.4, 0.4, 0.2))

the size of train, valid and test data are 2540 2540 1271


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterGrid

parameter_dict = {"random_forest":{'n_estimators': [1, 10,100], 'max_depth': [5,50],
                              'max_features': ['sqrt','log2'],'min_samples_split': [2,10]},
             "logistics": {'penalty': ["l1",'l2'], 'C': [0.001,0.1,1]}}

clfs = {'random_forest': RandomForestClassifier(n_jobs=2),
        'logistics': LogisticRegression(C=1e5, solver='liblinear')}

In [10]:
def classifier_developer(method, clfs, parameter_dict):
    '''
    This function is used to generate possible combination of hyperparameters of
    given classifiers
    Inputs:
        method: specific classifiers
        clfs: dictionary of classifiers
        parameter_dict: parameters dictionary for the classifiers
    Returns: list of all possible combination of parameters
    '''
    parameters = parameter_dict[method]
    para_list = ParameterGrid(parameters)

    return para_list

In [11]:
def eval_acc(y_pred, y_real):
    '''
    The helper function to evaluate the accuracy
    of true label and predicted probability
    '''
    pdy1 = np.zeros(len(y_pred))
    pdy1[y_pred >= 0.5] = 1
    pdy2 = np.zeros(len(y_pred))
    pdy2[pdy1 == y_real[:, 0]] = 1
    
    return sum(pdy2) / len(pdy2)

In [12]:
def ml_evaluate(clfs, parameter_dict, X_train, y_train, X_valid, y_valid,
                X_test, y_test):
    '''
    The function evaluated the performance of each machine learning
    model type and with the hyper-parameters defined in the parameter-
    dict. The best accuracy of each model type is presented in the
    output.
    Inputs:
        clfs: dictionary of classifiers
        parameter_dict: dictionary of parameters for each classifier
        X_train, y_train, X_valid, y_valid, X_test, y_test
    Returns: dictionary with model type, parameters and best accuracy
    '''
    outputs_dict = {}
    for method, _ in clfs.items():
        if method in parameter_dict:
            print("operation of {} method begins".format(method))
            para_list = classifier_developer(method, clfs, parameter_dict)
            best_acc = 0
            for para in para_list:
                clf = clfs[method].set_params(**para)
                model_name = method + " with parameters : " + str(clf.get_params())
                model = clf.fit(X_train, y_train[:, 0])
                
                y_v_predp = model.predict_proba(X_valid)[:, 1]
                y_t_predp = model.predict_proba(X_test)[:, 1]
                acc_v = eval_acc(y_v_predp, y_valid)
                acc_t = eval_acc(y_t_predp, y_test)

                if acc_v > best_acc:
                    best_acc = acc_t
                    best_model = model_name
            
            outputs_dict[method] = (best_model, best_acc)
            print("the best accuracy for method: ", method,  " is ", best_acc)

    return outputs_dict

In [14]:
ml_evaluate(clfs, parameter_dict, X_train, y_train, X_valid, y_valid, X_test, y_test)

operation of random_forest method begins
the best accuracy for method:  random_forest  is  0.8387096774193549
operation of logistics method begins
the best accuracy for method:  logistics  is  0.8394964594807238


{'random_forest': ("random_forest with parameters : {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 50, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': 2, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}",
  0.8387096774193549),
 'logistics': ("logistics with parameters : {'C': 1, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l1', 'random_state': None, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}",
  0.8394964594807238)}