In [None]:
import json
import numpy as np
import pandas as pd
from data_tool import loadData
import data_tool
import joblib
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import random
from sklearn.metrics import f1_score
import lightgbm as lgb
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression,SGDClassifier 
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_classif
from sklearn.pipeline import make_pipeline
import csv

In [None]:
# ---- paths (plain strings; nothing is loaded in this cell) ----
feaDir = './fea_sel/fea_elmo.pkl'
labDir = './fea_sel/lab6.pkl'
modPath = './model/'

# ---- reproducibility: seed both the stdlib and the NumPy global RNGs ----
seed = 999
np.random.seed(seed)
random.seed(seed)

# ---- LightGBM parameters, consumed by ML(..., method='gdbt') ----
# To try the GPU backend, switch `device` and add
# gpu_platform_id=0 / gpu_device_id=0 (left disabled here).
gbm_params = dict(
    objective='multiclass',
    num_class=8,
    metric='multi_logloss',
    boosting='gbdt',
    num_leaves=30,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=6,
    learning_rate=0.01,
    num_iterations=2000,
    verbose=1,
    device='cpu',
    n_jobs=6,
)

In [None]:
def ML(fea_train, fea_test, lab_train, lab_test, method):
    """Fit one classifier on the training split and predict the test split.

    Parameters
    ----------
    fea_train, lab_train
        Training features and labels, passed straight to the estimator's
        ``fit`` (or to ``lgb.Dataset`` for the LightGBM branch).
    fea_test
        Features to predict on.
    lab_test
        Ground-truth labels for ``fea_test``; returned unchanged so callers
        can score the predictions.
    method : str
        One of ``'svm'``, ``'LR'``, ``'NB'``, ``'RF'`` or ``'gdbt'``
        (``'gbdt'`` is accepted as an alias for the LightGBM branch).

    Returns
    -------
    (pred, gnd)
        ``pred`` is the predicted label per test row; for LightGBM it is the
        argmax over the per-class scores returned by ``predict``.
        ``gnd`` is ``lab_test``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    # Lazily-constructed estimators so only the requested one is instantiated.
    sk_factories = {
        'svm': lambda: SVC(gamma='scale', C=5),
        'LR': lambda: LogisticRegression(),
        'NB': lambda: MultinomialNB(),
        'RF': lambda: RandomForestClassifier(criterion='entropy'),
    }
    gbm_names = ('gdbt', 'gbdt')

    if method in sk_factories:
        clf = sk_factories[method]()
        clf.fit(fea_train, lab_train)
        pred = clf.predict(fea_test)
    elif method in gbm_names:
        # Build the dataset from the *arguments*; the previous version read
        # the notebook globals x_train / y_train by mistake.
        lgb_train = lgb.Dataset(fea_train, lab_train)
        booster = lgb.train(gbm_params, lgb_train, valid_sets=[lgb_train])
        # Multiclass boosters return one score per class; take the best class.
        pred = np.argmax(booster.predict(fea_test), axis=1)
    else:
        raise ValueError(
            'Unknown method {!r}; expected one of {}'.format(
                method, sorted(sk_factories) + list(gbm_names)))

    gnd = lab_test
    return pred, gnd

def writer_csv(logPath, logging):
    """Append a single row to a CSV file.

    Parameters
    ----------
    logPath : str
        Path of the CSV file; it is created if missing and appended to
        otherwise.
    logging : iterable
        The field values making up the row to write.

    Notes
    -----
    Rows are terminated with a bare ``'\\r'`` (unchanged from the original
    format). The file is opened with ``newline=''`` as the ``csv`` module
    requires, and via ``with`` so the handle is closed even if writing fails.
    """
    with open(logPath, 'a', newline='') as f:
        csv.writer(f, lineterminator='\r').writerow(logging)

In [None]:
# =============================================================================
#     0. Loading feature and split
# =============================================================================
# train = joblib.load('./fea_sel/text/fea-2_tr.pkl')
tfidf_tr = joblib.load('./fea_sel/tfidf/tfidf_train-832.pkl')
bow_tr = joblib.load('./fea_sel/BOW/bow_train-832.pkl')
label8 = joblib.load('./fea_sel/label/label8_ver2.pkl')
#    label8=label8[0].apply(lambda x:int(x))
#    leng = 1000

FEATURE = {'tfidf':tfidf_tr,'bow':bow_tr}
LABEL = {'lab8':label8}
best_result ={'f1':0}

# The only test matrix loaded by this notebook is the BOW one, so the
# submission must come from the classifier trained on the BOW features.
# (Previously every classifier, including the TF-IDF one, predicted on the
# BOW test matrix and appended its rows to the same CSV.)
SUBMISSION_FEATURE = 'bow'
SUBMISSION_TEST_PATH = './fea_sel/BOW/bow_test-832.pkl'
submission_clf = None

# Class index -> emotion name used in the submission file.
EMOTION = {0: 'sadness', 1: 'disgust', 2: 'anticipation', 3: 'joy',
           4: 'trust', 5: 'anger', 6: 'fear', 7: 'surprise'}

for fea_name, feature in FEATURE.items():
    for lab_name, label in LABEL.items():
        leng=100000
#            feature = feature[:leng]
#            label = label[:leng]
        x_train, x_valid, y_train, y_valid = train_test_split(
            feature, label, random_state=seed, test_size=0.4)
        # NOTE(review): 'squared_loss' was renamed 'squared_error' in
        # scikit-learn 1.0 and later removed; left as-is so the notebook keeps
        # running on the version it was written for -- confirm the pinned version.
        clf = SGDClassifier(loss='squared_loss', max_iter=1000, tol=1e-3, n_jobs=6,
                            early_stopping=True, learning_rate='adaptive', eta0=0.001,
                            penalty='elasticnet', class_weight="balanced")
        # for pc in range(5, 25, 5):

            # clf = SVC(gamma='scale',C = 5)
            # pipe_clf = make_pipeline(SelectPercentile(f_classif, percentile=pc/100),clf)
            # pipe_clf.fit(x_train, y_train)
            # print('Now Selection Percentile is : ', pc)
            # score = pipe_clf.score(x_valid, y_valid)
            # pred = pipe_clf.predict(x_valid)
            # gnd = y_valid
            # cm = confusion_matrix(gnd, pred)
#            clf = SVC(gamma='scale',C = 5)
        clf.fit(x_train, y_train)
        pred = clf.predict(x_valid)
        gnd = y_valid
        cm = confusion_matrix(gnd, pred)
    # =============================================================================
    #     1. ML
    # =============================================================================
        # METHOD = ['svm','LR','NB','RF', 'gdbt']
        # for i,method in enumerate(METHOD):
        #     if i < 5:
        #         print('======== ML: {} ======'.format(method))
        #         pred,gnd = ML(x_train,x_valid,y_train,y_valid,method = method)
        #     else:
        #         print('======== DL: {} ======',format(method))
        #         # ==== 2. DNN ====

        #         # ==== 3. LSTM ====

        f1_list = data_tool.eachAccu(pred,gnd)
        # f1_score's signature is (y_true, y_pred): ground truth goes first.
        f1_overall = f1_score(gnd, pred, average='micro')
        print('\nf1_overall:{:.4}'.format(f1_overall))
        if f1_overall>best_result['f1']:
            best_result['f1'] = f1_overall
            # best_result['model'] = method
            best_result['feature'] = fea_name
            best_result['label'] = lab_name
            # best_result['pc'] = pc

        # Remember the classifier whose training features match the test matrix.
        if fea_name == SUBMISSION_FEATURE:
            submission_clf = clf

#======== Test Submission ====#
if submission_clf is None:
    raise RuntimeError(
        'No classifier was trained on the {!r} features; cannot build the '
        'submission.'.format(SUBMISSION_FEATURE))

test_data = joblib.load(SUBMISSION_TEST_PATH)
testID = joblib.load('./fea_sel/test/testID.pkl')
test_pred = submission_clf.predict(test_data)

logg = './submission2.csv'
# Mode 'w' so re-running the cell replaces the file instead of appending a
# second header and duplicate rows; one open() for the whole file instead of
# one per row. The '\r' terminator matches what writer_csv produced.
with open(logg, 'w', newline='') as f:
    w = csv.writer(f, lineterminator='\r')
    w.writerow(['id','emotion'])
    # int() normalises NumPy integer labels; an unexpected label now raises
    # KeyError rather than silently reusing the previous row's emotion.
    w.writerows([id_, EMOTION[int(emo)]] for id_, emo in zip(testID, test_pred))