### ReadMe

To run the model, call *main()*, which is defined at the end of this document.
##### Video
Video link: https://drive.google.com/file/d/1DyASFI9fA9KTf-JdRDTkVtSekQ_mwSw1/view?usp=sharing

In [None]:
'''
Basic Settings
'''
import pandas as pd
import numpy as np
import gc
from keras.preprocessing import sequence
import os
import datetime



def display_set():
    """Widen NumPy and pandas console output so large frames print untruncated."""
    np.set_printoptions(linewidth=150, suppress=True, precision=5)
    pandas_options = {
        'display.width': 10000,
        'display.max_colwidth': 1000,
        'display.max_rows': 2000,
        'display.max_columns': 500,
    }
    for option_name, option_value in pandas_options.items():
        pd.set_option(option_name, option_value)

### NLP Training
I extract the *full_text* feature and train an NLP model on it. A tokenizer keeps only the most frequent words in the data, and I experimented with LSTM, GRU and SimpleRNN layers; the final version uses only SimpleRNN. The output is the predicted probability for each sample.

In [None]:
def train_data_process(train, test, max_words=800, maxlen=100, test_amplify=1):
    """Fit a tokenizer and convert the training texts to padded id sequences.

    The text fed to the tokenizer is the ``ID`` column joined to ``full_text``
    with a single space. The tokenizer vocabulary is fitted on the training
    texts plus the (optionally resampled) test texts.

    Args:
        train: DataFrame with string columns ``ID`` and ``full_text`` and a
            ``Target`` column.
        test: DataFrame with string columns ``ID`` and ``full_text``.
        max_words: ``num_words`` passed to the keras ``Tokenizer`` (only the
            most frequent words get ids).
        maxlen: Length every sequence is padded/truncated to.
        test_amplify: Weight of the test texts when fitting the tokenizer.
            ``> 1`` appends ``len(test) * (test_amplify - 1)`` rows sampled
            with replacement; ``< 1`` keeps a random fraction; ``1`` uses the
            test texts as they are.

    Returns:
        Tuple ``(X_train_tokens_pad, y_train, tokenizer)``.
    """
    from keras.preprocessing.text import Tokenizer

    train = train.fillna(" ")
    test = test.fillna(" ")
    # Sanity output: per-column count of remaining nulls and the label values.
    print(np.sum(np.array(train.isnull()), axis=0))
    print('train[\'Target\'].unique()=', train['Target'].unique())

    X_train = train['ID'] + ' ' + train['full_text']
    y_train = train['Target']
    X_test = test['ID'] + ' ' + test['full_text']

    tokenizer = Tokenizer(num_words=max_words, lower=True, split=' ')
    if test_amplify > 1:
        # Series.append was removed in pandas 2.0; pd.concat is the supported
        # equivalent and works on older versions too.
        extra = X_test.sample(int(len(X_test) * (test_amplify - 1)), replace=True)
        X_test = pd.concat([X_test, extra])
    elif test_amplify < 1:
        X_test = X_test.sample(int(len(X_test) * test_amplify))
    tokenizer.fit_on_texts(list(X_train) + list(X_test))  # tokenizer training

    X_train_tokens = tokenizer.texts_to_sequences(X_train)
    # Pad the id sequences so they all have the same length.
    X_train_tokens_pad = sequence.pad_sequences(X_train_tokens, maxlen=maxlen, padding='post')
    return X_train_tokens_pad, y_train, tokenizer


def model_create(max_words, maxlen, units=64, embeddings_dim=50, clf_name='LSTM'):
    """Build an Embedding -> recurrent layer -> sigmoid Dense keras model.

    Args:
        max_words: Vocabulary size (``input_dim`` of the embedding).
        maxlen: Length of the padded input sequences.
        units: Number of units in the recurrent layer.
        embeddings_dim: Dimension of the word embedding.
        clf_name: Recurrent layer type: ``'LSTM'``, ``'GRU'`` or
            ``'SimpleRNN'``.

    Returns:
        The uncompiled ``Sequential`` model (its summary is printed).

    Raises:
        ValueError: If ``clf_name`` is not one of the supported layer types.
            Previously an unknown name silently produced a model without any
            recurrent layer.
    """
    supported = ('LSTM', 'GRU', 'SimpleRNN')
    if clf_name not in supported:
        raise ValueError(
            'Unknown clf_name {!r}; expected one of {}'.format(clf_name, supported))

    from keras.models import Sequential
    from keras.layers import Embedding, LSTM, GRU, SimpleRNN, Dense

    recurrent_cls = {'LSTM': LSTM, 'GRU': GRU, 'SimpleRNN': SimpleRNN}[clf_name]

    model = Sequential()
    model.add(Embedding(input_dim=max_words,  # Size of the vocabulary
                        output_dim=embeddings_dim,  # word-embedded dimensional
                        input_length=maxlen))
    model.add(recurrent_cls(units=units))
    model.add(Dense(units=1, activation='sigmoid'))
    model.summary()
    return model


def train_model(model, X_train_tokens_pad, y_train, embeddings_dim, max_words, clf_name='', plotting=True):
    """Compile, fit and save ``model``; optionally plot the training history.

    The model is compiled with Adam / binary cross-entropy / accuracy, fitted
    for 10 epochs (batch size 128, 20% validation split) and saved to an
    ``.h5`` file in the current working directory whose name encodes
    ``max_words`` and ``embeddings_dim``.

    Args:
        model: Uncompiled keras model.
        X_train_tokens_pad: Padded token-id sequences.
        y_train: Training labels.
        embeddings_dim: Used only in the saved file name.
        max_words: Used only in the saved file name.
        clf_name: Title of the history plot.
        plotting: When true, show a plot of the fit history.

    Returns:
        The fitted model (same object that was passed in).
    """
    compile_args = dict(optimizer='adam',
                        loss='binary_crossentropy',
                        metrics=['accuracy'])
    model.compile(**compile_args)
    fit_log = model.fit(X_train_tokens_pad, y_train,
                        batch_size=128, epochs=10, validation_split=0.2)

    checkpoint_name = "full_text_cat_lstm_mw{}-dim{}.h5".format(max_words, embeddings_dim)
    model.save(checkpoint_name)  # save models

    if not plotting:
        return model

    # matplotlib is only needed (and imported) when a plot is requested
    from matplotlib import pyplot as plt
    curves = pd.DataFrame(fit_log.history)
    curves.plot(figsize=(8, 5))
    plt.grid(True)
    plt.title(clf_name)
    plt.show()
    return model


def one_model_train_process(stack_predict, X_train_tokens_pad, test, tokenizer, y_train, max_words, maxlen,
                            embeddings_dim, clf, test_amplify, plotting=False, route='2022_tweet/new/'):
    """Train one recurrent model, predict on ``test`` and record the result.

    Args:
        stack_predict: Dict collecting results per classifier; updated in
            place and also returned.
        X_train_tokens_pad: Padded training sequences.
        test: DataFrame with ``ID`` and ``full_text`` columns to predict on.
        tokenizer: Fitted tokenizer used to encode ``test``.
        y_train: Training labels.
        max_words: Vocabulary size of the embedding.
        maxlen: Padded sequence length.
        embeddings_dim: Embedding dimension.
        clf: Recurrent layer name passed to ``model_create``.
        test_amplify: Only used in the prediction file name.
        plotting: Whether to plot the training history.
        route: Directory prefix for the prediction CSV. The body previously
            read an undefined name ``route`` (NameError); it is now an
            explicit parameter defaulting to the project data directory.

    Returns:
        ``stack_predict`` with ``stack_predict[clf] = {'label': DataFrame,
        'prob': probabilities}`` added.
    """
    print('\n##################################')
    print('{}:'.format(clf))
    model1 = model_create(max_words, maxlen, units=64, embeddings_dim=embeddings_dim, clf_name=clf)
    # Forward the classifier name so the history plot gets a title.
    model1 = train_model(model1, X_train_tokens_pad, y_train, embeddings_dim, max_words,
                         clf_name=clf, plotting=plotting)
    test_tokens_pad = test_data_process(test, tokenizer, maxlen=maxlen)
    current_time = datetime.datetime.today().strftime('%Y%m%d_%H%M')
    file_name = 'tweet-use_-amplify{}-maxlen{}-clf-{}-{}'.format(test_amplify, maxlen, clf, current_time)
    test_df, pred_prob = test_result(model=model1, X_test_tokens_pad=test_tokens_pad, save_path=route,
                                     file_name=file_name)
    stack_predict[clf] = {'label': test_df, 'prob': pred_prob}
    # Drop the model before the next one is built to keep peak memory down.
    del model1
    gc.collect()
    return stack_predict


def stacking(maxlen, embeddings_dim, max_words, route='2022_tweet/new/'):
    """Evaluate one hyper-parameter combination on a hold-out split.

    Reads ``train_dataset.csv`` from ``route``, shuffles it, trains on the
    first 2560 rows, predicts the remaining rows, and records the hold-out
    accuracy via ``record_model_result``.

    Args:
        maxlen: Padded sequence length.
        embeddings_dim: Embedding dimension.
        max_words: Tokenizer vocabulary size.
        route: Directory prefix holding the data and the result record.
    """
    n_train = 2560
    test_amplify = 5

    train_data = pd.read_csv(route + 'train_dataset.csv')
    # Rows without a label cannot be trained on or scored (main() drops them
    # the same way); without this they would become ' ' labels below.
    train_data = train_data.dropna(subset=['Target'])
    train_data = train_data.fillna(' ')

    # .copy() so the column assignments below do not write into a slice.
    train = train_data[['ID', 'full_text', 'Target']].copy()
    train['ID'] = train['ID'].apply(str)
    train['full_text'] = train['full_text'].apply(str)
    train = train.sample(frac=1, replace=False).reset_index(drop=True)

    # Positional split. The previous .loc[:2560] / .loc[2560:] slices are both
    # label-inclusive, so row 2560 ended up in the training AND hold-out sets.
    train1 = train.iloc[:n_train]
    holdout = train.iloc[n_train:]
    test = holdout[['ID', 'full_text']]
    test_label = holdout[['ID', 'Target']]

    X_train_tokens_pad, y_train, tokenizer = train_data_process(train1, test=test, max_words=max_words,
                                                                maxlen=maxlen, test_amplify=test_amplify)
    stack_predict = {}
    # for clf in ['LSTM', 'GRU', 'SimpleRNN']:
    for clf in ['SimpleRNN']:
        stack_predict = one_model_train_process(stack_predict, X_train_tokens_pad, test, tokenizer, y_train,
                                                max_words, maxlen, embeddings_dim, clf, test_amplify)

    # Soft vote: average the per-model probabilities, threshold at 0.5.
    mean_prob = np.mean([np.atleast_1d(stack_predict[ml]['prob']) for ml in stack_predict], axis=0)
    stack_predict_vote = (mean_prob > 0.5).astype(int).tolist()

    stack_df = pd.DataFrame({'ID': list(range(1, len(test) + 1)), 'Target': stack_predict_vote})
    acc_value = test_accuracy(stack_df, test_label)
    record_model_result(None, acc_value,
                        model_name='stacking-tuning-maxlen{}-maxw{}-dim{}'.format(maxlen, max_words, embeddings_dim),
                        model_path=route)
    del stack_df, X_train_tokens_pad, y_train, tokenizer
    gc.collect()


def stacking_try_para_main():
    """Run ``stacking`` over the full grid of maxlen / embedding dim / vocabulary size."""
    maxlen_grid = range(50, 301, 50)       # 50, 100, ..., 300
    dim_grid = range(40, 81, 10)           # 40, 50, ..., 80
    vocab_grid = range(300, 901, 100)      # 300, 400, ..., 900
    for seq_len in maxlen_grid:
        for dim in dim_grid:
            for vocab in vocab_grid:
                print('\n=================================================')
                print('maxlen=', seq_len, 'embeddings dim=', dim, 'max words=', vocab)
                stacking(seq_len, dim, vocab)


def nlp_feature(train_data, test_data, route):
    """Train the SimpleRNN text model and return its probabilities for ``test_data``.

    Uses fixed hyper-parameters (maxlen=100, max_words=500, embeddings_dim=50,
    test_amplify=5). The probabilities are also written to a timestamped CSV
    under ``route``.

    Args:
        train_data: DataFrame with ``ID``, ``full_text`` and ``Target``.
        test_data: DataFrame with ``ID`` and ``full_text``.
        route: Directory prefix for the output CSV.

    Returns:
        DataFrame with ``ID`` (1..n, in the row order of ``test_data``) and
        ``Target`` (predicted probability).
    """
    maxlen = 100
    max_words = 500
    embeddings_dim = 50
    test_amplify = 5

    # .copy() so the string conversion below never writes into a slice of the
    # caller's frames (avoids SettingWithCopyWarning / accidental mutation).
    train = train_data[['ID', 'full_text', 'Target']].copy()
    test = test_data[['ID', 'full_text']].copy()
    print(train['Target'].unique())
    for frame in (train, test):
        for col in ('ID', 'full_text'):
            frame[col] = frame[col].apply(str)

    train = train.sample(frac=1, random_state=404, replace=False).reset_index(drop=True)
    X_train_tokens_pad, y_train, tokenizer = train_data_process(train, test=test, max_words=max_words,
                                                                maxlen=maxlen, test_amplify=test_amplify)
    stack_predict = {}
    # for clf in ['LSTM', 'GRU', 'SimpleRNN']:
    for clf in ['SimpleRNN']:
        stack_predict = one_model_train_process(stack_predict, X_train_tokens_pad, test, tokenizer, y_train, max_words,
                                                maxlen, embeddings_dim, clf, test_amplify)

    stack_df = pd.DataFrame({'ID': list(range(1, len(test) + 1)),
                             'Target': stack_predict['SimpleRNN']['prob']})
    current_time = datetime.datetime.today().strftime('%Y%m%d_%H%M')
    out_name = 'submit_stacking-use_tweet_maxlen{}-maxw{}-dim{}-{}.csv'.format(maxlen, max_words, embeddings_dim,
                                                                               current_time)
    # `line_terminator` was renamed to `lineterminator` in pandas 1.5 and the
    # old spelling was removed in pandas 2.0.
    stack_df.to_csv(route + out_name, index=False, lineterminator="\n")
    return stack_df

### Model: second part
After obtaining the NLP output, I treat it as a new feature. First I do feature engineering, that is, I convert string features to integer values. I also look for special words in the *Additional.Comments* feature and create dummy features from them.

For training, I used a random forest classifier. I grouped the dataset by *sample_name* and trained one classifier per group, on the assumption that samples within the same group (same *sample_name*) are more similar to each other than to samples from different groups.

In [None]:
def feature_engineering(train_shift1):
    """Build the numeric feature matrix for the random-forest stage.

    Steps:
      * cast the Still.Exists / In.English / Sarcasm columns to int,
      * add 0/1 dummies flagging "Jews"/"jews" and "Israel"/"israel" in the
        two Additional.Comments columns,
      * select the model columns, turn blank (' ') cells into NaN and fill
        every NaN with 0.5.

    The input frame is not modified; all work happens on a copy.

    Args:
        train_shift1: DataFrame containing the annotation columns (``.x`` and
            ``.y`` variants), ``Additional.Comments.x/.y`` and ``nlp_feature``.

    Returns:
        DataFrame with the 19 feature columns and no missing values.
    """
    def _is_blank(value):
        # ' ' is the placeholder the pipeline uses for missing cells.
        return isinstance(value, str) and value == ' '

    def _mentions(text, *needles):
        # str() guards against non-string cells (e.g. NaN floats).
        text = str(text)
        return 1 if any(needle in text for needle in needles) else 0

    # Work on a copy: the input is usually a filtered slice, and writing new
    # columns into it raises SettingWithCopyWarning and mutates caller data.
    frame = train_shift1.copy()

    for col in ['Still.Exists.x', 'Still.Exists.y', 'In.English.x', 'In.English.y', 'Sarcasm.x', 'Sarcasm.y']:
        # Blank placeholders are left alone (int(' ') would raise) so that
        # they receive the 0.5 fill below like every other column.
        frame[col] = frame[col].apply(lambda x: x if _is_blank(x) else int(x))

    for side in ('x', 'y'):
        comments = frame['Additional.Comments.' + side]
        frame['Additional.Comments.{}_jews'.format(side)] = comments.apply(
            lambda x: _mentions(x, 'Jews', 'jews'))
        frame['Additional.Comments.{}_israel'.format(side)] = comments.apply(
            lambda x: _mentions(x, 'israel', 'Israel'))

    x_list = ['Still.Exists.x', 'Still.Exists.y', 'In.English.x', 'In.English.y',
              'Sarcasm.x', 'Sarcasm.y', 'Additional.Comments.x_jews', 'Additional.Comments.y_jews',
              'Additional.Comments.x_israel', 'Additional.Comments.y_israel',
              'Sentiment.Rating.x', 'Sentiment.Rating.y', 'Calling.Out.x', 'Calling.Out.y', 'Is.About.the.Holocaust.x',
              'Is.About.the.Holocaust.y', 'Is.About.The.Holocaust.x', 'Is.About.The.Holocaust.y', 'nlp_feature']
    train_X = frame[x_list].copy()
    for col in train_X.columns:
        # np.nan: the np.NAN alias was removed in NumPy 2.0.
        train_X[col] = train_X[col].apply(lambda x: np.nan if _is_blank(x) else x)
    # 0.5 is the neutral fill value for unknown entries.
    return train_X.fillna(0.5)


def group_training(train_data, train_shift):
    """Fit one random-forest classifier per ``sample_name`` group.

    Args:
        train_data: DataFrame whose ``sample_name`` values define the groups.
        train_shift: DataFrame holding the training rows (including
            ``sample_name``, ``Target`` and the columns needed by
            ``feature_engineering``).

    Returns:
        Dict mapping each sample name to its fitted ``RandomForestClassifier``.
        The training accuracy of every group is printed.
    """
    from sklearn.ensemble import RandomForestClassifier

    # Built up as groups are fitted; the old np.NAN placeholder values relied
    # on an alias that was removed in NumPy 2.0 and were always overwritten.
    clf_record = {}
    for sn in train_data['sample_name'].value_counts().keys():
        # .copy() so downstream column writes never touch a slice of train_shift.
        train_sample = train_shift[train_shift['sample_name'] == sn].copy()
        train_X = feature_engineering(train_sample)
        train_y = train_sample['Target']
        clf = RandomForestClassifier(n_estimators=20, max_depth=3, random_state=703)
        clf.fit(train_X, train_y)
        print(sn, clf.score(train_X, train_y))
        clf_record[sn] = clf
    return clf_record

### Evaluation Functions
These are functions for evaluation.

In [None]:
def test_data_process(data, tokenizer, maxlen=100):
    """Encode ``data`` with a fitted tokenizer and pad to ``maxlen``.

    The encoded text is the ``ID`` column joined to ``full_text`` with a
    single space; missing values are replaced by a blank first.

    Returns:
        The post-padded array of token-id sequences.
    """
    filled = data.fillna(" ")
    texts = filled['ID'] + ' ' + filled['full_text']
    token_ids = tokenizer.texts_to_sequences(texts)
    return sequence.pad_sequences(token_ids, maxlen=maxlen, padding='post')


def test_result(model, X_test_tokens_pad, save_path, file_name):
    pred_prob = model.predict(X_test_tokens_pad).squeeze()
    pred_class = np.asarray(pred_prob > 0.5).astype(np.int32)
    output = pd.DataFrame({'Id': [i for i in range(1, len(pred_class) + 1)], 'Flag': pred_class})
    output.to_csv('{}{}.csv'.format(save_path, file_name), index=None, line_terminator="\n")
    return output, pred_prob


def test_accuracy(test_result, test_label):
    """Print and return the accuracy of ``test_result['Target']`` against ``test_label['Target']``.

    Rows are compared by position, not by ID.
    """
    from sklearn.metrics import accuracy_score
    truth = test_label['Target'].tolist()
    predicted = test_result['Target'].tolist()
    score = accuracy_score(truth, predicted)
    print('accuracy = ', score)
    return score


def record_model_result(model, acc_value, model_name, model_path=''):
    """Store ``acc_value`` under ``model_name`` in the pickled result record.

    The record is a dict saved as ``model_prediction_record.npy`` under
    ``model_path``; it is created on first use and an existing entry with the
    same name is overwritten. ``model`` is accepted but not used.
    """
    record_file = model_path + 'model_prediction_record.npy'
    if os.path.isfile(record_file):
        # The record is a pickled dict wrapped in a 0-d object array.
        record = np.load(record_file, allow_pickle=True).item()
    else:
        record = {}
    record[model_name] = {'acc': acc_value}
    np.save(record_file, record, allow_pickle=True)




def test_accuracy(test_result, test_label):
    """Compute, print and return the positional accuracy of the predictions.

    NOTE: this file defines ``test_accuracy`` twice with the same behaviour;
    this later definition is the one bound to the name once the file has run.
    """
    from sklearn.metrics import accuracy_score
    accuracy_value = accuracy_score(
        test_label['Target'].tolist(),   # y_true
        test_result['Target'].tolist(),  # y_pred
    )
    print('accuracy = ', accuracy_value)
    return accuracy_value


### Main Function
At prediction time, each sample is scored by the model trained for its *sample_name* group.

In [None]:
def main():
    """Run the whole pipeline: tuning, NLP feature, per-group forests, submission.

    Steps:
      1. Grid-search the text-model hyper-parameters and print the best
         recorded combination.
      2. Train the text model and attach its probability to the training rows
         as ``nlp_feature``.
      3. Fit one random forest per ``sample_name`` group.
      4. Predict the test set group by group and write the submission CSV.

    Raises:
        KeyError: If a test row's ID received no prediction (its
            ``sample_name`` does not occur in the training data).
    """
    display_set()
    route = '2022_tweet/new/'
    train_data = pd.read_csv(route + 'train_dataset.csv')
    test_data = pd.read_csv(route + 'test_dataset.csv')
    train_data = train_data.dropna(subset=['Target'])

    ####################################
    # NLP feature parameter tuning
    ###################################
    stacking_try_para_main()
    record = np.load(route + 'model_prediction_record.npy', allow_pickle=True).item()
    print(max(record.items(), key=lambda item: item[1]['acc']))
    # These three values are only used to label the submission file below.
    maxlen = 50
    max_words = 500
    embeddings_dim = 70

    ####################################
    # NLP Feature Create
    ####################################
    train_data = train_data.fillna(' ')
    test_data = test_data.fillna(' ')
    nlp_feature_data = nlp_feature(train_data, train_data, route)
    # nlp_feature returns rows in the positional order of its second argument
    # with a fresh 0..n-1 index, while train_data keeps its original labels
    # (with gaps after dropna). Attach by position, then shuffle, so that
    # index alignment cannot mismatch rows or introduce NaN.
    train_with_nlp = train_data.copy()
    train_with_nlp['nlp_feature'] = nlp_feature_data['Target'].to_numpy()  # Probability
    train_shift = train_with_nlp.sample(frac=1, random_state=404, replace=False)

    #####################################
    # Model second part
    #####################################
    clf_record = group_training(train_data, train_shift)

    ######################################
    # Prediction
    ######################################
    test_nlp_feature_data = nlp_feature(train_data, test_data, route)
    test_data['nlp_feature'] = test_nlp_feature_data['Target'].to_numpy()
    test_pred = {}
    for sn in train_data['sample_name'].value_counts().keys():
        test_sample = test_data[test_data['sample_name'] == sn].copy()
        if test_sample.empty:
            # Nothing to predict for this group; predicting on an empty
            # feature matrix would raise.
            continue
        test_X = feature_engineering(test_sample)
        pred_Y = clf_record[sn].predict(test_X)
        # One pass instead of rebuilding the ID list for every row.
        test_pred.update(zip(test_sample['ID'].tolist(), pred_Y))

    stack_df = pd.DataFrame({'ID': test_data['ID'], 'Target': np.nan})
    # Kept as float, matching the NaN-initialised column it replaces.
    stack_df['Target'] = [float(test_pred[sample_id]) for sample_id in stack_df['ID']]
    stack_df['ID'] = list(range(1, len(stack_df) + 1))
    current_time = datetime.datetime.today().strftime('%Y%m%d_%H%M')
    out_name = 'submit-sn_rf_use_tweet_maxlen{}-maxw{}-dim{}-{}.csv'.format(maxlen, max_words, embeddings_dim,
                                                                            current_time)
    # `line_terminator` was renamed to `lineterminator` in pandas 1.5 and the
    # old spelling was removed in pandas 2.0.
    stack_df.to_csv(route + out_name, index=False, lineterminator="\n")


#############################################################
#############################################################
if __name__ == '__main__':
    # Run the full pipeline only when executed as a script, not on import.
    main()