In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.metrics import make_scorer, f1_score, accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support

import itertools


In [2]:
# Helpers used to preserve the results of the classifier:
# the confusion matrix and the precision/recall/F-score matrix

def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """
    Draw a confusion matrix as an annotated heat map.

    Arguments
    ---------
    cm:           confusion matrix, e.g. from sklearn.metrics.confusion_matrix
                  (rows are true labels, columns are predicted labels);
                  nested lists are accepted and converted to an array

    target_names: class names used as tick labels, for example
                  ['high', 'medium', 'low']; pass None to keep the
                  default numeric ticks

    title:        the text to display at the top of the matrix

    cmap:         matplotlib colormap for the cells; defaults to 'Blues'
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues

    normalize:    If False, plot the raw numbers
                  If True, plot each row as proportions of that row's total

    Returns
    -------
    The matplotlib.pyplot module, with the new figure as the current
    figure, so callers can chain ``.show()`` or ``.savefig(...)``.

    Usage
    -----
    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph

    Citation
    --------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """
    # .astype / .max / .shape below need an ndarray, not a nested list.
    cm = np.asarray(cm)

    # Accuracy is computed on the raw counts, before any normalisation.
    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy
    if normalize:
        # A class with no true samples has a row total of 0; dividing by it
        # would fill the row (and the colour threshold below) with NaN.
        # Such rows are left at 0 instead.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # Cells darker than the threshold get white text so the number stays readable.
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.tight_layout()
    return plt

In [3]:
##saving the classification report
def pandas_classification_report(y_true, y_pred):
    """Return a classification report as a DataFrame.

    Rows are one per class (labelled by position, 0..n_classes-1) plus a
    final 'avg / total' row. Columns are 'precision', 'recall',
    'f1-score', 'support' and 'accuracy'.

    * Per-class 'accuracy' is the diagonal of the row-normalised
      confusion matrix, i.e. the share of that class's true samples
      that were predicted correctly.
    * In the 'avg / total' row, precision/recall/f1-score are macro
      averages, 'support' is the total sample count and 'accuracy' is
      the overall accuracy.

    Arguments
    ---------
    y_true: ground-truth labels
    y_pred: predicted labels, same length as y_true
    """
    per_class = precision_recall_fscore_support(
            y_true=y_true,
            y_pred=y_pred)

    cm = confusion_matrix(y_true, y_pred)
    # A label that only occurs in y_pred has a row total of 0; guard the
    # division so its per-class accuracy is 0 rather than NaN.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_normalized = np.divide(cm.astype('float'), row_totals,
                              out=np.zeros(cm.shape, dtype=float),
                              where=row_totals != 0)

    metrics_sum_index = ['precision', 'recall', 'f1-score', 'support', 'accuracy']
    class_report_df = pd.DataFrame(
        list(per_class) + [cm_normalized.diagonal()],
        index=metrics_sum_index)

    macro = precision_recall_fscore_support(
            y_true=y_true,
            y_pred=y_pred,
            average='macro')
    # With average='macro' the support slot is None, so only the first three
    # values are used and the total support is filled in from the table.
    avg = list(macro[:3])
    avg.append(class_report_df.loc['support'].sum())
    avg.append(accuracy_score(y_true, y_pred, normalize=True))

    class_report_df['avg / total'] = avg

    return class_report_df.T

In [4]:
from commen_preprocess import *

.....start_cleaning.........
hashtag britain exit hashtag rape refugee


In [5]:
from sklearn.metrics import accuracy_score
import joblib
from sklearn.model_selection import StratifiedKFold as skf


###all classifier 
from catboost import CatBoostClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn import neighbors
from sklearn import ensemble
from sklearn import neural_network
from sklearn import linear_model
import lightgbm as lgbm
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from nltk.classify.scikitlearn import SklearnClassifier


In [6]:
# Load the preprocessed test frames (stored under key 'df') for each language.
# Each store is opened read-only and closed as soon as its frame is read:
# the default append mode would silently create an empty file when the path
# is wrong, and an unclosed handle keeps the HDF5 file locked.
with pd.HDFStore('store_english_test.h5', mode='r') as store:
    df_english = store['df']
with pd.HDFStore('store_hindi_test.h5', mode='r') as store:
    df_hindi = store['df']
with pd.HDFStore('store_german_test.h5', mode='r') as store:
    df_german = store['df']

In [7]:
# Load the trained LightGBM models, one per (language, task) pair.
# German has models for tasks 1 and 2 only.
# joblib files are pickles: only load model files from a trusted source.
(
    eng_task1_model,
    eng_task2_model,
    eng_task3_model,
    hin_task1_model,
    hin_task2_model,
    hin_task3_model,
    ger_task1_model,
    ger_task2_model,
) = (
    joblib.load(model_file)
    for model_file in (
        'light_gbm_eng_task_1.joblib.pkl',
        'light_gbm_eng_task_2.joblib.pkl',
        'light_gbm_eng_task_3.joblib.pkl',
        'light_gbm_hin_task_1.joblib.pkl',
        'light_gbm_hin_task_2.joblib.pkl',
        'light_gbm_hin_task_3.joblib.pkl',
        'light_gbm_ger_task_1.joblib.pkl',
        'light_gbm_ger_task_2.joblib.pkl',
    )
)


# result = loaded_model.score(X_test, Y_test)
# print(result)

In [8]:
def get_features(df):
    """Build one flat feature list per row of ``df``.

    For every row, the sequences stored in the 'laser_commen',
    'laser_lib' and 'bert_embed' columns are concatenated in that order.

    Rows are walked by position, so the result does not depend on the
    frame's index labels (a filtered frame works without reset_index).

    Returns a list with one list per row; an empty frame gives [].
    """
    return [
        list(laser_commen) + list(laser_lib) + list(bert_embed)
        for laser_commen, laser_lib, bert_embed in zip(
            df['laser_commen'], df['laser_lib'], df['bert_embed'])
    ]




In [9]:
# Feature matrices for the full test sets (inputs to the task-1 models).
feat_english, feat_german, feat_hindi = (
    np.array(get_features(test_frame))
    for test_frame in (df_english, df_german, df_hindi)
)


In [10]:
# Task-1 decoding table: numeric class id -> submission label.
convert_reverse_label = {1.0: 'HOF', 0.0: 'NOT'}


def convert_submission(list1):
    """Translate task-1 class ids into their label strings.

    Each element of ``list1`` is looked up in ``convert_reverse_label``;
    an id that is not a key of that table raises KeyError.
    """
    return [convert_reverse_label[class_id] for class_id in list1]




In [11]:
# Task 1: run each language's model on its full test set and decode the
# numeric predictions into 'HOF' / 'NOT' labels.
eng_task1_predict, ger_task1_predict, hin_task1_predict = (
    convert_submission(list(model.predict(features)))
    for model, features in (
        (eng_task1_model, feat_english),
        (ger_task1_model, feat_german),
        (hin_task1_model, feat_hindi),
    )
)


In [12]:
# Task-1 submission tables: one (text_id, result) row per test post.
eng_submit_task1, hin_submit_task1, ger_submit_task1 = (
    pd.DataFrame(
        list(zip(list(test_frame['text_id']), labels)),
        columns=['text_id', 'result'],
    )
    for test_frame, labels in (
        (df_english, eng_task1_predict),
        (df_hindi, hin_task1_predict),
        (df_german, ger_task1_predict),
    )
)


In [13]:
# Write the task-1 submissions as tab-separated files, without the index column.
for submission_frame, output_name in (
    (eng_submit_task1, "HateMonitors_english_task_1_run_1.tsv"),
    (ger_submit_task1, "HateMonitors_germany_task_1_run_1.tsv"),
    (hin_submit_task1, "HateMonitors_hindi_task_1_run_1.tsv"),
):
    submission_frame.to_csv(output_name, sep='\t', index=False)

In [14]:
# Attach the task-1 predictions to the test frames (modifies them in place)
# so the posts predicted as 'HOF' can be selected for tasks 2 and 3.
for test_frame, task1_labels in (
    (df_english, eng_task1_predict),
    (df_german, ger_task1_predict),
    (df_hindi, hin_task1_predict),
):
    test_frame['label'] = task1_labels


In [15]:
# Keep only the posts predicted as 'HOF'. reset_index() renumbers the rows
# from 0 and keeps the old positions in a new 'index' column.
df_english_next = df_english.loc[df_english['label'].eq('HOF')].reset_index()
df_hindi_next = df_hindi.loc[df_hindi['label'].eq('HOF')].reset_index()
df_german_next = df_german.loc[df_german['label'].eq('HOF')].reset_index()


In [16]:
# Feature matrices for the HOF-only subsets (inputs to the task-2/3 models).
feat_english_23, feat_german_23, feat_hindi_23 = (
    np.array(get_features(hof_frame))
    for hof_frame in (df_english_next, df_german_next, df_hindi_next)
)


In [17]:
# Task-2 decoding table: numeric class id -> submission label.
convert_reverse_label_task2 = {0.0: 'HATE', 1.0: 'PRFN', 2.0: 'OFFN'}


def convert_submission_task2(list1):
    """Translate task-2 class ids into their label strings.

    Each element of ``list1`` is looked up in
    ``convert_reverse_label_task2``; an id that is not a key of that
    table raises KeyError.
    """
    return [convert_reverse_label_task2[class_id] for class_id in list1]


In [18]:
# Task 2: classify the HOF-only posts and decode the numeric predictions
# into 'HATE' / 'PRFN' / 'OFFN' labels.
eng_task2_predict, ger_task2_predict, hin_task2_predict = (
    convert_submission_task2(list(model.predict(features)))
    for model, features in (
        (eng_task2_model, feat_english_23),
        (ger_task2_model, feat_german_23),
        (hin_task2_model, feat_hindi_23),
    )
)


In [32]:
# Intermediate task-2 tables: (text_id, result) for the HOF-only posts.
eng_intm_task2, hin_intm_task2, ger_intm_task2 = (
    pd.DataFrame(
        list(zip(list(hof_frame['text_id']), labels)),
        columns=['text_id', 'result'],
    )
    for hof_frame, labels in (
        (df_english_next, eng_task2_predict),
        (df_hindi_next, hin_task2_predict),
        (df_german_next, ger_task2_predict),
    )
)


In [33]:
def get_actual_result(df1,df2):
    """Combine coarse predictions with fine-grained ones into one table.

    Arguments
    ---------
    df1: frame with 'text_id' and 'result' columns holding the coarse
         prediction for every post
    df2: frame with 'text_id' and 'result' columns holding the
         fine-grained prediction for the posts that were not 'NOT'

    Returns
    -------
    A DataFrame with columns ['text_id', 'result'], one row per row of
    df1 in the same order. Posts whose df1 result is 'NOT' get the
    string 'None'; every other post gets its result from df2. If a
    text_id occurs several times in df2, its first occurrence is used.

    Raises
    ------
    KeyError: a post that is not 'NOT' in df1 has no row in df2.
    """
    # Build the lookup once instead of filtering df2 for every row of df1.
    # setdefault keeps the first occurrence of a duplicated text_id.
    fine_label_by_id = {}
    for text_id, fine_label in zip(df2['text_id'], df2['result']):
        fine_label_by_id.setdefault(text_id, fine_label)

    list_id = []
    list_predict = []
    for text_id, coarse_label in zip(df1['text_id'], df1['result']):
        list_id.append(text_id)
        if coarse_label == 'NOT':
            list_predict.append('None')
        else:
            list_predict.append(fine_label_by_id[text_id])

    return pd.DataFrame(list(zip(list_id, list_predict)),
                        columns=['text_id', 'result'])

        


In [35]:
# Task-2 submissions: every test post, with 'None' for posts that task 1
# labelled 'NOT' and the task-2 label for the rest.
eng_submit_task2, hin_submit_task2, ger_submit_task2 = (
    get_actual_result(task1_frame, task2_frame)
    for task1_frame, task2_frame in (
        (eng_submit_task1, eng_intm_task2),
        (hin_submit_task1, hin_intm_task2),
        (ger_submit_task1, ger_intm_task2),
    )
)



In [38]:
# Write the task-2 submissions as tab-separated files, without the index column.
for submission_frame, output_name in (
    (eng_submit_task2, "HateMonitors_english_task_2_run_1.tsv"),
    (ger_submit_task2, "HateMonitors_germany_task_2_run_1.tsv"),
    (hin_submit_task2, "HateMonitors_hindi_task_2_run_1.tsv"),
):
    submission_frame.to_csv(output_name, sep='\t', index=False)

In [39]:
# Task-3 decoding table: numeric class id -> submission label.
convert_reverse_label_task3 = {0: 'TIN', 1: 'UNT'}


def convert_submission_task3(list1):
    """Translate task-3 class ids into their label strings.

    Each element of ``list1`` is looked up in
    ``convert_reverse_label_task3``; an id that is not a key of that
    table raises KeyError.
    """
    return [convert_reverse_label_task3[class_id] for class_id in list1]



In [40]:
# Task 3: classify the HOF-only posts and decode the numeric predictions
# into 'TIN' / 'UNT' labels. German is skipped: the notebook loads no
# German task-3 model.
eng_task3_predict, hin_task3_predict = (
    convert_submission_task3(list(model.predict(features)))
    for model, features in (
        (eng_task3_model, feat_english_23),
        (hin_task3_model, feat_hindi_23),
    )
)

In [77]:
# Intermediate task-3 tables: (text_id, result) for the HOF-only posts.
# No German table is built because there are no German task-3 predictions.
eng_intm_task3, hin_intm_task3 = (
    pd.DataFrame(
        list(zip(list(hof_frame['text_id']), labels)),
        columns=['text_id', 'result'],
    )
    for hof_frame, labels in (
        (df_english_next, eng_task3_predict),
        (df_hindi_next, hin_task3_predict),
    )
)

In [78]:
# Build the task-3 submission tables before writing them. The notebook never
# defined eng_submit_task3 / hin_submit_task3 (only the *_intm_task3 frames),
# so this cell raised NameError on a fresh kernel.
# NOTE(review): rebuilt the same way as task 2, so posts that task 1 labelled
# 'NOT' get the string 'None' -- confirm this is the expected task-3 label.
eng_submit_task3 = get_actual_result(eng_submit_task1, eng_intm_task3)
hin_submit_task3 = get_actual_result(hin_submit_task1, hin_intm_task3)

# Write the task-3 submissions as tab-separated files, without the index
# column. German is skipped: there are no German task-3 predictions.
eng_submit_task3.to_csv("HateMonitors_english_task_3_run_1.tsv",sep='\t',index=False)
hin_submit_task3.to_csv("HateMonitors_hindi_task_3_run_1.tsv",sep='\t',index=False)