In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from scipy.sparse import hstack
import matplotlib.pyplot as plt 
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

from keras.layers import Dense,Input,LSTM,Bidirectional,Activation,Conv1D,GRU
from keras.callbacks import Callback
from keras.layers import Dropout,Embedding,GlobalMaxPooling1D, MaxPooling1D, Add, Flatten
from keras.preprocessing import text, sequence
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.models import Model
from keras.optimizers import Adam
from keras.utils import to_categorical

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# Show what is available under ../input/ so the data paths used below can be checked.
input_listing = check_output(["ls", "../input/"])
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

In [None]:
import chardet

# One path constant for both the encoding probe and the CSV read, so the two
# steps cannot drift apart (the placeholders previously differed in letter case).
DATA_FILE = "<DataFile>"
# Fixed seed so the shuffle, and therefore the positional train/validation
# split taken later in processinput, is the same on every run.
SHUFFLE_SEED = 233

# Detect the file encoding from the raw bytes before handing it to pandas.
with open(DATA_FILE, 'rb') as f:
    result = chardet.detect(f.read())  # or readline if the file is large

train_df_org = pd.read_csv(DATA_FILE, encoding=result['encoding'])
# Shuffle all rows once and renumber them 0..n-1.
train_df_org = train_df_org.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [None]:
EMBEDDING_FILE = '<DataFolder>/glove-840b-300d/glove.840B.300d.txt'
# Number of coefficients expected after the token on each line (the "300d" file).
EMBEDDING_DIM = 300

# Map each token to its float32 coefficient vector.
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf8') as f:
    for line in f:
        # Split from the right at most EMBEDDING_DIM times so that everything
        # left of the coefficients stays together as the token. Without the
        # limit, rsplit(' ') behaves like split(' ') and a token that itself
        # contains a space would push text into the float conversion below.
        # NOTE(review): the 840B GloVe file is reported to contain such tokens - confirm.
        values = line.rstrip().rsplit(' ', EMBEDDING_DIM)
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs


In [None]:

def processinput(train_df, catname='Area', val_size=1000):
    """Clean the raw frame, encode the target and split off a validation slice.

    Rows whose ``catname`` value is missing are dropped and the index is reset
    to 0..n-1. The target column is converted to a pandas Categorical, its
    integer codes are stored in ``'AreaCode'``, and ``'DescSum'`` is built as
    ``Summary + ' ' + Description``.

    Args:
        train_df: DataFrame with at least ``catname``, ``'Summary'`` and
            ``'Description'`` columns. It is not modified.
        catname: Name of the target column to encode.
        val_size: Number of leading rows to hold out for validation
            (defaults to 1000, the value that used to be hard-coded).

    Returns:
        ``(train_ret, val_ret)``: the rows after the first ``val_size`` and the
        first ``val_size`` rows, respectively. ``val_ret`` keeps index
        0..val_size-1.
    """
    train_df = train_df.dropna(subset=[catname]).reset_index(drop=True)
    train_df[catname] = pd.Categorical(train_df[catname])
    train_df['AreaCode'] = train_df[catname].cat.codes
    # A missing Summary or Description would turn the whole concatenation into
    # NaN; treat a missing part as empty text so DescSum is always a string.
    train_df['DescSum'] = train_df['Summary'].fillna('') + ' ' + train_df['Description'].fillna('')
    val_ret = train_df[:val_size]
    train_ret = train_df[val_size:]
    return train_ret, val_ret


def processtokenizer(ycat = 'Area', xcat = 'DescSum'):
    """Prepare padded token sequences and one-hot labels for one experiment.

    Splits the module-level ``train_df_org`` with ``processinput``, fits a
    Keras tokenizer on the training text column ``xcat`` and pads the
    resulting sequences.

    Args:
        ycat: Target column name passed to ``processinput``.
        xcat: Text column to tokenize. ``'DescSum'``, ``'Summary'`` and
            ``'Description'`` have dedicated vocabulary/length settings; any
            other value uses the same settings as ``'Summary'``.

    Returns:
        Tuple ``(train, val, categories, train_x_pad, train_y_bin, numofcat,
        tokenizer, max_features, maxlen, embed_size)``.
    """
    train_o, val_o  = processinput(train_df_org,catname = ycat)
    # Missing text would reach the tokenizer as a float NaN; use an empty
    # string instead and make sure every entry is a str.
    train_text_data_o = train_o[xcat].fillna('').astype(str)
    train_y_o = train_o['AreaCode']
    AreaCategories_o = np.array(train_o[ycat].cat.categories)

    # Size the one-hot labels from the full category list rather than from the
    # largest code present in the training rows: the categories were computed
    # before the train/validation split, so a category seen only in the
    # validation rows must still get an output column.
    numofcat_o = len(AreaCategories_o)
    train_y_bin_o = to_categorical(train_y_o, num_classes=numofcat_o)

    # (max_features, maxlen, embed_size) per text column.
    settings = {
        'DescSum': (35000, 300, 300),
        'Summary': (10000, 150, 300),
        'Description': (30000, 250, 300),
    }
    max_features_o, maxlen_o, embed_size_o = settings.get(xcat, (10000, 150, 300))

    tokenizer_o = text.Tokenizer(num_words=max_features_o, lower=True)
    tokenizer_o.fit_on_texts(list(train_text_data_o))

    train_x_o = tokenizer_o.texts_to_sequences(train_text_data_o)
    train_x_pad_o = sequence.pad_sequences(train_x_o, maxlen=maxlen_o)

    return train_o, val_o, AreaCategories_o, train_x_pad_o, train_y_bin_o, numofcat_o, tokenizer_o,  max_features_o, maxlen_o, embed_size_o

def deepmodel(maxlen_o, embed_size_o, num_words_o, embedding_matrix_o, numofcat_o ):
    """Build and compile the text classifier used by this notebook.

    Architecture: frozen pre-trained embedding -> spatial dropout ->
    bidirectional LSTM -> two Conv1D layers -> concatenated global average
    and global max pooling -> dense output layer.

    The targets built elsewhere in this notebook come from ``to_categorical``,
    i.e. exactly one class per sample, so the output layer uses softmax with
    categorical cross-entropy. With a sigmoid/binary cross-entropy head the
    'accuracy' metric is computed per output unit and looks much better than
    the real per-sample class accuracy.

    Args:
        maxlen_o: Length of the padded input sequences.
        embed_size_o: Dimension of each embedding vector.
        num_words_o: Number of rows in the embedding matrix.
        embedding_matrix_o: Initial (frozen) embedding weights, expected to be
            of shape ``(num_words_o, embed_size_o)``.
        numofcat_o: Number of output classes.

    Returns:
        The compiled Keras ``Model``.
    """
    inp = Input(shape=(maxlen_o,))
    # Embedding weights are fixed to the supplied matrix and not trained.
    x = Embedding(num_words_o, embed_size_o, weights=[embedding_matrix_o], trainable=False)(inp)
    x = SpatialDropout1D(0.10)(x)
    x = Bidirectional(LSTM(128, return_sequences=True, dropout=0.10, recurrent_dropout=0.10))(x)
    x = Conv1D(64, kernel_size=3, padding='valid', kernel_initializer='glorot_uniform')(x)
    x = Conv1D(64, kernel_size=3, padding='valid', kernel_initializer='glorot_uniform')(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    x = concatenate([avg_pool, max_pool])
    # One probability per class, summing to 1 across classes.
    out = Dense(numofcat_o, activation='softmax')(x)
    model = Model(inp, out)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def predictarea(txt_o,tokenizer_o, model_o,maxlen_o,val_df,noofcat_o):
    """Predict class scores for validation text and print evaluation metrics.

    Args:
        txt_o: Iterable of validation texts (a pandas Series in this notebook).
        tokenizer_o: Fitted tokenizer providing ``texts_to_sequences``.
        model_o: Compiled model providing ``predict`` and ``evaluate``.
        maxlen_o: Length to which the token sequences are padded.
        val_df: Validation frame; its ``'AreaCode'`` column supplies the true
            class codes, in the same row order as ``txt_o``.
        noofcat_o: Number of classes used to one-hot encode the true codes.

    Returns:
        The array returned by ``model_o.predict`` for the padded sequences.
    """
    # Missing text would reach the tokenizer as a float NaN; replace it with
    # an empty string and make sure every entry is a str.
    clean_txt = pd.Series(txt_o).fillna('').astype(str).tolist()
    txt_y = tokenizer_o.texts_to_sequences(clean_txt)
    txt_y_pad = sequence.pad_sequences(txt_y, maxlen=maxlen_o)
    # NOTE(review): batch_size=1 is kept from the original; a larger batch
    # would likely predict faster.
    prediction = model_o.predict(txt_y_pad,batch_size=1,verbose=1)
    y_val = val_df['AreaCode']
    y_val_bin = to_categorical(y_val, num_classes=noofcat_o)
    # evaluate() returns the loss followed by the metrics the model was compiled with.
    test_metrics = model_o.evaluate(x=txt_y_pad, y=y_val_bin, batch_size=10, verbose=1)
    print(test_metrics)
    return prediction

def predtodf(predictions,val_df, AreaCategories_o):
    """Collect the three highest-scoring classes per row into a DataFrame.

    Args:
        predictions: 2-D array of scores, one row per validation sample and
            one column per class.
        val_df: Validation frame with the same number of rows, in the same
            order, as ``predictions``. Must contain ``'Description'``,
            ``'Area'``, ``'Sub Area'``, ``'AreaCode'``, ``'SR #'`` and
            ``'Summary'``.
        AreaCategories_o: Array mapping class index to class label.

    Returns:
        DataFrame with one row per sample: column ``0`` holds the indices of
        the three best classes in ascending score order (best last),
        ``'Prob'`` their scores, ``'cat'`` their labels, followed by the
        metadata copied from ``val_df``.
    """
    # argsort is ascending, so the last three columns are the top three classes.
    Pred_arg = np.argsort(predictions)[:,-3:]
    # Wrap each row so the frame gets a single column whose cells are arrays.
    Pred_results = pd.DataFrame([[top] for top in Pred_arg])
    # Indexing with a list-wrapped array keeps each cell as a (1, 3) array,
    # exactly as in the original output.
    Pred_results['Prob'] = [predictions[i,[Pred_arg[i]]] for i in range(len(Pred_arg))]
    Pred_results['cat'] = [AreaCategories_o[top] for top in Pred_arg]
    # Attach the metadata by position, not by index label: predictions carry no
    # index, so label alignment would silently produce NaN whenever val_df is
    # not indexed 0..n-1.
    column_map = (
        ('Description', 'Description'),
        ('trueArea', 'Area'),
        ('trueSubArea', 'Sub Area'),
        ('truelabelcat', 'AreaCode'),
        ('SR', 'SR #'),
        ('Summary', 'Summary'),
    )
    for out_col, src_col in column_map:
        Pred_results[out_col] = val_df[src_col].to_numpy()
    return Pred_results

def trainandgetpred(ycat_o, xcat_o):
    """Run one full experiment: prepare data, train, and score the hold-out rows.

    Args:
        ycat_o: Target column name (forwarded to ``processtokenizer``).
        xcat_o: Text column used as model input.

    Returns:
        The DataFrame produced by ``predtodf`` for the validation rows.
    """
    prepared = processtokenizer(ycat_o, xcat_o)
    (train, val, AreaCategories, train_x_pad, train_y_bin,
     numofcat, tokenizer, max_features, maxlen, embed_size) = prepared

    # Embedding matrix: row i holds the pre-trained vector of the token with
    # tokenizer index i; tokens without a pre-trained vector keep a zero row.
    vocab = tokenizer.word_index
    num_words = min(max_features, len(vocab) + 1)
    embedding_matrix = np.zeros((num_words, embed_size))
    for token, row in vocab.items():
        if row < max_features:
            vector = embeddings_index.get(token)
            if vector is not None:
                embedding_matrix[row] = vector

    model = deepmodel(maxlen, embed_size, num_words, embedding_matrix, numofcat)

    # Only the 99% training part of this split is used; fit() then reserves
    # 10% of it for validation through validation_split.
    X_tra, _X_unused, y_tra, _y_unused = train_test_split(
        train_x_pad, train_y_bin, train_size=0.99, random_state=233
    )
    model.fit(X_tra, y_tra, batch_size=10, epochs=5, validation_split=0.1, verbose=1)

    # Score the rows held out by processinput and tabulate the top predictions.
    predresults = predictarea(val[xcat_o], tokenizer, model, maxlen, val, numofcat)
    return predtodf(predresults, val, AreaCategories)



In [None]:
# Experiment: predict 'Area' from the combined summary + description text.
Area_DescSum = trainandgetpred(ycat_o='Area', xcat_o='DescSum')
Area_DescSum.to_csv(path_or_buf='Area_DescSum.csv')


In [None]:
# Experiment: predict 'Area' from the 'Description' text only.
Area_Desc = trainandgetpred(ycat_o='Area', xcat_o='Description')
Area_Desc.to_csv(path_or_buf='Area_Desc.csv')

In [None]:
# Experiment: predict 'Area' from the 'Summary' text only.
Area_Sum = trainandgetpred(ycat_o='Area', xcat_o='Summary')
Area_Sum.to_csv(path_or_buf='Area_Sum.csv')

In [None]:
# Experiment: predict 'Sub Area' from the combined summary + description text.
SArea_DescSum = trainandgetpred(ycat_o='Sub Area', xcat_o='DescSum')
SArea_DescSum.to_csv(path_or_buf='SArea_DescSum.csv')

In [None]:
# Experiment: predict 'Sub Area' from the 'Description' text only.
SArea_Desc = trainandgetpred(ycat_o='Sub Area', xcat_o='Description')
SArea_Desc.to_csv(path_or_buf='SArea_Desc.csv')

In [None]:
# Experiment: predict 'Sub Area' from the 'Summary' text only.
SArea_Sum = trainandgetpred(ycat_o='Sub Area', xcat_o='Summary')
SArea_Sum.to_csv(path_or_buf='SArea_Sum.csv')