**Notebook Objective:**   Explore data and build base model. 

**Problem Statement:** [Coreference Resolution] You are provided with the pronoun and two candidate names to which the pronoun could refer. 
    You must create an algorithm capable of deciding whether the pronoun refers to name A, name B, or neither.
    
Data Files:
* test_stage_1.tsv - the test set data for stage 1
* sample_submission_stage_1.csv - a file showing the correct submission format for stage 1

Columns:
* ID - Unique identifier for an example (Matches to Id in output file format)
* Text - Text containing the ambiguous pronoun and two candidate names (about a paragraph in length)
* Pronoun - The target pronoun (text)
* Pronoun-offset - The character offset of Pronoun in Text
* A - The first name candidate (text)
* A-offset - The character offset of name A in Text
* B - The second name candidate
* B-offset - The character offset of name B in Text
* URL - The URL of the source Wikipedia page for the example

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files for this notebook are read from the local "./gap" directory.
# Running this cell (by clicking run or pressing Shift+Enter) lists the files available there.

import os
print(os.listdir("./gap"))

# Any results you write to the current directory are saved as output.

['constants.py', 'CONTRIBUTING.md', 'gap-development.tsv', 'gap-test.tsv', 'gap-validation.tsv', 'gap_scorer.py', 'LICENSE', 'README.md', 'sample_submission_stage_1.csv', 'test_stage_1.tsv', 'test_stage_1.tsv.zip']


* test_stage_1.tsv - the stage 1 test set (the rows to predict on, not a training set)
* sample_submission_stage_1.csv - submission file format

In [2]:
# Read the tab-separated stage-1 test rows and the sample submission template.
# The candidate-name columns 'A'/'B' become 'A_Noun'/'B_Noun'
# (presumably to keep them apart from the 'A'/'B' probability columns of the submission).
test_df = pd.read_csv("./gap/test_stage_1.tsv", sep='\t')
test_df = test_df.rename(columns={'A': 'A_Noun', 'B': 'B_Noun'})
submission = pd.read_csv('./gap/sample_submission_stage_1.csv')

for label, frame in (("Test shape : ", test_df), ("Submission shape : ", submission)):
    print(label, frame.shape)

Test shape :  (2000, 9)
Submission shape :  (2000, 4)


In [3]:
# Peek at the data
test_df.head()

# You are provided with the pronoun and two candidate names to which the pronoun could refer.
# You must create an algorithm capable of deciding whether the pronoun refers to name A, name B, or neither.

Unnamed: 0,ID,Text,Pronoun,Pronoun-offset,A_Noun,A-offset,B_Noun,B-offset,URL
0,development-1,Zoe Telford -- played the police officer girlf...,her,274,Cheryl Cassidy,191,Pauline,207,http://en.wikipedia.org/wiki/List_of_Teachers_...
1,development-2,"He grew up in Evanston, Illinois the second ol...",His,284,MacKenzie,228,Bernard Leach,251,http://en.wikipedia.org/wiki/Warren_MacKenzie
2,development-3,"He had been reelected to Congress, but resigne...",his,265,Angeloz,173,De la Sota,246,http://en.wikipedia.org/wiki/Jos%C3%A9_Manuel_...
3,development-4,The current members of Crime have also perform...,his,321,Hell,174,Henry Rosenthal,336,http://en.wikipedia.org/wiki/Crime_(band)
4,development-5,Her Santa Fe Opera debut in 2005 was as Nuria ...,She,437,Kitty Oppenheimer,219,Rivera,294,http://en.wikipedia.org/wiki/Jessica_Rivera


In [4]:
# The sample submission rows shown here carry a 1/3 probability in each of the A / B / NEITHER columns
submission.head()

Unnamed: 0,ID,A,B,NEITHER
0,development-1,0.33333,0.33333,0.33333
1,development-2,0.33333,0.33333,0.33333
2,development-3,0.33333,0.33333,0.33333
3,development-4,0.33333,0.33333,0.33333
4,development-5,0.33333,0.33333,0.33333


In [5]:
# Check for null values: info() reports the non-null count and dtype of every column
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 9 columns):
ID                2000 non-null object
Text              2000 non-null object
Pronoun           2000 non-null object
Pronoun-offset    2000 non-null int64
A_Noun            2000 non-null object
A-offset          2000 non-null int64
B_Noun            2000 non-null object
B-offset          2000 non-null int64
URL               2000 non-null object
dtypes: int64(3), object(6)
memory usage: 140.7+ KB


In [6]:
# Count missing values per column (a result of all zeros means no nulls in the data)
test_df.isna().sum()

ID                0
Text              0
Pronoun           0
Pronoun-offset    0
A_Noun            0
A-offset          0
B_Noun            0
B-offset          0
URL               0
dtype: int64

In [7]:
# Labelled training data comes from the public GAP repository on GitHub:
# its "test" and "validation" splits are downloaded and stacked into one frame,
# and the candidate-name columns are renamed to A_Noun / B_Noun.

gh_test = pd.read_csv(
    "https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-test.tsv",
    sep='\t',
)
gh_valid = pd.read_csv(
    "https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-validation.tsv",
    sep='\t',
)
train = pd.concat([gh_test, gh_valid], ignore_index=True).rename(columns={'A': 'A_Noun', 'B': 'B_Noun'})
train.shape

(2454, 11)

In [8]:
# Preview the first rows of the combined training frame
train.head()

Unnamed: 0,ID,Text,Pronoun,Pronoun-offset,A_Noun,A-offset,A-coref,B_Noun,B-offset,B-coref,URL
0,test-1,Upon their acceptance into the Kontinental Hoc...,His,383,Bob Suter,352,False,Dehner,366,True,http://en.wikipedia.org/wiki/Jeremy_Dehner
1,test-2,"Between the years 1979-1981, River won four lo...",him,430,Alonso,353,True,Alfredo Di St*fano,390,False,http://en.wikipedia.org/wiki/Norberto_Alonso
2,test-3,Though his emigration from the country has aff...,He,312,Ali Aladhadh,256,True,Saddam,295,False,http://en.wikipedia.org/wiki/Aladhadh
3,test-4,"At the trial, Pisciotta said: ``Those who have...",his,526,Alliata,377,False,Pisciotta,536,True,http://en.wikipedia.org/wiki/Gaspare_Pisciotta
4,test-5,It is about a pair of United States Navy shore...,his,406,Eddie,421,True,Rock Reilly,559,False,http://en.wikipedia.org/wiki/Chasers


In [9]:
#replace function

def name_replace(s, r1, r2):
    """Replace a name, and each of its whitespace-separated parts, in a text.

    The full name ``r1`` is replaced first, then every part of it (e.g. first
    name and last name), so partial mentions are rewritten to ``r2`` as well.

    Only whole-word occurrences are replaced: a part such as "Hell" is no
    longer rewritten inside "Hello". Empty parts (from repeated or trailing
    spaces in ``r1``) are skipped; replacing the empty string would insert
    ``r2`` between every character of the text.

    Args:
        s: Text to rewrite; converted with ``str()`` first.
        r1: Name to search for (a string).
        r2: Placeholder inserted verbatim for every match.

    Returns:
        str: The rewritten text.
    """
    import re  # local import keeps the helper self-contained

    def _swap(text, needle):
        # Lookarounds rather than \b so that names starting or ending with
        # punctuation still match, and "Dehner's" still matches "Dehner".
        pattern = r'(?<!\w)' + re.escape(needle) + r'(?!\w)'
        # A callable replacement inserts r2 literally (no backslash processing).
        return re.sub(pattern, lambda _match: r2, text)

    s = str(s)
    if not r1.strip():
        # Nothing to search for.
        return s

    s = _swap(s, r1)
    # split() without arguments never yields empty strings.
    for part in r1.split():
        s = _swap(s, part)
    return s

In [None]:
# Different features extraction (exploratory scratch work on the training data).
# Work on a copy: a plain `df = train` alias would silently add columns to `train`.

df = train.copy()

# Minimum start offset in a row (pronoun, candidate A, candidate B)

df['section_min'] = df[['Pronoun-offset', 'A-offset', 'B-offset']].min(axis=1)

In [None]:
# End offsets (start offset + mention length) for the pronoun and both candidates.
# 'Pronoun-offset2' has to be created here: section_max below selects it, and
# selecting a column that does not exist raises KeyError.
df['Pronoun-offset2'] = df['Pronoun-offset'] + df['Pronoun'].map(len)
df['A-offset2'] = df['A-offset'] + df['A_Noun'].map(len)
df['B-offset2'] = df['B-offset'] + df['B_Noun'].map(len)
# Largest end offset among the three mentions in each row
df['section_max'] = df[['Pronoun-offset2', 'A-offset2', 'B-offset2']].max(axis=1)
df.head()

In [10]:
def name_replace(s, r1, r2):
    """Replace a name, and each of its whitespace-separated parts, in a text.

    The full name ``r1`` is replaced first, then every part of it (first name,
    last name, ...), so partial mentions are rewritten to ``r2`` as well.

    Only whole-word occurrences are replaced: a part such as "Hell" is no
    longer rewritten inside "Hello". Empty parts (from repeated or trailing
    spaces in ``r1``) are skipped; replacing the empty string would insert
    ``r2`` between every character of the text.

    Args:
        s: Text to rewrite; converted with ``str()`` first.
        r1: Name to search for (a string).
        r2: Placeholder inserted verbatim for every match.

    Returns:
        str: The rewritten text.
    """
    import re  # local import keeps the helper self-contained

    def _swap(text, needle):
        # Lookarounds rather than \b so that names starting or ending with
        # punctuation still match, and "Dehner's" still matches "Dehner".
        pattern = r'(?<!\w)' + re.escape(needle) + r'(?!\w)'
        # A callable replacement inserts r2 literally (no backslash processing).
        return re.sub(pattern, lambda _match: r2, text)

    s = str(s)
    if not r1.strip():
        # Nothing to search for.
        return s

    s = _swap(s, r1)
    # First-name / last-name replace; split() never yields empty strings.
    for part in r1.split():
        s = _swap(s, part)
    return s

def get_features(df):
    """Return a copy of ``df`` with offset features added and names anonymised.

    The input frame is not modified; all work happens on a copy, so calling
    this function repeatedly on the same raw frame gives the same result.

    Columns added:
        section_min: smallest of the pronoun / A / B start offsets.
        Pronoun-offset2, A-offset2, B-offset2: start offset plus the length
            of the corresponding mention string.
        section_max: largest of those three end offsets.
        A-dist, B-dist: absolute difference between the pronoun start offset
            and the candidate start offset.

    ``Text`` is rewritten with ``name_replace``: mentions of ``A_Noun`` become
    'subjectone', then mentions of ``B_Noun`` become 'subjecttwo'. The offset
    columns are computed from the original columns and in general no longer
    index into the rewritten text.

    Args:
        df: DataFrame with the columns Text, Pronoun, Pronoun-offset,
            A_Noun, A-offset, B_Noun and B-offset.

    Returns:
        A new DataFrame holding the original columns plus the features above.
    """
    df = df.copy()  # never mutate the caller's frame

    df['section_min'] = df[['Pronoun-offset', 'A-offset', 'B-offset']].min(axis=1)
    df['Pronoun-offset2'] = df['Pronoun-offset'] + df['Pronoun'].map(len)
    df['A-offset2'] = df['A-offset'] + df['A_Noun'].map(len)
    df['B-offset2'] = df['B-offset'] + df['B_Noun'].map(len)
    df['section_max'] = df[['Pronoun-offset2', 'A-offset2', 'B-offset2']].max(axis=1)

    # Disabled experiment: replace the pronoun itself with a 'pronountarget' marker.
    #df['Text'] = df.apply(lambda r: r['Text'][: r['Pronoun-offset']] + 'pronountarget' + r['Text'][r['Pronoun-offset'] + len(str(r['Pronoun'])): ], axis=1)
    df['Text'] = df.apply(lambda r: name_replace(r['Text'], r['A_Noun'], 'subjectone'), axis=1)
    df['Text'] = df.apply(lambda r: name_replace(r['Text'], r['B_Noun'], 'subjecttwo'), axis=1)

    df['A-dist'] = (df['Pronoun-offset'] - df['A-offset']).abs()
    df['B-dist'] = (df['Pronoun-offset'] - df['B-offset']).abs()
    return df

# Build the feature frames for the labelled training data and for the stage-1 test data
train = get_features(train)
test = get_features(test_df)

In [11]:
# Inspect the training frame after feature extraction
train.head()

Unnamed: 0,ID,Text,Pronoun,Pronoun-offset,A_Noun,A-offset,A-coref,B_Noun,B-offset,B-coref,URL,section_min,Pronoun-offset2,A-offset2,B-offset2,section_max,A-dist,B-dist
0,test-1,Upon their acceptance into the Kontinental Hoc...,His,383,Bob Suter,352,False,Dehner,366,True,http://en.wikipedia.org/wiki/Jeremy_Dehner,352,386,361,372,386,31,17
1,test-2,"Between the years 1979-1981, River won four lo...",him,430,Alonso,353,True,Alfredo Di St*fano,390,False,http://en.wikipedia.org/wiki/Norberto_Alonso,353,433,359,408,433,77,40
2,test-3,Though his emigration from the country has aff...,He,312,Ali Aladhadh,256,True,Saddam,295,False,http://en.wikipedia.org/wiki/Aladhadh,256,314,268,301,314,56,17
3,test-4,"At the trial, subjecttwo said: ``Those who hav...",his,526,Alliata,377,False,Pisciotta,536,True,http://en.wikipedia.org/wiki/Gaspare_Pisciotta,377,529,384,545,545,149,10
4,test-5,It is about a pair of United States Navy shore...,his,406,Eddie,421,True,Rock Reilly,559,False,http://en.wikipedia.org/wiki/Chasers,406,409,426,570,570,15,153


In [12]:
# Loading NLP libraries
import spacy
from spacy import displacy
# Load the 'en_core_web_sm' spaCy pipeline; `nlp` is what the cells below call to parse text
nlp = spacy.load('en_core_web_sm')
import nltk

In [18]:
# Sanity-check the possessive-count feature on the first training text
s = train['Text'][0]
w = "subjecttwo"  # placeholder that get_features writes for candidate B

doc = nlp(str(s))
tokens = pd.DataFrame([[token.text, token.dep_] for token in doc], columns=['text', 'dep'])
# Number of tokens equal to the placeholder that carry the possessive ('poss') dependency label
len(tokens[((tokens['text']==w) & (tokens['dep']=='poss'))])

1

In [14]:
# First few tokens of the parsed text with their dependency labels
tokens.head()

Unnamed: 0,text,dep
0,Upon,prep
1,their,poss
2,acceptance,pobj
3,into,prep
4,the,det


In [15]:
# Show the full text held in `s` (the first training row, after name replacement)
s

"Upon their acceptance into the Kontinental Hockey League, subjecttwo left Finland to sign a contract in Germany with EHC M*nchen of the DEL on June 18, 2014. After capturing the German championship with the M*nchen team in 2016, he left the club and was picked up by fellow DEL side EHC Wolfsburg in July 2016. Former NHLer Gary subjectone and Olympic-medalist subjectone are subjecttwo's uncles. His cousin is Minnesota Wild's alternate captain Ryan subjectone."

In [None]:
def get_nlp_features(s, w):
    """Count tokens of ``s`` that equal ``w`` and are tagged as possessive.

    Args:
        s: Text to analyse; converted with ``str()`` and parsed by the
            module-level ``nlp`` pipeline.
        w: Exact token text to look for.

    Returns:
        int: Number of tokens whose text is ``w`` and whose dependency label
        is 'poss'.
    """
    parsed = nlp(str(s))
    return sum(1 for tok in parsed if tok.text == w and tok.dep_ == 'poss')

# Possessive-mention counts for each candidate placeholder, added to both frames
for frame in (train, test):
    for column, placeholder in (('A-poss', 'subjectone'), ('B-poss', 'subjecttwo')):
        frame[column] = frame['Text'].map(lambda text, token=placeholder: get_nlp_features(text, token))

In [None]:
# Turn the coreference flags into integer target columns 'A' and 'B',
# then derive NEITHER as whatever is left of 1.0 once both are subtracted.
train = train.rename(columns={'A-coref': 'A', 'B-coref': 'B'})
for target in ('A', 'B'):
    train[target] = train[target].astype(int)
train['NEITHER'] = 1.0 - train[['A', 'B']].sum(axis=1)

In [None]:
# Note: a cell only displays its last expression, so the head() result below is not shown
train.head()
# Distinct pronoun strings present in the training data
train['Pronoun'].unique()

In [None]:
# Distinct pronoun strings present in the test data
test['Pronoun'].unique()

In [None]:
from sklearn import *
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost import XGBClassifier

# Numeric feature columns fed to the models
col = [
    'Pronoun-offset', 'A-offset', 'B-offset',
    'section_min', 'Pronoun-offset2', 'A-offset2', 'B-offset2', 'section_max',
    'A-poss', 'B-poss',
    'A-dist', 'B-dist',
]

# Hold out 20% of the labelled rows for validation; missing feature values become -1
features_filled = train[col].fillna(-1)
targets = train[['A', 'B', 'NEITHER']]
x1, x2, y1, y2 = model_selection.train_test_split(features_filled, targets, test_size=0.2, random_state=1)
x1.head()

In [None]:
# One-vs-rest wrapper around a random forest, trained on the three target columns
model = multiclass.OneVsRestClassifier(ensemble.RandomForestClassifier(max_depth=7, n_estimators=1000, random_state=33))
# Alternative base estimators kept for reference (disabled):
# model = multiclass.OneVsRestClassifier(ensemble.ExtraTreesClassifier(n_jobs=-1, n_estimators=100, random_state=33))

# param_dist = {'objective': 'binary:logistic', 'max_depth': 1, 'n_estimators':1000, 'num_round':1000, 'eval_metric': 'logloss'}
# model = multiclass.OneVsRestClassifier(xgb.XGBClassifier(**param_dist))

# Hold-out evaluation on the 20% validation split
model.fit(x1, y1)
print('log_loss', metrics.log_loss(y2, model.predict_proba(x2)))

# Refit on every labelled row before scoring the test set
model.fit(train[col].fillna(-1), train[['A', 'B', 'NEITHER']])
# Use the same missing-value fill (-1) that the training features received
results = model.predict_proba(test[col].fillna(-1))
test['A'] = results[:, 0]
test['B'] = results[:, 1]
test['NEITHER'] = results[:, 2]
test[['ID', 'A', 'B', 'NEITHER']].to_csv('submission.csv', index=False)

In [None]:
# Preview the training-split targets (columns A, B, NEITHER)
y1.head()

In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt

In [None]:
# params and tensor variables

learning_rate = 0.3 # step size passed to the gradient-descent optimizer
training_epochs = 1500 # number of full passes of the training loop
# Start with an empty history: np.empty(shape=[1]) would leave one
# uninitialised value in front of the real per-epoch costs.
cost_history = np.empty(shape=[0], dtype=float)  # training cost recorded after every epoch
n_dim = x1.shape[1] # Number of feature columns
print("ndim:", n_dim) # total columns for later use in model and parameters creation

n_class = 3 # output columns: A, B, NEITHER
# Checkpoint prefix, relative to the working directory so the notebook does
# not depend on one user's absolute local path.
model_path = os.path.join(".", "gap_mlp_model")

In [None]:
# Hidden layer details: number of neurons in each of the four hidden layers

n_hidden_1 = 60
n_hidden_2 = 60
n_hidden_3 = 60
n_hidden_4 = 60

x = tf.placeholder(tf.float32, [None, n_dim]) # input features: any number of rows, n_dim columns
y_ = tf.placeholder(tf.float32, [None, n_class]) # target values: any number of rows, n_class columns
# NOTE(review): W and b are not referenced by any other visible cell (the network
# uses the `weights` / `biases` dicts) - confirm whether they are still needed.
W = tf.Variable(tf.zeros([n_dim, n_class])) # weights initialized to zeros
b = tf.Variable(tf.zeros([n_class])) # biases initialized to zeros

In [None]:
# Weights and biases for each layer, drawn from tf.truncated_normal.
# Weight shapes chain input -> h1 -> h2 -> h3 -> h4 -> output.
# NOTE(review): no seed is passed to truncated_normal, so the initial values
# presumably differ between runs - confirm if reproducibility matters.

weights = {
    'h1': tf.Variable(tf.truncated_normal([n_dim, n_hidden_1])),
    'h2': tf.Variable(tf.truncated_normal([n_hidden_1, n_hidden_2])),
    'h3': tf.Variable(tf.truncated_normal([n_hidden_2, n_hidden_3])),
    'h4': tf.Variable(tf.truncated_normal([n_hidden_3, n_hidden_4])),
    'out': tf.Variable(tf.truncated_normal([n_hidden_4, n_class])),
}

biases = {
    'b1': tf.Variable(tf.truncated_normal([n_hidden_1])),
    'b2': tf.Variable(tf.truncated_normal([n_hidden_2])),
    'b3': tf.Variable(tf.truncated_normal([n_hidden_3])),
    'b4': tf.Variable(tf.truncated_normal([n_hidden_4])),
    'out': tf.Variable(tf.truncated_normal([n_class])),
}

# Op that initializes all variables; it is run once the session is created

init = tf.global_variables_initializer()

# Saver used after training to write the session's variables to a checkpoint

saver = tf.train.Saver()

In [None]:
# Define the model

def multilayer_perc(x, weights, biases):
    """Build the forward pass of a perceptron with four hidden layers.

    Every hidden layer computes ``activation(matmul(input, W) + b)``. Hidden
    layers 1-3 use a sigmoid activation and hidden layer 4 uses ReLU. The
    output layer applies no activation.

    Args:
        x: Input tensor fed to the first hidden layer.
        weights: Mapping with keys 'h1', 'h2', 'h3', 'h4' and 'out'.
        biases: Mapping with keys 'b1', 'b2', 'b3', 'b4' and 'out'.

    Returns:
        The output-layer tensor ``matmul(last_hidden, weights['out']) + biases['out']``.
    """
    # (weight key, bias key, activation) for each hidden layer, in order
    hidden_spec = (
        ('h1', 'b1', tf.nn.sigmoid),
        ('h2', 'b2', tf.nn.sigmoid),
        ('h3', 'b3', tf.nn.sigmoid),
        ('h4', 'b4', tf.nn.relu),
    )

    signal = x
    for weight_key, bias_key, activation in hidden_spec:
        pre_activation = tf.add(tf.matmul(signal, weights[weight_key]), biases[bias_key])
        signal = activation(pre_activation)

    # Linear output layer
    return tf.add(tf.matmul(signal, weights['out']), biases['out'])

# Call the model: `y` is the network's output tensor for the placeholder input `x`

y = multilayer_perc(x, weights, biases)

# Define cost function and optimizer:
# mean softmax cross-entropy between the outputs `y` (used as logits) and the
# targets `y_`, minimized with plain gradient descent at `learning_rate`.

cost_function = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_))
training_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost_function)

# Open a session and run the variable initializer.
# NOTE(review): the session is not closed in any visible cell.
sess = tf.Session()
sess.run(init)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib
# A few small adjustments to the plot appearance (seaborn style, context and palette)
sns.set(style='white', context='notebook', palette='deep')
#warnings.filterwarnings('ignore')
sns.set_style('white')
%matplotlib inline

In [None]:
# cost and accuracy - running multilayer_perceptron - training and accuracy

mse_history = []
accuracy_history = []

# Build the accuracy ops once, outside the loop. Creating them per epoch adds
# new nodes to the TensorFlow graph on every iteration.
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy_op = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# Validation targets as a plain float array for the NumPy MSE below
y2_values = np.asarray(y2, dtype=float)

for epoch in range(training_epochs):
    # One gradient-descent step on the full training split, then re-evaluate the cost
    sess.run(training_step, feed_dict={x: x1, y_: y1})
    cost = sess.run(cost_function, feed_dict={x: x1, y_: y1})
    cost_history = np.append(cost_history, cost)

    # Mean squared error between the raw network outputs and the validation targets
    pred_y = sess.run(y, feed_dict={x: x2})
    mse_ = np.mean(np.square(pred_y - y2_values))
    mse_history.append(mse_)

    # Accuracy on the training split
    accuracy = sess.run(accuracy_op, feed_dict={x: x1, y_: y1})
    accuracy_history.append(accuracy)

    # Print the evaluated MSE value (mse_), not a tensor object
    print('epoch ', epoch, '-cost ', cost, '-mse', mse_, '-Train Accuracy', accuracy)

save_path = saver.save(sess, model_path)
print('Model Saved in file: %s' % save_path)

In [None]:
# plot mse and accuracy graph (one labelled figure each, so they can be read on their own)

fig, ax = plt.subplots()
ax.plot(mse_history, 'r')
ax.set(title='MSE on the hold-out split per epoch', xlabel='epoch', ylabel='MSE')
plt.show()

fig, ax = plt.subplots()
ax.plot(accuracy_history)
ax.set(title='Accuracy on the training split per epoch', xlabel='epoch', ylabel='accuracy')
plt.show()

In [None]:
# Print the final accuracy.
# Although labelled 'Test Accuracy', this is measured on the hold-out split
# (x2, y2) of the labelled data, not on the competition test set.

correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
print('Test Accuracy: ', (sess.run(accuracy, feed_dict={x: x2, y_: y2})))

In [None]:
# Print the final mse: mean squared difference between the network outputs
# for the hold-out features x2 and the hold-out targets y2.
# NOTE(review): `y` is the un-activated output layer, so this compares raw
# scores (not probabilities) with the targets - confirm that is intended.

pred_y = sess.run(y, feed_dict={x: x2})
mse = tf.reduce_mean(tf.square(pred_y - y2))
print('MSE: %.4f' % sess.run(mse))