In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/contradictory-my-dear-watson/sample_submission.csv
/kaggle/input/contradictory-my-dear-watson/train.csv
/kaggle/input/contradictory-my-dear-watson/test.csv
/kaggle/input/translatedcsv/train_translated.csv
/kaggle/input/translatedcsv/test_translated.csv


In [2]:
# Read the original competition train/test CSV files into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/contradictory-my-dear-watson/train.csv",
        "../input/contradictory-my-dear-watson/test.csv",
    )
)

In [3]:
# Distinct values of the `language` column in the training data.
train_data.language.unique()

array(['English', 'French', 'Thai', 'Turkish', 'Urdu', 'Russian',
       'Bulgarian', 'German', 'Arabic', 'Chinese', 'Hindi', 'Swahili',
       'Vietnamese', 'Spanish', 'Greek'], dtype=object)

In [4]:
from nltk.corpus import stopwords
# Show which languages the NLTK stopwords corpus provides lists for.
stopword_languages = stopwords.fileids()
print(stopword_languages)

['arabic', 'azerbaijani', 'bengali', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hungarian', 'indonesian', 'italian', 'kazakh', 'nepali', 'norwegian', 'portuguese', 'romanian', 'russian', 'slovene', 'spanish', 'swedish', 'tajik', 'turkish']


Dataset languages with no NLTK stopword list: Thai, Urdu, Bulgarian, Chinese, Hindi, Swahili, Vietnamese.

In [5]:
# Read the pre-translated train/test CSV files.
# NOTE: this rebinds `train_data` / `test_data`, replacing whatever they held before.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/translatedcsv/train_translated.csv",
        "../input/translatedcsv/test_translated.csv",
    )
)

In [6]:
# Reduced views of the data: the two text columns, plus the label for training rows.
text_columns = ['premise', 'hypothesis']
r_train_data = train_data[text_columns + ['label']]
r_test_data = test_data[text_columns]

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

Prefixes and suffixes can be important for identifying contradictory sentences.

In [8]:
# English stopwords from NLTK, kept as a set for de-duplication / fast membership tests.
stop_words = set(stopwords.words('english'))
# CountVectorizer's `stop_words` parameter is documented to accept 'english', a list,
# or None. Newer scikit-learn releases validate this and reject a set, so pass a
# (sorted, hence deterministic) list instead.
vectorizer = CountVectorizer(stop_words=sorted(stop_words))

In [9]:
# Learn one shared vocabulary from both text columns of the training data.
fit_corpus = pd.concat([train_data['premise'], train_data['hypothesis']])
vectorizer.fit(fit_corpus)

CountVectorizer(stop_words={'a', 'about', 'above', 'after', 'again', 'against',
                            'ain', 'all', 'am', 'an', 'and', 'any', 'are',
                            'aren', "aren't", 'as', 'at', 'be', 'because',
                            'been', 'before', 'being', 'below', 'between',
                            'both', 'but', 'by', 'can', 'couldn', "couldn't", ...})

In [10]:
# Number of distinct terms learned by the fitted vectorizer.
# `get_feature_names()` was deprecated and later removed from scikit-learn;
# the fitted `vocabulary_` mapping (term -> column index) has the same length
# and is available in every version.
len(vectorizer.vocabulary_)



17765

In [11]:
# Encode each text column with the already-fitted vectorizer, so every
# resulting matrix uses the same vocabulary columns.
train_premise, train_hypothesis = (
    vectorizer.transform(train_data[column]) for column in ('premise', 'hypothesis')
)

test_premise, test_hypothesis = (
    vectorizer.transform(test_data[column]) for column in ('premise', 'hypothesis')
)

In [12]:
# Sanity check: print the (rows, columns) shape of each encoded matrix.
for encoded in (train_premise, train_hypothesis, test_premise, test_hypothesis):
    print(encoded.shape)

(12120, 17765)
(12120, 17765)
(5195, 17765)
(5195, 17765)


In [13]:
from scipy.sparse import hstack

In [14]:
# Put the premise and hypothesis encodings side by side (column-wise),
# giving one feature row per premise/hypothesis pair.
train_blocks = [train_premise, train_hypothesis]
test_blocks = [test_premise, test_hypothesis]
final_train = hstack(train_blocks)
final_test = hstack(test_blocks)

In [15]:
# Sanity check: print the shape of each stacked feature matrix.
for stacked in (final_train, final_test):
    print(stacked.shape)

(12120, 35530)
(5195, 35530)


In [16]:
from sklearn.ensemble import GradientBoostingClassifier

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
# Hold out 20% of the labelled rows for evaluation. Note that `x_test` / `y_test`
# are this labelled hold-out split, not the competition test set (`final_test`).
split_options = dict(test_size=0.2, shuffle=True, random_state=0)
x_train, x_test, y_train, y_test = train_test_split(
    final_train, train_data['label'], **split_options
)

In [19]:
# Gradient-boosted trees: 300 shallow (depth-2) trees with a small learning rate.
# `random_state` is fixed so that repeated runs of the notebook fit the same model.
clf = GradientBoostingClassifier(n_estimators=300, max_depth=2, learning_rate=0.03,
                                 random_state=0)

In [20]:
# Train the classifier on the training portion of the split; as the last
# expression of the cell, the fitted estimator's repr is displayed.
clf.fit(x_train, y_train)

GradientBoostingClassifier(learning_rate=0.03, max_depth=2, n_estimators=300)

In [21]:
# Print the classifier's score on the training split first, then on the hold-out split.
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(clf.score(features, labels))

0.4591584158415842
0.3882013201320132


In [22]:
# Experiment log: each entry gives the settings tried, followed by two scores
# (presumably training accuracy, then hold-out accuracy, in the order the scoring cell prints them).
# n_estimators=100, default
# 0.503197194719472
# 0.3882013201320132
# n_estimators=200
# 0.564253300330033
# 0.38283828382838286
# n_estimators=200, max_depth=2
# 0.5106229372937293
# 0.38613861386138615
# GradientBoostingClassifier(max_depth=2, n_estimators=300)
# 0.5429042904290429
# 0.3811881188118812
# GradientBoostingClassifier(learning_rate=0.01, max_depth=2, n_estimators=300)
# 0.4178011551155115
# 0.375


In [23]:
# `clf.predict` already returns one class label per row. The previous
# `np.argmax(i)` over each scalar label always evaluated to 0, so every
# submitted prediction was class 0 regardless of the model's output.
predictions = clf.predict(final_test)

In [24]:
# Build the submission table: the test-set ids alongside the predicted labels.
submission = test_data[['id']].copy()
submission['prediction'] = predictions

# Preview the first rows.
submission.head()

Unnamed: 0,id,prediction
0,c6d58c3f69,0
1,cefcc82292,0
2,e98005252c,0
3,58518c10ba,0
4,c32b0d16df,0


In [25]:
# Write the submission to the working directory; index=False keeps the
# DataFrame index out of the file so only the frame's columns are written.
submission.to_csv("submission.csv", index = False)