In [1]:
import pickle
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

# Root directory of the challenge data; every split lives in its own sub-folder.
DATA_PATH = '../petite-difference-challenge2/'


def _read_lines(path):
    """Return the lines of the text file at ``path`` without their trailing '\\n'.

    The file is opened with ``newline='\\n'`` so that only '\\n' terminates a
    line; any other line-ending characters stay inside the returned strings.
    The handle is closed as soon as the lines have been read.
    """
    with open(path, 'r', newline='\n') as fh:
        return [line.rstrip('\n') for line in fh]


# One raw input document per line, for each data split.
train_f = _read_lines(DATA_PATH + 'train/in.tsv')
dev_f = _read_lines(DATA_PATH + 'dev-0/in.tsv')
dev1_f = _read_lines(DATA_PATH + 'dev-1/in.tsv')
test_f = _read_lines(DATA_PATH + 'test-A/in.tsv')

# Learn the TF-IDF vocabulary and weights on the training documents only,
# then project every other split into that same feature space.
vectorizer = TfidfVectorizer()
train_X = vectorizer.fit_transform(train_f)

dev_X, dev1_X, test_X = (
    vectorizer.transform(documents)
    for documents in (dev_f, dev1_f, test_f)
)

def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path``, closing the file even if pickling fails."""
    with open(path, 'wb') as fh:
        pickle.dump(obj, fh)


# Cache the fitted vectorizer and the vectorized splits so that the training
# part below can be run without repeating the preprocessing.
_dump_pickle(vectorizer, 'vectorizer.pkl')
_dump_pickle(train_X, 'train_X.pkl')
_dump_pickle(dev_X, 'dev0_X.pkl')
_dump_pickle(dev1_X, 'dev1_X.pkl')
_dump_pickle(test_X, 'test_X.pkl')

# END OF PREPROCESS

# TRAIN MODEL

def _load_pickle(path):
    """Unpickle and return the object stored at ``path``; the file is closed afterwards.

    NOTE: ``pickle.load`` can execute arbitrary code, so only use this on
    files this notebook wrote itself (as is the case for the caches below).
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Reload the cached feature matrices written by the preprocessing step.
# The dev-0 matrix is named `dev0_X` from here on.
train_X = _load_pickle('train_X.pkl')
dev0_X = _load_pickle('dev0_X.pkl')
dev1_X = _load_pickle('dev1_X.pkl')
test_X = _load_pickle('test_X.pkl')

def _read_labels(path):
    """Read one integer label per line from ``path`` and return them as a NumPy array.

    The file is closed as soon as it has been parsed. A line that is not a
    valid integer raises ``ValueError``.
    """
    with open(path, newline='\n') as fh:
        return np.array([int(line.rstrip('\n')) for line in fh])


# Gold labels for the training split and for the dev-0 split.
train_y = _read_labels(DATA_PATH + 'train/expected.tsv')
dev_y = _read_labels(DATA_PATH + 'dev-0/expected.tsv')

In [2]:
# Gradient-boosted tree classifier; every hyper-parameter except n_jobs is
# left at the library default.
model = XGBClassifier(n_jobs = 8)

In [3]:
# Fit on the training split. The dev-0 split is passed as eval_set so that its
# error is reported after every boosting round; no early stopping is requested
# in this call.
model.fit(train_X, train_y,eval_set=[(dev0_X, dev_y)])

[0]	validation_0-error:0.46616
[1]	validation_0-error:0.45582
[2]	validation_0-error:0.44989
[3]	validation_0-error:0.44742
[4]	validation_0-error:0.44283
[5]	validation_0-error:0.44296
[6]	validation_0-error:0.44014
[7]	validation_0-error:0.43645
[8]	validation_0-error:0.43516
[9]	validation_0-error:0.43395
[10]	validation_0-error:0.43324
[11]	validation_0-error:0.43274
[12]	validation_0-error:0.43142
[13]	validation_0-error:0.43004
[14]	validation_0-error:0.42873
[15]	validation_0-error:0.42701
[16]	validation_0-error:0.42576
[17]	validation_0-error:0.42510
[18]	validation_0-error:0.42337
[19]	validation_0-error:0.42219
[20]	validation_0-error:0.42113
[21]	validation_0-error:0.42072
[22]	validation_0-error:0.41946
[23]	validation_0-error:0.41932
[24]	validation_0-error:0.41796
[25]	validation_0-error:0.41744
[26]	validation_0-error:0.41667
[27]	validation_0-error:0.41598
[28]	validation_0-error:0.41562
[29]	validation_0-error:0.41487
[30]	validation_0-error:0.41418
[31]	validation_0-

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=8, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)

In [4]:
# Persist the fitted model; `with` guarantees the file is flushed and closed.
with open('model.pkl','wb') as model_file:
    pickle.dump(model, model_file)

### EVALUATE
predicted_y = model.predict(dev0_X)

print('dev score:')
# accuracy_score is documented as (y_true, y_pred): gold labels go first.
print(accuracy_score(dev_y, predicted_y))

dev score:
0.6119623636337155


In [5]:
# Second column of predict_proba (the probability of the class at index 1)
# for every dev-0 row.
# NOTE(review): `predicted_y` is not read again in the visible cells; the
# predict() helper below recomputes the probabilities itself.
predicted_y = model.predict_proba(dev0_X)[:,1]
### OUTPUT
def predict(X,out_file_path):
    """Write the model's class-1 probabilities for ``X`` to ``out_file_path``.

    Uses the module-level ``model``. One probability is written per line, in
    the row order of ``X``, formatted with ``str()``.

    Parameters
    ----------
    X : feature matrix accepted by ``model.predict_proba``.
    out_file_path : path of the text file to create or overwrite.
    """
    # Predict before opening the file so that a failing prediction does not
    # leave behind an empty or truncated output file.
    probabilities = model.predict_proba(X)[:,1]

    # `with` closes the file even if a write raises.
    with open(out_file_path,'w') as f_out:
        for p in probabilities:
            f_out.write(str(p) + '\n')

# Write one probability file per split.
# NOTE(review): these output paths are relative to the working directory, not
# to DATA_PATH — confirm that this is the intended location.
for split_X, split_out_path in (
    (dev0_X, 'dev-0/out.tsv'),
    (dev1_X, 'dev-1/out.tsv'),
    (test_X, 'test-A/out.tsv'),
):
    predict(split_X, split_out_path)