In [135]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression


In [136]:
df = pd.read_csv('data/dataset.csv', index_col=0)
df.head()

Unnamed: 0,es_score,okapi_bm25_score,okapi_tf_score,tf_idf_score,unigram_lml_score,unigram_lmjm_score,label
54_AP890105-0043,10.452501,13.428859,1.072125,3.782228,-5021.922883,-5008.00427,0
54_AP890117-0119,18.216175,30.822512,2.156571,7.506971,-4030.703824,-4010.5064,1
54_AP890118-0061,18.179998,29.973182,2.143202,7.400081,-4031.040099,-4010.678237,1
54_AP890118-0257,12.329309,14.983056,1.520388,4.390235,-3044.986408,-3014.381596,0
54_AP890121-0147,13.479685,22.624145,1.430008,5.526751,-5020.354322,-5006.538379,0


In [137]:
rs = RobustScaler()
columns = df.columns.tolist()
columns.remove('label')
df[columns] = rs.fit_transform(df[columns])
df.head()


Unnamed: 0,es_score,okapi_bm25_score,okapi_tf_score,tf_idf_score,unigram_lml_score,unigram_lmjm_score,label
54_AP890105-0043,0.797755,0.605718,0.431576,0.769853,-0.332638,-0.333874,0
54_AP890117-0119,2.724926,3.380739,2.237865,3.275017,-0.003333,-0.001533,1
54_AP890118-0061,2.715946,3.245235,2.215598,3.203126,-0.003445,-0.001591,1
54_AP890118-0257,1.263634,0.853678,1.178217,1.178782,0.324144,0.33035,0
54_AP890121-0147,1.549191,2.072753,1.027679,1.943174,-0.332117,-0.333386,0


In [138]:
columns

['es_score',
 'okapi_bm25_score',
 'okapi_tf_score',
 'tf_idf_score',
 'unigram_lml_score',
 'unigram_lmjm_score']

In [139]:
queries = ['54', '56', '57', '58', '59', '60', '61', '62', '63', '64', '68', '71', '77', '80', '85', '87', '89', '91', '93', '94', '95', '97', '98', '99', '100']
train_queries, test_queries = train_test_split(queries, test_size=0.2, random_state=42)
train_queries = set(train_queries)
test_queries = set(test_queries)
print("Training Queries:", train_queries)
print("Testing Queries:", test_queries)


Training Queries: {'60', '59', '57', '62', '56', '77', '95', '85', '98', '64', '80', '93', '94', '68', '87', '58', '61', '91', '97', '100'}
Testing Queries: {'89', '54', '71', '99', '63'}


In [140]:
train_df, test_df = df[df.index.str[:2].isin(train_queries)], df[df.index.str[:2].isin(test_queries)]

In [141]:
x_train, y_train = train_df[columns], train_df['label']
x_test, y_test = test_df[columns], test_df['label']

In [142]:
x_train.head()

Unnamed: 0,es_score,okapi_bm25_score,okapi_tf_score,tf_idf_score,unigram_lml_score,unigram_lmjm_score
56_AP890104-0107,1.052976,1.211169,1.107214,1.319019,-8.6e-05,-0.000151
56_AP890104-0250,1.052976,1.211169,1.107214,1.319019,-8.6e-05,-0.000151
56_AP890106-0247,0.671672,0.591849,0.848825,0.709953,0.328168,0.331411
56_AP890120-0253,1.179376,0.803676,1.144918,1.104195,0.328124,0.331509
56_AP890207-0226,0.574483,0.436053,0.546138,0.34724,0.328206,0.331325


In [143]:
print(y_train.value_counts())
print(y_test.value_counts())

label
0    19000
1      719
Name: count, dtype: int64
label
0    5000
1     244
Name: count, dtype: int64


In [144]:
lr = LogisticRegression(max_iter=1000, solver='liblinear', C=0.01, penalty='l1')
lr.fit(x_train, y_train)


In [145]:
res = lr.predict(x_test)
print(accuracy_score(y_test, res))


0.9530892448512586


In [146]:
def write_to_file(filename, data):
    with open(filename, 'w') as f:
        for i, (q, doc, score) in enumerate(data):
            f.write(f'{q} Q0 {doc} {i+1} {score} Exp\n')

In [147]:
# testing
probabilities = lr.predict_proba(x_test)[:, 1]
data = list(zip(x_test.index.str[:2], x_test.index.str[3:] ,probabilities))
data.sort(reverse=True, key = lambda x: x[2])
write_to_file('../Results/lr_probabilities.txt', data)

In [148]:
# training
probabilities = lr.predict_proba(x_train)[:, 1]
data = list(zip(x_train.index.str[:2], x_train.index.str[3:] ,probabilities))
data.sort(reverse=True, key = lambda x: x[2])
write_to_file('../Results/lr_probabilities_training.txt', data)

In [149]:
for i in range(5):
    train_queries, test_queries = queries[:5*i]+queries[5*(i+1):], queries[5*i:5*(i+1)]
    train_df, test_df = df[df.index.str[:2].isin(train_queries)], df[df.index.str[:2].isin(test_queries)]
    x_train, y_train = train_df[columns], train_df['label']
    x_test, y_test = test_df[columns], test_df['label']
    lr = LogisticRegression(max_iter=1000, solver='liblinear', C=0.01, penalty='l1')
    lr.fit(x_train, y_train)
    res = lr.predict(x_test)
    probabilities = lr.predict_proba(x_test)[:, 1]
    data = list(zip(x_test.index.str[:2], x_test.index.str[3:] ,probabilities))
    write_to_file('../Results/lr_probabilities_'+str(i)+'.txt', data)