# IHLT Final Project: Semantinc Textual Similarity
Jordi Armengol - Joan LLop

## Data collection
We start by downloading the SemEval 2012 dataset.

In [1]:
!mkdir -p data
!wget https://gebakx.github.io/ihlt/sts/resources/train.tgz --directory-prefix=data
!wget https://gebakx.github.io/ihlt/sts/resources/test-gold.tgz --directory-prefix=data
%cd data
!tar zxvf train.tgz
!tar zxvf test-gold.tgz
%cd ..

--2019-12-06 13:39:06--  https://gebakx.github.io/ihlt/sts/resources/train.tgz
Resolving gebakx.github.io (gebakx.github.io)... 185.199.111.153, 185.199.110.153, 185.199.109.153, ...
Connecting to gebakx.github.io (gebakx.github.io)|185.199.111.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 125822 (123K) [application/octet-stream]
Saving to: ‘data/train.tgz.4’


2019-12-06 13:39:06 (2,91 MB/s) - ‘data/train.tgz.4’ saved [125822/125822]

--2019-12-06 13:39:06--  https://gebakx.github.io/ihlt/sts/resources/test-gold.tgz
Resolving gebakx.github.io (gebakx.github.io)... 185.199.111.153, 185.199.110.153, 185.199.109.153, ...
Connecting to gebakx.github.io (gebakx.github.io)|185.199.111.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118094 (115K) [application/octet-stream]
Saving to: ‘data/test-gold.tgz.4’


2019-12-06 13:39:07 (2,66 MB/s) - ‘data/test-gold.tgz.4’ saved [118094/118094]

/home/nhikia/Documents/AI/IHLT/IHLT-MAI/lab/

## Corpus assembly

In [2]:
import os
import numpy as np
train_files = ['MSRpar', 'MSRvid', 'SMTeuroparl']
train_data = []
train_labels = []
for file in train_files:
    with open(os.path.join('data', 'train', 'STS.input.' + file + '.txt'), 'r') as f:
        train_data += [sent.split('\t') for sent in f.readlines()]
    with open(os.path.join('data', 'train', 'STS.gs.' + file + '.txt'), 'r') as f:
        train_labels += [float(num) for num in f.readlines()]
train_data = np.array(train_data)
train_labels = np.array(train_labels)

test_files = ['MSRpar', 'MSRvid', 'SMTeuroparl', 'surprise.OnWN', 'surprise.SMTnews']
test_data = []
test_labels = []
for file in test_files:
    with open(os.path.join('data', 'test-gold', 'STS.input.' + file + '.txt'), 'r') as f:
        test_data += [sent.split('\t') for sent in f.readlines()]
    with open(os.path.join('data', 'test-gold', 'STS.gs.'+ file + '.txt'), 'r') as f:
        test_labels += [float(num) for num in f.readlines()]
test_data = np.array(test_data)
test_labels = np.array(test_labels)


## Alternative 1: Linguistic feature engineering and classical machine learning

### Extract features

In [3]:
import nltk

def sent2features(sent):
    features = []
    features += nltk.word_tokenize(sent)
    # ...
    return features


pairs_of_features = [(sent2features(sent1), sent2features(sent2)) for sent1, sent2 in train_data]

### Compute distances between features

In [4]:
from nltk.metrics import jaccard_distance

def distance(features1, features2):
    distances = []
    distances.append(jaccard_distance(set(features1), set(features2)))
    # ...
    return np.array(distances)


distances = np.array([distance(features1, features2) for features1, features2 in pairs_of_features])

### Linear regression

In [5]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(distances, train_labels)

## Validate

In [6]:
from scipy import stats
import sklearn
def evaluate(true_labels, predicted_labels):
    pearson, p_value = stats.pearsonr(true_labels, predicted_labels)
    return pearson, p_value
def cross_validate(data, labels, model, n_folds=5, seed=1):
    kf = sklearn.model_selection.KFold(n_splits=n_folds, random_state=seed)
    average_pearson = 0
    for train_index, val_index in kf.split(data):
        X_train, X_val = data[train_index], data[val_index]
        y_train, y_val = labels[train_index], labels[val_index]
        m = model.fit(X_train, y_train)
        predicted_labels = model.predict(X_val)
        pearson, _ = evaluate(y_val, predicted_labels)
        average_pearson += pearson
    return average_pearson/n_folds

In [7]:
cross_validate(distances, train_labels, LinearRegression())

0.4109743529779384

## Alternative 2: Transfer learning