# Problem description

Cross-lingual document classification (CLDC) is the text mining problem where we are given:
- labeled documents for training in a source language $\ell_1$, and 
- test documents written in a target language $\ell_2$. 

For example, the training documents are written in English, and the test documents are written in French. 


CLDC is an interesting problem. The hope is that we can use resource-rich languages to train models that can be applied to resource-deprived languages. This would result in transferring knowledge from one language to another. 
There are several methods that can be used in this context. In this workshop we start from naive approaches and progressively introduce more complex solutions. 

The most naive solution is to ignore the fact the training and test documents are written in different languages.  

In [33]:
import pandas as pd
from ast import literal_eval
from sklearn.metrics import accuracy_score,f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y
from sklearn.utils.multiclass import unique_labels
from sklearn.feature_extraction.text import CountVectorizer

from collections import Counter
from utils import *

1. Dataset: holds the data of sources and target language
2. System: This is a set of steps: Does fit, predict. Can be in the form of a pipeline also
3. Experiment: Given a Dataset and a System it fits, predicts and reports evaluation scores

In [40]:
class Dataset:
    """Experiment class, that reads data in raw format and prints stats."""
    def __init__(self, pathtodata,source_lang, target_lang):
        self.source_lang = source_lang
        self.target_lang = target_lang
        self.tr_path = pathtodata+"semeval15.%s.train.csv" % source_lang
        self.te_path = pathtodata+"semeval15.%s.test.csv" % target_lang
    
    @staticmethod
    def read_csv(path):
        df = pd.read_csv(path)
        df['polarities'] = df['polarities'].apply(lambda l: literal_eval(l))
        df = df.loc[df.polarities.astype(bool)]
        df['sentiment'] = df['polarities'].apply(lambda l: Counter(l).most_common(1)[0][0])
        return df[['text', 'sentiment']]

    def load_data(self):
        training = self.read_csv(self.tr_path)
        test = self.read_csv(self.te_path)
        print("\nTraining data\n==========")
        self.calculate_stats(training)
        print("\nTraining data\n==========")
        self.calculate_stats(test)
        self.train, self.y_train = training.text.values, training.sentiment.values
        self.test, self.y_test = test.text.values, test.sentiment.values

    # Function to load the Cross-lingual embeddings for each language
    def load_cl_embeddings(self,path_to_embeddings,dimension,skip_header):
        self.vocab_source = fit_vocab(self.train)
        self.vocab_target = fit_vocab(self.test)
        
        # full vocabulary
        self.vocab_ = fit_vocab(np.concatenate((self.train,self.test)))
        
        self.source_embeddings = load_embeddings(path_to_embeddings+"concept_net_1706.300."+self.source_lang, dimension,skip_header=skip_header,vocab=self.vocab_)
        self.target_embeddings = load_embeddings(path_to_embeddings+"concept_net_1706.300."+self.target_lang, dimension,skip_header=skip_header,vocab=self.vocab_)
        
        self.source_embeddings = sort_embeddings(self.source_embeddings,self.vocab_)
        self.target_embeddings = sort_embeddings(self.target_embeddings,self.vocab_)
        
        
    def calculate_stats(self, df):
        print("Training Data Shape: ", df.shape)
        print("Class distribution: ", df.sentiment.value_counts().to_dict())

        
class Runner:
    def __init__(self, pipeline, experiment):
        self.pipeline = pipeline
        self.experiment = experiment
        #self.experiment.load_data()
        
    def score(self, preds):
        #return accuracy_score(exp.y_test, preds)
        return f1_score(exp.y_test, preds,average="macro")
    def eval_system(self):
        pipeline.fit(exp.train, exp.y_train)
        preds = pipeline.predict(exp.test)
        scores = self.score(preds)
        return scores

In [47]:
class nBowClassifier(BaseEstimator, ClassifierMixin):

    def __init__(self, base_classifier = "knn",V_source=None,V_target=None,params={}):
        self.base_classifier = base_classifier
        self.V_source = V_source
        self.V_target = V_target
        self.params = params
        
        if base_classifier =="knn":
            self.clf = KNeighborsClassifier(**params)
        #elif base_classifier=="mlp" #TODO: add support for an MPL classifier
        else:
            raise ValueError("Unknown base classifier")

    # neural bag-of-words baseline
    # average word embeddings of each document
    # V_emb: this holds the embedding of each word
    # X: the vectorized array of documents. Note that the indices of the features should correspond to the same indices in the V_emb array
    def _nBOW(self,V_emb,X):
        X_avg = []
        for doc in X:
            doc_vecs = V_emb[doc.indices,:]
            avg_vec = np.sum((doc_vecs*doc.data[:,np.newaxis]),axis=0)/(doc.data.sum() + 1.0)
            X_avg.append(avg_vec)
        
        return np.array(X_avg)
    
    def fit(self, X, y):

        # Store the classes seen during fit
        self.classes_ = unique_labels(y)
        
        X_avg = self._nBOW(self.V_source,X)
        self.n_samples, self.n_features = X_avg.shape
        
        # Check that X and y have correct shape
        X_avg, y = check_X_y(X_avg, y,accept_sparse=False)
        
        self.clf.fit(X_avg,y)
        # Return the classifier
        return self

    def predict(self, X):

        # Check is fit had been called
        # check_is_fitted(self, ['X_', 'y_'])

        # Input validation
        #X = check_array(X)
        X_avg = self._nBOW(self.V_target,X)
        predictions = self.clf.predict(X_avg)
        
        return predictions

In [42]:
exp = Dataset("../NLP/sentiment_classification/data/clean_data/","en", "es")
exp.load_data()
exp.load_cl_embeddings("../NLP/sentiment_classification/embeddings/",300,False)



Training data
Training Data Shape:  (1708, 2)
Class distribution:  {'positive': 1114, 'negative': 521, 'neutral': 73}

Training data
Training Data Shape:  (677, 2)
Class distribution:  {'positive': 455, 'negative': 189, 'neutral': 33}
Loaded 3383 vectors
Loaded 1169 vectors


In [43]:
# Majority Class
pipeline = Pipeline([('vectorizer', CountVectorizer()), 
                     ('classifier', DummyClassifier())])
runner = Runner(pipeline, exp)
runner.eval_system()

0.327541248060116

In [55]:
# Logistic Regression on words
pipeline = Pipeline([('vectorizer', CountVectorizer(lowercase=True)), 
                     ('classifier', LogisticRegression(solver="lbfgs"))])
runner = Runner(pipeline, exp)
runner.eval_system()

  'precision', 'predicted', average, warn_for)


0.38649930974693864

In [60]:
params = {"n_neighbors":5}
avg_baseline = nBowClassifier("knn",exp.source_embeddings,exp.target_embeddings,params)

pipeline = Pipeline([('vectorizer', CountVectorizer(lowercase=True,vocabulary=exp.vocab_)), 
                     ('classifier', avg_baseline)])

runner = Runner(pipeline, exp)
runner.eval_system()

0.49088213651906676