# RATIO 2019 - Benchmarking Workshop

In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
import csv

# Task 1 - Same Side Classification



In [2]:
# Path templates for the two same-side-classification settings; the '{}'
# placeholder is filled with the split name ('training' or 'test') when loading.
data_cross_path = 'data/same-side-classification/cross-topic/{}.csv'
data_within_path = 'data/same-side-classification/within-topic/{}.csv'

### Load within-topics and cross-topics data

In [6]:
# Cross-topic data: the training file is parsed as fully quoted CSV with
# backslash escapes; the test file is read with pandas' default CSV options.
# NOTE(review): the training and test files use different parsing options -
# confirm the test CSVs really are in the default format.
cross_traindev_df = pd.read_csv(
    data_cross_path.format('training'),
    quotechar='"',
    quoting=csv.QUOTE_ALL,
    encoding='utf-8',
    escapechar='\\',
    doublequote=False,
    index_col='id',
)
cross_test_df = pd.read_csv(
    data_cross_path.format('test'),
    index_col='id',
)

# Within-topic data: same layout and parsing options as above.
within_traindev_df = pd.read_csv(
    data_within_path.format('training'),
    quotechar='"',
    quoting=csv.QUOTE_ALL,
    encoding='utf-8',
    escapechar='\\',
    doublequote=False,
    index_col='id',
)
within_test_df = pd.read_csv(
    data_within_path.format('test'),
    index_col='id',
)


'i would like to start off by thanking my opponent for his first debate, and hope that he has many more to come. claiming that something should be maintain simply because it is tradition is a common falacy, known as appeal to tradition. examples of how this is not an adequate reason to preserve something by itself are such issues as slavery, and discrimination of women. simply because that is the way things were, does not mean that is the way thing ought be. i would also like to question my opponent\'s claim that, "i believe it is the child who chooses to become gay in a more mature state of mind." if a child can simply "choose" to be gay, then they can "choose" to not be gay. also, if someone is capable of such a choice, then my opponent would be able to simply become gay for 30 minutes, then become straight again, as would i and anyone else reading and voting on this debate. as of now, i have not found the way to choose to be gay. even so, i don\'t see how choosing or not choosing to

In [None]:
# Adding a tag for the topics in focus: "gay marriage" and "abortion"
def add_tag(row):
    """Store the topic of interest found in ``row['topic']`` under ``row['tag']``.

    The topic text is lower-cased and stripped, then searched for the
    substrings 'abortion' and 'gay marriage', in that order. The first one
    found becomes the tag; if neither occurs the tag is 'NA'.

    The row is modified in place and also returned, so the function can be
    used with ``DataFrame.apply(add_tag, axis=1)``.
    """
    topic = row['topic'].lower().strip()

    label = 'NA'
    # Order matters: 'abortion' wins when both keywords occur in the topic.
    for keyword in ('abortion', 'gay marriage'):
        if keyword in topic:
            label = keyword
            break

    row['tag'] = label
    return row

# Tag every row of all four splits (cross/within x traindev/test) with its
# topic of interest by running add_tag row by row.
cross_traindev_df, cross_test_df, within_traindev_df, within_test_df = [
    split_df.apply(add_tag, axis=1)
    for split_df in (
        cross_traindev_df,
        cross_test_df,
        within_traindev_df,
        within_test_df,
    )
]



In [None]:
# List the distinct topic tags present in the within-topic training data.
within_traindev_df['tag'].unique()

In [None]:
# Inspect the second argument of one within-topic training pair, looked up by
# its id (id 74517 is another example that was examined here).
within_traindev_df.loc[82134, 'argument2']

In [None]:
# Show the within-topic training pairs whose topic was tagged 'gay marriage'.
# NOTE(review): a second filter condition (e.g. on is_same_side) may have been
# intended here - confirm.
within_traindev_df[within_traindev_df['tag'] == 'gay marriage']

In [None]:
#within_traindev_df[(within_traindev_df['tag'] == 'gay marriage') and (within_traindev_df['is_same_side'] == 'True')]

### Get an overview of each dataset

In [None]:

def _print_length_stats(heading, lengths, unit):
    """Print the shortest, longest and mean value of ``lengths`` under ``heading``."""
    print(heading)
    print('shortest argument:', min(lengths), unit)
    print('longest argument:', max(lengths), unit)
    print('argument average length:', np.mean(lengths), unit)


def get_overview(df, task='same-side', class_name='is_same_side'):
    """Print summary statistics for one dataset split.

    Reported, in order: the total number of rows, the number of rows per
    ``tag`` value (broken down by class when the class column exists), the
    number of rows per class value, the number of unique arguments, and the
    shortest / longest / average argument length in words and in sentences.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'tag', 'argument1' and 'argument2'.
    task : str
        Label printed in the header line.
    class_name : str
        Name of the class column; the per-class sections are skipped when
        ``df`` has no such column.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    # The class column may be absent; decide once instead of per topic.
    has_labels = class_name in df.columns

    print("Task: ", task, '\n\n')
    print('Total instances: ', len(df))
    print()
    print()

    print('For each topic:')
    # Group by the column label itself rather than a one-element list: with a
    # list, recent pandas versions yield 1-tuples as group keys, which would be
    # printed as "('abortion',)" instead of "abortion".
    for tag, tag_df in df.groupby('tag'):
        print(tag, ': ', len(tag_df), ' instances')

        if has_labels:
            for class_value, class_df in tag_df.groupby(class_name):
                print('\t\t', class_value, ': ', len(class_df), ' instances')

    print()
    print()

    if has_labels:
        print('For each class value:')
        for class_value, class_df in df.groupby(class_name):
            print(class_value, ': ', len(class_df), ' instances')

        print()
        print()

    print('Unique argument1:', len(df['argument1'].unique()))
    print('Unique argument2:', len(df['argument2'].unique()))

    # All argument texts: first every argument1, then every argument2.
    arguments = np.concatenate([df['argument1'].values, df['argument2'].values])

    print('Unique total arguments:', len(set(arguments)))
    print()
    print()

    word_counts = [len(word_tokenize(text)) for text in arguments]
    _print_length_stats('Words:', word_counts, ' words')

    sentence_counts = [len(sent_tokenize(text)) for text in arguments]
    _print_length_stats('Sentences:', sentence_counts, ' sentences')
   

In [None]:

# Print the summary statistics of the cross-topic training data.
get_overview(cross_traindev_df)

In [None]:
# Print the summary statistics of the within-topic training data.
get_overview(within_traindev_df)

## Train model - Baseline

### train dev set - 70% 30%

In [None]:
from sklearn.model_selection import train_test_split
import nltk
# Fetch the WordNet corpus needed for WordNet-based lemmatization
# (WordNetLemmatizer is used in lemmatize_stemming).
# NOTE(review): word_tokenize / sent_tokenize and nltk.pos_tag are also used in
# this notebook and rely on their own NLTK data packages - confirm those are
# installed in the target environment.
nltk.download('wordnet')
def get_train_test_sets(df):
    """Split ``df`` into 70% train and 30% held-out rows.

    Features are the columns 'argument1', 'argument2' and 'topic'; the label
    is the single-column frame 'is_same_side'. Rows are shuffled with a fixed
    seed (``random_state=1``) so the split is reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    feature_columns = ['argument1', 'argument2', 'topic']
    label_columns = ['is_same_side']

    parts = train_test_split(
        df[feature_columns],
        df[label_columns],
        test_size=0.30,
        random_state=1,
        shuffle=True,
    )
    return tuple(parts)
    


### lemmatizing

In [None]:
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize, word_tokenize
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer


def get_wordnet_pos(treebank_tag):
    """
    Translate a POS tag into the WordNet POS constant used for lemmatization.

    The tag is matched on its first letter: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb. Any other tag falls back to noun, which is
    the lemmatizer's default part of speech.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos

    # No known prefix: use the lemmatizer's default (noun).
    return wordnet.NOUN

def lemmatize_stemming(token, pos_tag):
    """Lemmatize ``token`` with WordNet, then stem the resulting lemma.

    ``pos_tag`` is the WordNet part of speech passed to the lemmatizer. The
    stemmer is the English Snowball stemmer
    (cf. Porter, M. "An algorithm for suffix stripping.").
    """
    lemma = WordNetLemmatizer().lemmatize(token, pos=pos_tag)
    english_stemmer = SnowballStemmer("english")
    return english_stemmer.stem(lemma)

def preprocess(text):
    """Turn raw text into a space-separated string of stemmed lemmas.

    The text is split into sentences; each sentence has its newlines replaced
    by spaces, is tokenized and POS-tagged. Tokens are lower-cased, and those
    that are gensim stop words or at most three characters long are dropped.
    Each remaining token is lemmatized according to its POS tag and stemmed.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    processed = []

    for sentence in sent_tokenize(text):
        words = word_tokenize(sentence.replace('\n', ' ').strip())
        # Tagging is done on the original-case tokens, before lower-casing.
        tagged = nltk.pos_tag(words)

        for word, (_, treebank_tag) in zip(words, tagged):
            lowered = word.lower()
            if lowered in stop_words or len(lowered) <= 3:
                continue
            processed.append(
                lemmatize_stemming(lowered, get_wordnet_pos(treebank_tag))
            )

    return ' '.join(processed)

In [None]:
def get_lemma(row):
    """Add the preprocessed form of both arguments to ``row``.

    ``row['argument1']`` and ``row['argument2']`` are run through
    ``preprocess`` and the results stored under 'argument1_lemmas' and
    'argument2_lemmas'. The row is modified in place and returned.
    """
    targets = (
        ('argument1', 'argument1_lemmas'),
        ('argument2', 'argument2_lemmas'),
    )
    for source_column, lemma_column in targets:
        row[lemma_column] = preprocess(row[source_column])
    return row



### Extracting n grams lemma for argument1 and argument2

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
def _vectorizer_feature_names(vectorizer):
    """Return the fitted vectorizer's feature names on old and new scikit-learn."""
    getter = getattr(vectorizer, 'get_feature_names_out', None)
    if getter is None:
        # Older scikit-learn releases only provide get_feature_names();
        # newer ones provide get_feature_names_out() and dropped the old name.
        getter = vectorizer.get_feature_names
    return list(getter())


def _ngram_count_frame(counts, feature_names, source_df, col, idx):
    """Build a frame of n-gram counts, prefixed with ``col`` and indexed by ``idx``."""
    frame = pd.DataFrame(counts.toarray(), columns=feature_names).add_prefix(col)

    # The count rows are positional (0..n-1); attach the id column by joining
    # on the index, which relies on ``source_df`` also having a 0..n-1 index.
    ids = source_df[[idx]]
    frame = frame.merge(ids, left_index=True, right_index=True, suffixes=(False, False), how='inner')
    return frame.set_index(idx)


def extract_ngrams(X_train, X_dev, col, idx='id'):
    """Extract n-gram count features of one text column for train and dev data.

    A ``CountVectorizer`` (3-grams only, ``min_df=600``, ``max_df=0.7``, at
    most 5000 features) is fitted on ``X_train[col]`` and applied to both
    ``X_train[col]`` and ``X_dev[col]``.

    Parameters
    ----------
    X_train, X_dev : pandas.DataFrame
        Must contain the text column ``col`` and the id column ``idx``, and
        are expected to have a default positional index (the counts are
        aligned with the ids through the index).
    col : str
        Name of the text column; also used as prefix of the feature columns.
    idx : str
        Name of the id column that becomes the index of the results.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, dev_df)``: one column per n-gram, indexed by ``idx``.
    """
    vectorizer = CountVectorizer(min_df=600, max_df=0.7, ngram_range=(3, 3), max_features=5000)

    train_counts = vectorizer.fit_transform(X_train[col])
    dev_counts = vectorizer.transform(X_dev[col])
    feature_names = _vectorizer_feature_names(vectorizer)

    train_df = _ngram_count_frame(train_counts, feature_names, X_train, col, idx)
    dev_df = _ngram_count_frame(dev_counts, feature_names, X_dev, col, idx)
    return train_df, dev_df

def extract_n_grams_features(X_train, X_dev, columns, idx='id'):
    """Collect the n-gram features of several text columns into one frame per split.

    Parameters
    ----------
    X_train, X_dev : pandas.DataFrame
        Frames whose index becomes the column ``idx`` after ``reset_index()``
        (i.e. the index is expected to be named ``idx``) and which contain
        every column listed in ``columns``.
    columns : iterable of str
        Text columns to run through ``extract_ngrams``.
    idx : str
        Name of the id column used to index and align the results.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(result_train_df, result_dev_df)``, indexed by ``idx``, holding the
        joined feature columns of all requested text columns.
    """
    # Move the id out of the index so it is available as a regular column and
    # the frames get a positional index.
    X_train = X_train.reset_index()
    X_dev = X_dev.reset_index()

    # Start from frames that carry only the id index; features are joined on.
    result_train_df = X_train[[idx]].set_index(idx)
    result_dev_df = X_dev[[idx]].set_index(idx)

    for col in columns:
        # Forward ``idx`` so a non-default id column name is honoured.
        train_features, dev_features = extract_ngrams(X_train, X_dev, col, idx=idx)
        result_train_df = result_train_df.join(train_features)
        result_dev_df = result_dev_df.join(dev_features)

    return result_train_df, result_dev_df



### Train model and evaluate

In [None]:
from sklearn.svm import SVC  
from sklearn.metrics import classification_report, confusion_matrix , accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler

def train_test_svm(X_train, y_train, X_test):
    """Train a linear SVM on scaled features and predict labels for ``X_test``.

    A ``StandardScaler`` without mean centering is fitted on ``X_train`` and
    applied to both feature sets; an ``SVC`` with a linear kernel is then
    fitted on the scaled training data.

    Returns
    -------
    The predictions of the classifier for the scaled ``X_test``.
    """
    feature_scaler = StandardScaler(copy=True, with_mean=False)
    scaled_train = feature_scaler.fit_transform(X_train)

    classifier = SVC(kernel='linear')
    classifier.fit(scaled_train, y_train)

    # The test features are scaled with the statistics learned from training.
    scaled_test = feature_scaler.transform(X_test)
    return classifier.predict(scaled_test)
def report_training_results(y_test, y_pred):
    """Print evaluation metrics and return the rounded F1 scores.

    Prints the confusion matrix, the accuracy (rounded to two decimals) and
    the classification report for the predictions ``y_pred`` against the true
    labels ``y_test``.

    Returns
    -------
    dict
        ``{'macro': ..., 'micro': ...}`` with the macro- and micro-averaged
        F1 scores, each rounded to two decimals.
    """
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    print()

    accuracy = round(accuracy_score(y_test, y_pred), 2)
    print('Accuracy: ', accuracy)
    print()

    print('Report:')
    print(classification_report(y_test, y_pred))

    return {
        averaging: round(f1_score(y_true=y_test, y_pred=y_pred, average=averaging), 2)
        for averaging in ('macro', 'micro')
    }



### Cross topic - Training and evaluating model 

In [None]:
# 1. Getting train and dev data (70% / 30% split of the cross-topic training set)
X_train, X_dev, y_train, y_dev = get_train_test_sets(cross_traindev_df)


# 2. Lemmatizing argument1 and argument2 (adds the *_lemmas columns row by row)
X_train = X_train.apply(get_lemma, axis=1)
X_dev = X_dev.apply(get_lemma, axis=1)

# 3. Extracting features - n-gram counts over the lemmatized arguments
# NOTE(review): extract_ngrams configures its vectorizer with
# ngram_range=(3, 3), i.e. 3-grams only rather than 1-3 grams - confirm which
# is intended.
X_train_, X_dev_ = extract_n_grams_features(X_train, X_dev, columns=['argument1_lemmas', 'argument2_lemmas'])

# 4. Train the linear SVM on the train part and predict the dev part
y_pred = train_test_svm(X_train_, y_train, X_dev_)

# 5. Evaluate the dev predictions (prints the metrics, returns the F1 dict)
report_training_results(y_dev, y_pred)