# Testing hypothesis

Within this Jupyter notebook, we will finally use the test set. We will evaluate the classifier accuracies for the different classifiers we constructed. This will allow us to evaluate our hypotheses. First, let us load the packages and necessary data.

In [1]:
# Import necessary packages
import pandas as pd
import numpy as np
import sys
import random
import math
import gc
from functools import reduce
from nltk import FreqDist, ngrams, sent_tokenize, word_tokenize
from nltk.tokenize import word_tokenize
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ParameterGrid, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from scipy import sparse
import pickle

def save_object(obj, filename):
    with open(filename, 'wb') as output:  # Overwrites any existing file.
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)
        
def load_object(filename):
    with open(filename, 'rb') as f:
        x = pickle.load(f)
    return(x)

In [2]:
# Load the training data. Scramble the rows (sometimes this is important for training)
df = pd.read_csv("python_data/train",sep="\t",error_bad_lines=False,encoding="utf-8")
df = df.sample(frac=1, random_state = 54021)
df['native'] = np.where(df['native_lang']=='EN', "native", "non-native")

# Load the training data. Downsample non-English such that it is balanced.
print("Loading the training and test data...")
training = pd.concat([df[df.native == "non-native"].sample(sum(df.native == "native"), random_state = 1810), df[df.native=="native"]])
training = training.sample(frac=1, random_state = 1318910)
training.native = training.native.astype('category')

# Load the test data. Again, downsample such that it is balanced.
test = pd.read_csv("python_data/test",sep="\t",error_bad_lines=False,encoding="utf-8")
test['native'] = np.where(test['native_lang']=='EN', "native", "non-native")
test = pd.concat([test[test.native == "non-native"].sample(sum(test.native == "native"), random_state = 1), test[test.native=="native"]])
test.native = test.native.astype('category')

training.to_csv("python_data/training_final")
test.to_csv("python_data/test_final")

Loading the training and test data...


### The non-linear kernel vs. the linear SVM classifier.

In [2]:
def compute_similarity_score(dis_ngramdic, gramlist):
    """ This function computes the similarity scores for a comment based on the corresponding k-grams.
    Note that the comment is already tokenized into sentences.
    @dis_ngramdic: ngram dictionary as constructed by language_distribution for particular k.
    @gramlist: list of kgrams
    """
    score=0
    if gramlist:
        for gram in gramlist:
            score += math.log2(dis_ngramdic.get(gram,1))
    return score

colnames = None

def compute_all_features(lang_dis, original_text, clean_text, structure_text):
    """ This function compares the sentences and structure to each of the languages distributions. It returns
    similarity scores to each language model. Also included are other features, such as the number of sentences
    per text, etc.
    @lang_dis: Language distribution of n-grams.
    @clean_text: Text with proper nouns and demonyms substituted
    @structure_text: PoS structure retrieved by SENNA.
    """
    simscoredict=dict()
    
    # For each gramtype, first construct the list of which we can make n-grams.
    words_ps = list(word_tokenize(clean_text))
    struc_ps = list(word_tokenize(structure_text))
    wordlens_ps = [len(word) for word in word_tokenize(original_text) if word.isalpha()]
    
    # Now we should construct k-gram lists for each k and return the score. Let us store all grams in 
    for gramtype in lang_dis[list(lang_dis.keys())[0]].keys():
        
        # Select appropriate data type.
        if gramtype == "tags":
            ps = struc_ps
        elif gramtype =="words":
            ps = words_ps
        elif gramtype == "w_sizes":
            ps = wordlens_ps
        elif gramtype == "chars":
            ps = clean_text
        
        # We need to normalize with the sequence length.
        seq_len = len(ps)

        # For each k, feed the ngrams function into the compute_similarity_score function. 
        for k in range(1,len(lang_dis[list(lang_dis.keys())[0]][gramtype])+1):
            for lang in lang_dis.keys():
                simscoredict[lang+'_'+gramtype+'_'+str(k)]= compute_similarity_score(lang_dis[lang][gramtype][k], ngrams(ps,k))/seq_len
    
    # Set the other features they use in the paper.
    simscoredict["num_sentences"] = len(list(sent_tokenize(clean_text)))
    simscoredict["num_words"] = len(wordlens_ps)
    simscoredict["avg_wordlength"] = sum(wordlens_ps)/len(wordlens_ps)
        
    global colnames
    if colnames == None:
        colnames = list(simscoredict.keys())
            
    return simscoredict.values()

In [3]:
# Load the language distribution for the training data.
test = pd.read_csv("python_data/test_final")
lang_dis = load_object("trained_lang_dis")
lowerlim = 10

print("Computing the features for test data")
features = test.apply(lambda row: compute_all_features(lang_dis,row['text_original'],row['text_clean'], row['text_structure']), axis=1)
features = pd.DataFrame(features.to_frame()[0].values.tolist(), index=features.to_frame()[0].index, columns=colnames)
test = pd.merge(test, features, left_index=True, right_index=True)
print("Finished computing features")

# Clean up stuff we no longer need.
lang_dis.clear()
features = features.iloc[0:0]
test = test.drop(['text_original','text_clean','text_structure'], axis = 1)

# Write the training and test including their features to file.
test.to_csv("python_data/test_features_4_4_"+str(lowerlim))

Computing the features for test data


  if __name__ == '__main__':


Finished computing features


Now that we have our features, we get to the classifier. Let us train the linear SVM classifier for default settings and with the non-linear kernel using Bagging.

In [2]:
# Load the feature set.
training = pd.read_csv("python_data/training_features_4_4_10",index_col=0,header=0)
test = pd.read_csv("python_data/test_features_4_4_10",index_col = 0, header=0)
colnames = training.columns[3:]    #First column native language, second English level, third if native.
test.native = test.native.astype('category')
training.native = training.native.astype('category')

In [3]:
# Train the SVC classifier
linear = svm.LinearSVC(C=1, penalty="l1", dual=False)
linear.fit(training[colnames], training.native)
y_predicted = linear.predict(test[colnames])
accur = accuracy_score(test.native, y_predicted)
print("Accuracy on the test set: {}".format(accur))

Accuracy on the test set: 0.7196239381502338


In [4]:
# Scaling.
scaler = StandardScaler()
training[colnames] = scaler.fit_transform(training[colnames])
test_wordcounts = test.num_words
test[colnames] = scaler.transform(test[colnames])

# Train SVCs by bagging. 20 estimators will yield approximately 10.000 samples per SVM. This number should be feasible
# according to the documentation. We can speed up things by multithreading (n_jobs).
n_estimators = 20
clf = BaggingClassifier(svm.SVC(kernel='rbf', C=2**3, gamma=2**-11, cache_size=2000), random_state = 1281, max_samples=1.0 / n_estimators, n_jobs = 4, n_estimators=n_estimators)
print("Fitting the classifier")
clf.fit(training[colnames],training.native)
print("Prediction out-of-sample")
y_predicted = clf.predict(test[colnames])
print("Bagging SVC score:",accuracy_score(test.native, y_predicted))

# Save predictions.
y_predicted = pd.DataFrame(y_predicted, test.index)
y_predicted.columns = ["prediction"]
dfs = [test[["native_lang","level_english","native"]], y_predicted, pd.DataFrame(test_wordcounts, test.index)]
y_predicted = reduce(lambda left,right: pd.merge(left,right, left_index= True, right_index = True), dfs)
y_predicted.to_csv("output_SVM_RBF_classifier_TEST")

Fitting the classifier
Prediction out-of-sample
Bagging SVC score: 0.738808819319


## Classifying based on the different distributions.

Here, we evaluate accuracy on test data based on DTM. First, prepare features for test data.

In [2]:
def construct_lodds_ratio_dict(fname, lb):
    """ 
    In this function, we want to compute the odds ratio for each of the n-grams, and return a dictionary with these values.
    For each non-English language, we will add a pseudocount of .5 to prevent divisions by 0. We return an approximate lower
    bound at the alpha confidence level.
    @fname: File to load the language distribution from
    @alpha: level of alpha of lower bound. 
    """
    
    lang_dis = load_object(fname)
    
    n = len(lang_dis['EN']['words'])
    m = len(lang_dis['EN']['chars'])
    
    for lang in lang_dis.keys():
        if lang == 'EN':
            continue
        for gramtype in lang_dis[lang].keys():  
            for k in lang_dis[lang][gramtype].keys():
                b = sum(lang_dis[lang][gramtype][k].values()) + .5   #Total grams in foreign language
                d = sum(lang_dis['EN'][gramtype][k].values()) + .5   #Total grams in English
                for key in list(lang_dis[lang][gramtype][k].keys()):
                    
                    # Obtain the value by pop, i.e. delete key from dictionary.
                    a = lang_dis[lang][gramtype][k].pop(key,0) +.5   #Gram count for particular gram in foreign language
                    c = lang_dis['EN'][gramtype][k].get(key,0) +.5   #Gram count for particular gram in English
                    
                    if gramtype == "words" and "NNP" in key:
                        continue
                    
                    # If it occurs more often than the lower bound, set value to the lowerbound of odds ratio.
                    if a > lb:
                        lang_dis[lang][gramtype][k][key] = math.log((a*d)/(b*c))  # Calculate the log-odds ratio 
                    
    # Remove English from the language dictionary.
    lang_dis["EN"].clear()

    return(lang_dis)

In [3]:
lodds_ratio = construct_lodds_ratio_dict("trained_lang_dis_20_lang_ll10", 5)
gc.collect()

0

In [4]:
training = pd.read_csv("python_data/training_final")
test = pd.read_csv("python_data/test_final")

from nltk.tokenize.treebank import TreebankWordDetokenizer as Detok
detokenizer = Detok()
word_gram_list = []
char_gram_list = []
struc_gram_list = []
for lang in lodds_ratio.keys():
    if lang == "EN":
        continue
    for gramtype in lodds_ratio[lang].keys():
        for k in lodds_ratio[lang][gramtype].keys():
            for key,v in lodds_ratio[lang][gramtype][k].items():
                if v>math.log(6/5) or v<math.log(5/6):
                    if gramtype == "words":
                        word_gram_list.append(key)
                    if gramtype == "chars":
                        char_gram_list.append(key)
                    if gramtype =="tags":
                        struc_gram_list.append(key)
word_gram_list = set([detokenizer.detokenize(gram) for gram in set(word_gram_list)])
struc_gram_list = set([detokenizer.detokenize(gram) for gram in set(struc_gram_list)])
char_gram_list = set([''.join(gram) for gram in set(char_gram_list)])
lodds_ratio.clear()
gc.collect()
print(len(word_gram_list))
print(len(char_gram_list))
print(len(struc_gram_list))

130249
80360
60487


In [5]:
# create the transform for words, characters and structure.
word_vectorizer = CountVectorizer(ngram_range=(1, 4), vocabulary = word_gram_list, lowercase=False)
struc_vectorizer = CountVectorizer(ngram_range=(1, 4), vocabulary = struc_gram_list, lowercase=False)
char_vectorizer = CountVectorizer(ngram_range=(1, 4), vocabulary = char_gram_list, analyzer="char", lowercase=False)

In [6]:
test_word_vector = word_vectorizer.transform(test["text_clean"])
test_char_vector = char_vectorizer.transform(test["text_clean"])
test_struc_vector = struc_vectorizer.transform(test["text_structure"])
sparse.save_npz("test_word_vector",test_word_vector)
sparse.save_npz("test_struc_vector",test_struc_vector)
sparse.save_npz("test_char_vector",test_char_vector)

Classification starts here. Do it for all gram types.

In [2]:
training = pd.read_csv("python_data/training_final")
test = pd.read_csv("python_data/test_final")

training_grams = sparse.load_npz("word_vector.npz")
test_grams = sparse.load_npz("test_word_vector.npz")
clf = MultinomialNB().fit(training_grams, training.native)
predicted = clf.predict(test_grams)
accuracy = 1-sum(predicted != test.native)/len(predicted)
print("Classifying based on word-grams gives NB accuracy of {}%".format(accuracy))
clf = SGDClassifier(loss="hinge",penalty="l2",alpha=1e-4, random_state=42, max_iter=500, tol=None)
clf.fit(training_grams, training.native)
predicted_word = clf.predict(test_grams)
accuracy = 1-sum(predicted_word != test.native)/len(predicted_word)
print("Classifying based on word-grams with SVM with gradient descent gives accuracy of {}%".format(accuracy*100))

training_grams = sparse.load_npz("char_vector.npz")
test_grams = sparse.load_npz("test_char_vector.npz")
clf = MultinomialNB().fit(training_grams, training.native)
predicted = clf.predict(test_grams)
accuracy = 1-sum(predicted != test.native)/len(predicted)
print("Classifying based on character grams gives NB accuracy of {}%".format(accuracy))
clf = SGDClassifier(loss="hinge",penalty="l2",alpha=1e-4, random_state=42, max_iter=500, tol=None)
clf.fit(training_grams, training.native)
predicted_char = clf.predict(test_grams)
accuracy = 1-sum(predicted_char != test.native)/len(predicted_char)
print("Classifying based on character grams with SVM with gradient descent gives accuracy of {}%".format(accuracy*100))

training_grams = sparse.load_npz("struc_vector.npz")
test_grams = sparse.load_npz("test_struc_vector.npz")
clf = MultinomialNB().fit(training_grams, training.native)
predicted = clf.predict(test_grams)
accuracy = 1-sum(predicted != test.native)/len(predicted)
print("Classifying based on structure gives NB accuracy of {}%".format(accuracy))
clf = SGDClassifier(loss="hinge",penalty="l2",alpha=1e-4, random_state=42, max_iter=500, tol=None)
clf.fit(training_grams, training.native)
predicted_struc = clf.predict(test_grams)
accuracy = 1-sum(predicted_struc != test.native)/len(predicted_struc)
print("Classifying based on structure with SVM with gradient descent gives accuracy of {}%".format(accuracy*100))

Classifying based on word-grams gives NB accuracy of 0.6967643409372912%
Classifying based on word-grams with SVM with gradient descent gives accuracy of 72.24237218033151%
Classifying based on character grams gives NB accuracy of 0.6490884795265821%
Classifying based on character grams with SVM with gradient descent gives accuracy of 71.8208138462028%
Classifying based on structure gives NB accuracy of 0.5802392542394451%
Classifying based on structure with SVM with gradient descent gives accuracy of 60.45146511405937%


In [3]:
predicted_struc = pd.DataFrame(predicted_struc, test.index)
predicted_word = pd.DataFrame(predicted_word, test.index)
predicted_char = pd.DataFrame(predicted_char, test.index)
predicted_struc.columns = ["prediction_struc"]
predicted_char.columns = ["prediction_chars"]
predicted_word.columns = ["prediction_words"]

# Merge the predictions and test dataframe.
dfs = [test, predicted_struc, predicted_char, predicted_word]
df_final = reduce(lambda left,right: pd.merge(left,right, left_index= True, right_index = True), dfs)
df_final = df_final.drop(['text_original','text_structure'],1)
df_final.to_csv("output_DTM_classifiers_TEST")