In [1]:
#!/usr/bin/python2.7

import re
import nltk
import pandas as pd
import random
import numpy as np
import csv
from collections import defaultdict

stopwords = set(nltk.corpus.stopwords.words('english'))

cities = {}
Countries = {}
def read_cities():
    with open('cities.csv') as f:
        reader = csv.reader(f)
        # skip header
        next(reader)
        for row in reader:
            cities[row[0].lower()] = 1

def read_countries():
    with open('Countries.csv') as f:
        reader = csv.reader(f)
        # skip header
        next(reader)
        for row in reader:
            Countries[row[0].lower()] = 1
            
def words_filter(word):
    """
    Function to remove out all empty string.
    """
    if word == '':
        return False
    else:
        return True

def parse_words(word):
    """
    Function to remove . 's ' from the last parts of the words and " ( ) ,  from
    beginning or last part of the word.
    """    
    word = word.strip('\"').rstrip('.').rstrip('\'').strip(',').strip('(')\
                                                               .strip(')')\
                                                               .strip('?')\
                                                               .strip(':')
    if word.endswith('\'s'):
        word = word.rstrip('\'s')
    return word

def filter_training_set(string):
    """
    Function to remove out too many negatives from training set. The current
    filter accepts elements into training set only if at least one of the
    elements is capitalized.
    """
    for word in string.split(" "):
        if word == '':
            return False
        if word[0].isupper():
            return True
    return False

def label(string, names):
    """
    Just return positive or negative label.
    """
    if string in names:
        return 1
    return 0

def feature_is_all_capitals(string):
    """
    If all the words in the string are capitalized, return True.
    """
    words = string.split(" ")
    for word in words:
        if word == '' or word[0].islower():
            return 0
    return 1

def feature_possessive_form(string, content):
    """
    If the last word in the string end's with 's or ', return True.
    """
    last_word = string.strip(" ").split(" ")[-1]
    assert(last_word)
    match_obj = re.search(last_word + r"\'s?", content)
    if match_obj:
        return 1
    else:
        return 0

def feature_num_capitals(string):
    """
    Return the number of capital letters in the string.
    """
    count = 0
    for char in string:
        if char.isupper():
            count += 1
    return count

def feature_num_vowels(string):
    """
    Return the number of vowels in the string.
    """
    count = 0
    for char in string:
        if char.lower() in 'aeiou':
            count += 1
    return count

def feature_num_consonants(string):
    """
    Return the number of consonants in the string.
    """
    count = 0
    for char in string:
        if char.isalpha() and char.lower() not in 'aeiou':
            count += 1
    return count

def feature_num_characters(string):
    """
    Return the number of characters in the string.
    """
    return len(string)

def feature_ascii_sum(string):
    """
    Return the sum of ascii values of all the characters in the string.
    """
    ascii_sum = 0
    for char in string:
        ascii_sum += ord(char)
    return ascii_sum

def feature_number_present(string):
    """
    If the string has a digit, then return True.
    """
    for char in string:
        if char.isdigit():
            return 1
    return 0

def feature_is_noun(string):
    """
    If majority if the words in this string are tagged propernoun,
    then return True.
    nltk.help.upenn_tagset() tags that we are looking for
    NNP: noun, proper, singular
        Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
        Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
        Shannon A.K.C. Meltex Liverpool ...
    NNPS: noun, proper, plural
        Americans Americas Amharas Amityvilles Amusements Anarcho-Syndicalists
        Andalusians Andes Andruses Angels Animals Anthony Antilles Antiques
        Apache Apaches Apocrypha ...
    """
    tokens = nltk.word_tokenize(string)
    tagged_tokens = nltk.pos_tag(tokens)
    for entry in tagged_tokens:
        tag = entry[1]
        if tag not in ['NNP', 'NNPS']:
            return 0
    return 1

def feature_is_day_month(string):
    """
    If one of the words in the string contains days of the week, or months return true
    else return false.
    """
    days_of_week = ["monday", "tuesday", "wednesday", "thursday", "friday",
                    "saturday", "sunday"]
    months = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december']
    words = string.split(" ")
    for word in words:
        if word.lower() in days_of_week or word.lower() in months:
            return 1
    return 0

def feature_contains_stopwords(string):
    """
    If one of the words in the string contains stopwords, return true
    else return false.
    """
    words = string.split(" ")
    for word in words:
        if word.lower() in stopwords:
            return 1
    return 0

def feature_contains_city(string):
    """
    If one of the words in the string is acity return true
    else return false
    """
    words = string.split(" ")
    for word in words:
        if cities.get(word.lower()):
            return 1
    return 0

def feature_contains_Countries(string):
    """
    If one of the words in the string is a Country return true
    else return false
    """
    words = string.split(" ")
    for word in words:
        if Countries.get(word.lower()):
            return 1
    return 0

def feature_begins_sentence(string, content):
    """
    If the first word occurs first in the sentence, the return true
    else return false
    """
    first_word = string.strip(" ").split(" ")[0]
    assert(first_word)
    match_obj = re.search( r"\. " + first_word, content)
    if match_obj:
        return 1
    else:
        return 0

def feature_contains_sports(string):
    """
    This feature is to identify common sports terms. It will return true if
    it is in one of these words
    """
    terms = ['city', 'football', 'league', 'arsenal', 'united', 'premier',
            'england', 'poland', 'champion', 'club', 'park', 'fc', 'director',
            'manager', 'people', 'aston', 'villa', 'brom', 'referee', 'defend',
            'midfield', 'win', 'anfield', 'devils', 'conference', 'olympic',
            'international', 'canada', 'tel', 'france', 'derby', 'world', 
            'player', 'milan', 'sport', 'fifa', 'uefa']
    
    for var in terms:
        if var in string.lower():
            return 1
    return 0


def feature_is_noun_v2(string):
    """
    Checks if a string is a noun using the context based dictionary
    """
    tokens = nltk.word_tokenize(string.lower())
    count = 0
    for token in tokens:
        pos_tag = global_pos_tag_vectors.get(token)
        if pos_tag == None:
            return 1
        if pos_tag[0] >= pos_tag[1]:
            count += 1
    if count > len(tokens)/2:
        return 1
    return 0


def feature_is_verb_v2(string):
    """
    Checks if a string is a verb using the context based dictionary
    """
    tokens = nltk.word_tokenize(string.lower())
    count = 0
    for token in tokens:
        pos_tag = global_pos_tag_vectors.get(token)
        if pos_tag == None:
            return 0
        if pos_tag[0] < pos_tag[1]:
            count += 1
    if count > len(tokens)/2:
        return 1
    return 0


# {word: [noun_hits, verb_hits]}
global_pos_tag_vectors = defaultdict(lambda: [0, 0])

def get_pos_tags(text):
    """
        this gets a pos tag list on the untouched text,
        so that nltk can use context to better predict
        pos tags.
        since a single word may be tagged differently in
        different contexts, we remember each instance in 
        a global dictionary and then take a popular
        vote to decide the pos tag.
    """

    # nltk.tokenize breaks when it faces stuff like £120m
    text = unicode(text, errors='ignore')
    words = nltk.word_tokenize(text)
    pos_tagged_words = nltk.pos_tag(words)
    for word, pos_tag in pos_tagged_words:
        if pos_tag in ['RB', 'RBR', 'RBS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WRB']:
            global_pos_tag_vectors[word.lower()][1] += 1
        elif pos_tag in ['NNP', 'NNPS']:
            global_pos_tag_vectors[word.lower()][0] += 1


def get_feature_vectors(files):
    data = []
    read_cities()
    read_countries()
    count_names = 0
    for filename in files:

        # Read the whole marked file
        with open('../Tagged_Dataset/' + filename) as f:
            marked_contents = f.read()

        # Extract all the full names that are tagged in the file
        full_names = re.findall(r'<name>(.*?)</name>', marked_contents)
        
        # Extract all first names that tagged in the file
        first_names = re.findall(r'<fname>(.*?)</fname>', marked_contents)
        
        # Extract all the last names that are tagged in the file
        last_names = re.findall(r'<lname>(.*?)</lname>', marked_contents)

        for name in full_names:
            name_parts = name.split(' ')
            first_names.append(name_parts[0])
            last_names.append(name_parts[-1])
            if len(name_parts) == 3:
                last_names.append(" ".join(name_parts[1:]))

        file_names = set(full_names) | set(first_names) | set(last_names)
        count_names = count_names + len(file_names)
        
        # Read the file from the original dataset
        with open('../Original_Dataset/' + filename) as f:
            original_contents = f.read()

        # get pos tags on the untouched data
        get_pos_tags(original_contents)
        
        # Extract all the words and do any filters or parsing before generating
        # training set
        words = re.split(' |\n', original_contents)
        filtered_words = filter(words_filter, words)
        parsed_words = [parse_words(var) for var in filtered_words]

        # Add words of length one into training set
        training_set = []
        for i in xrange(len(parsed_words)):
            training_set.append(parsed_words[i])

        # Add words of length two into training set
        for i in xrange(len(parsed_words)-1):
            training_set.append(parsed_words[i] + " " + parsed_words[i+1])

        # Add words of length three into training set
        for i in xrange(len(parsed_words)-2):
            training_set.append(parsed_words[i] + " " + parsed_words[i+1] + " "
                                + parsed_words[i+2])

        filtered_training_set = filter(filter_training_set, training_set)

        # Construct feature vectors for the data
        for string in filtered_training_set:
            feature1 = feature_possessive_form(string, original_contents)
            feature2 = feature_num_capitals(string)
            feature3 = feature_num_vowels(string)
            feature4 = feature_num_consonants(string)
            feature5 = feature_num_characters(string)
            feature6 = feature_ascii_sum(string)
            feature7 = feature_number_present(string)
            feature8 = feature_is_noun(string)
            feature9 = feature_is_day_month(string)
            feature10 = feature_contains_stopwords(string)
            feature11 = feature_contains_city(string)
            feature12 = feature_begins_sentence(string, original_contents)
            feature13 = feature_contains_sports(string)
            feature14 = feature_is_noun_v2(string)
            feature15 = feature_is_verb_v2(string)
            feature16 = feature_contains_Countries(string)
            data.append((string, label(string, file_names),
                         feature1, feature2, feature3, feature4,
                         feature5, feature6, feature7, feature8,
                         feature9, feature10, feature11, feature12,
                         feature13, feature14, feature15, feature16
                        ))
    print "Length of data with features: " + str(len(data))
    print "Number of names marked up: " +str(count_names)
    return data

if __name__=="__main__":
    # Randomly split the 300 files into Set I (Development Set) and Set J (Test Set)
    input_files = [ str(i).zfill(3) + '.txt' for i in xrange(1, 301) ]
    
    # Using the constant seed so that we don't mix up Set I and J in subsequent iterations
    seed = 4
    random.Random(seed).shuffle(input_files)
    dev_set = input_files[:200]
    test_set = input_files[200:]
    assert(set(dev_set).union(set(test_set)) == set(input_files))
    print "Length of Dev set: " + str(len(dev_set))
    print "Length of Test set:" + str(len(test_set))
    
    # Generate feature vector for dev and test set
    print "Dev Set :"
    training_data = get_feature_vectors(dev_set)
    print "Test Set :"
    test_data = get_feature_vectors(test_set)

Length of Dev set: 200
Length of Test set:100
Dev Set :
Length of data with features: 57916
Number of names marked up: 3462
Test Set :
Length of data with features: 26064
Number of names marked up: 1567


In [2]:
# Importing the required packages and defining all functions needed to do the training

import json
import sys
from pandas.io.json import json_normalize
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
import pandas as pd
import numpy as np


# Function to perform training with Decision Tree - giniIndex.
def train_using_gini(X_train, y_train):
    # Creating the classifier object
    clf_gini = DecisionTreeClassifier(criterion="gini", random_state=100)

    # Performing training
    clf_gini.fit(X_train, y_train)
    return clf_gini

# Function to perform training with giniIndex.
def train_using_logistic_regression(X_train, y_train):
    # Creating the classifier object
    lr_model = LogisticRegression(fit_intercept=True)
    
    # Performing training
    lr_model.fit(X_train, y_train)
    return lr_model

# Function to perform training with svm LinearSVC.
def train_using_svm_SVC(X_train, y_train):
    # Creating the classifier object
    svm_model = svm.SVC(C=5)
    
    # Performing training
    svm_model.fit(X_train, y_train)
    return svm_model

# Function to perform training with Decision Tree - entropy.
def train_using_entropy(X_train, y_train):
    # Decision tree with entropy
    clf_entropy = DecisionTreeClassifier(criterion="entropy", random_state=100)

    # Performing training
    clf_entropy.fit(X_train, y_train)
    return clf_entropy

# Function to perform training random forest
def train_using_random_forest(X_train, y_train):
    # Creating the classifier object
    rf_model = RandomForestClassifier(n_estimators = 1000, random_state = 100)
    
    # Performing training
    rf_model.fit(X_train, y_train)
    return rf_model

# Function to perform training using Gaussian Naive Bayes
def train_using_gaussian_nb(X_train, y_train):
    # Creating the classifier object
    gaussian_nb_model = GaussianNB()
    
    # Performing training
    gaussian_nb_model.fit(X_train, y_train)
    return gaussian_nb_model
    
# Function to make predictions
def prediction(X_test, clf_object):
    y_pred = clf_object.predict(X_test)
    return y_pred

# Define custom function to verify P and R
def cal_P_R(y_test, y_pred):
    pos_test = np.sum(y_test == 1)
    neg_test = np.sum(y_test == 0)

    TP = np.sum(np.logical_and(y_pred == 1, y_test == 1))

    # True Negative (TN): we predict a label of 0 (negative), and the true label is 0.
    TN = np.sum(np.logical_and(y_pred == 0, y_test == 0))

    # False Positive (FP): we predict a label of 1 (positive), but the true label is 0.
    FP = np.sum(np.logical_and(y_pred == 1, y_test == 0))

    # False Negative (FN): we predict a label of 0 (negative), but the true label is 1.
    FN = np.sum(np.logical_and(y_pred == 0, y_test == 1))
    
    P = (TP * 1.0) / (TP + FP)
    R = (TP * 1.0) / (TP + FN)

    print("P : ", P)
    print("R : ", R)
    print("TP : ", TP)
    print("FP : ", FP)
    print("TN : ", TN)
    print("FN : ", FN)

    
# Function to calculate accuracy
def cal_accuracy(y_test, y_pred):
    print("Confusion Matrix: ",
          confusion_matrix(y_test, y_pred))

    print("Accuracy : ",
          accuracy_score(y_test, y_pred) * 100)

    print("Report : ",
          classification_report(y_test, y_pred))

In [3]:
# First use the training set and train your model
df_train = pd.DataFrame(training_data)
df_train_original = df_train.copy()

# Seperate labels
train_labels = df_train.iloc[:,1]

# Seperate train
df_train = df_train.iloc[:,2:]

# Split the dev set into train-test
df_train, df_test, train_labels, test_labels = train_test_split(
        df_train,train_labels, test_size=0.3, random_state=100)

# Train the models using the split
clf_gini = train_using_gini(df_train.values, train_labels)
clf_entropy = train_using_entropy(df_train.values, train_labels)
lr_model = train_using_logistic_regression(df_train.values, train_labels)
svm_model = train_using_svm_SVC(df_train.values, train_labels)
rf_model = train_using_random_forest(df_train.values, train_labels)
gaussian_nb_model = train_using_gaussian_nb(df_train.values, train_labels)

# Predict using the trained models in the dev set
print("Results Using Gini Index:")
# Prediction using gini
y_pred_gini = prediction(df_test.values, clf_gini)
cal_accuracy(test_labels, y_pred_gini)
cal_P_R(test_labels, y_pred_gini)

print("Results Using Entropy:")
# Prediction using entropy
y_pred_entropy = prediction(df_test.values, clf_entropy)
cal_accuracy(test_labels, y_pred_entropy)
cal_P_R(test_labels, y_pred_entropy)

print("Results Using Logistic Regression:")
# Prediction using Logistic Regression
y_pred_lr = prediction(df_test.values,lr_model)
cal_accuracy(test_labels, y_pred_lr)
cal_P_R(test_labels, y_pred_lr)

print("Results Using SVM:")
# Prediction Using SVM
y_pred_svm_SVC = prediction(df_test.values,svm_model)
cal_accuracy(test_labels, y_pred_svm_SVC)
cal_P_R(test_labels, y_pred_svm_SVC)

print("Results Using Random Forests:")
# Prediction Using Random Forests
y_pred_rf = prediction(df_test.values,rf_model)
cal_accuracy(test_labels, y_pred_rf)
cal_P_R(test_labels, y_pred_rf)

print("Results Using Gaussian NB:")
# Prediction using Gaussian NB
y_pred_gaussian_nb = prediction(df_test.values,gaussian_nb_model)
cal_accuracy(test_labels, y_pred_gaussian_nb)
cal_P_R(test_labels, y_pred_gaussian_nb)



Results Using Gini Index:
('Confusion Matrix: ', array([[15612,   212],
       [  205,  1346]]))
('Accuracy : ', 97.6)
('Report : ', u'              precision    recall  f1-score   support\n\n           0       0.99      0.99      0.99     15824\n           1       0.86      0.87      0.87      1551\n\n   micro avg       0.98      0.98      0.98     17375\n   macro avg       0.93      0.93      0.93     17375\nweighted avg       0.98      0.98      0.98     17375\n')
('P : ', 0.8639281129653402)
('R : ', 0.8678272082527402)
('TP : ', 1346)
('FP : ', 212)
('TN : ', 15612)
('FN : ', 205)
Results Using Entropy:
('Confusion Matrix: ', array([[15603,   221],
       [  212,  1339]]))
('Accuracy : ', 97.50791366906475)
('Report : ', u'              precision    recall  f1-score   support\n\n           0       0.99      0.99      0.99     15824\n           1       0.86      0.86      0.86      1551\n\n   micro avg       0.98      0.98      0.98     17375\n   macro avg       0.92      0.92     

In [4]:
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score

# Do a 3 fold Cross Validation across all the models to chose the best
num_folds = 3
models = [clf_gini, clf_entropy, lr_model, rf_model, gaussian_nb_model, svm_model]
for classifier in models:
    print "-" * 30 
    print "Printing Average Stats for: " + str(type(classifier).__name__)
    scores = cross_val_score(classifier, df_train, train_labels, cv=num_folds, scoring='precision')
    P = np.mean(scores)
    print "Average Precision: " + str(P * 100)
    scores = cross_val_score(classifier, df_train, train_labels, cv=num_folds, scoring='recall')
    R = np.mean(scores)
    print "Average Recall: " + str(R * 100)
    F1 = 2*(P*R)/(P+R)
    print "F1 Score: "+ str(F1)

------------------------------
Printing Average Stats for: DecisionTreeClassifier
Average Precision: 83.89179398960799
Average Recall: 84.0877914951989
F1 Score: 0.8398967839808569
------------------------------
Printing Average Stats for: DecisionTreeClassifier
Average Precision: 84.66866867869915
Average Recall: 84.33470507544581
F1 Score: 0.845013569082505
------------------------------
Printing Average Stats for: LogisticRegression
Average Precision: 72.86023894743282
Average Recall: 47.59945130315501
F1 Score: 0.5758121058588133
------------------------------
Printing Average Stats for: RandomForestClassifier
Average Precision: 85.5730061199918
Average Recall: 85.37722908093279
F1 Score: 0.8547500549576418
------------------------------
Printing Average Stats for: GaussianNB
Average Precision: 38.46737597361036
Average Recall: 98.49108367626886
F1 Score: 0.5532617051197529
------------------------------
Printing Average Stats for: SVC
Average Precision: 80.57914134284374
Average R

In [5]:
# Since Random Forests are giving the best performance, we will pick this.
# We are going to do some rule-based post processing

In [6]:
# Calculate P,R and F1
words = pd.DataFrame(df_train_original[0])
words.columns = ['string']
test_labels = pd.DataFrame(test_labels)
test_labels.columns = ['actual_label']
df_test.columns = [
                   #'allCaps',
                   'nextPoss', 'numCap', 'numVow' , 'numCons',
                   'numChar', 'sumAscii', 'numberPres', 'isNoun',
                   'isDay', 'isArticle', 'isCity', 'isStart', 
                  'containsSports', 'isNoun_v2', 'isVerb_v2', 'isCountry'
                  ]

debug_df = df_test.join(test_labels).join(words)
debug_df['predicted_label'] = y_pred_rf

# Compute statistics
# false positives
false_pos = debug_df[(debug_df['predicted_label'] == 1) & (debug_df['actual_label'] == 0)]
# false negatives
false_neg = debug_df[(debug_df['predicted_label'] == 0) & (debug_df['actual_label'] == 1)]
# true positive
true_pos = debug_df[(debug_df['predicted_label'] == 1) & (debug_df['actual_label'] == 1)]
# true negative
true_neg = debug_df[(debug_df['predicted_label'] == 0) & (debug_df['actual_label'] == 0)]

# Precision
print "Precision before post processing: " + str(100.0*true_pos.shape[0]/(true_pos.shape[0]+false_pos.shape[0]))

# Recall
print "Recall before post processing: " + str(100.0*true_pos.shape[0]/(true_pos.shape[0]+false_neg.shape[0]))

#F1 Score
P = true_pos.shape[0]*1.0/(true_pos.shape[0]+false_pos.shape[0])
R = true_pos.shape[0]*1.0/(true_pos.shape[0]+false_neg.shape[0])
F1 = 2*(P*R)/(P+R)
print "F1 Score before post processing " + str(F1)

Precision before post processing: 88.4540117417
Recall before post processing: 87.4274661509
F1 Score before post processing 0.879377431907


In [7]:
# Our post processing rule is to ignore examples which have football related terms like 
# countries, football clubs,tournaments
football_terms = []
def read_football():
    with open('football.csv') as f:
        reader = csv.reader(f)

        for row in reader:
            football_terms.append(row[0].lower())

# Read all the terms
read_football()

# Perform post processing step
for index, row in debug_df.iterrows():
    if row['predicted_label'] == 1:
        for entry in football_terms:
            if entry in row['string'].lower():
                debug_df.loc[index, 'predicted_label'] = 0

# Recompute all the statistics
# false positives
false_pos = debug_df[(debug_df['predicted_label'] == 1) & (debug_df['actual_label'] == 0)]
# false negatives
false_neg = debug_df[(debug_df['predicted_label'] == 0) & (debug_df['actual_label'] == 1)]
# true positive
true_pos = debug_df[(debug_df['predicted_label'] == 1) & (debug_df['actual_label'] == 1)]
# true negative
true_neg = debug_df[(debug_df['predicted_label'] == 0) & (debug_df['actual_label'] == 0)]

# Precision
print "Precision after post processing: " + str(100.0*true_pos.shape[0]/(true_pos.shape[0]+false_pos.shape[0]))

# Recall
print "Recall after post processing: " + str(100.0*true_pos.shape[0]/(true_pos.shape[0]+false_neg.shape[0]))

#F1 Score
P = true_pos.shape[0]*1.0/(true_pos.shape[0]+false_pos.shape[0])
R = true_pos.shape[0]*1.0/(true_pos.shape[0]+false_neg.shape[0])
F1 = 2*(P*R)/(P+R)
print "F1 Score before post processing " + str(F1)

Precision after post processing: 93.285198556
Recall after post processing: 83.3010960671
F1 Score before post processing 0.880108991826


In [8]:
# Now that the precision and recall are meeting the required assignment constriants we can apply it on the 
# test set which has been untouched so far.

In [9]:
# Applying Model on Test Set J
df_final_test = pd.DataFrame(test_data)
df_final_test_original = df_final_test.copy()

#S eperate labels
test_labels = df_final_test.iloc[:,1]

# Separate data
df_final_test = df_final_test.iloc[:,2:]

# Perform preficions
y_final_pred_rf = prediction(df_final_test.values,rf_model)

# Append the predicted values to original Data Frame to perform post processing
words = pd.DataFrame(df_final_test_original[0])
words.columns = ['string']
test_labels = pd.DataFrame(test_labels)
test_labels.columns = ['actual_label']
df_final_test.columns = [
                   'nextPoss', 'numCap', 'numVow' , 'numCons',
                   'numChar', 'sumAscii', 'numberPres', 'isNoun',
                   'isDay', 'isArticle', 'isCity', 'isStart', 'containsSports',
                   'isNoun_v2', 'isVerb_v2', 'isCountry'
                  ]

debug_df_final = df_final_test.join(test_labels).join(words)

debug_df_final['predicted_label'] = y_final_pred_rf

# Perform the same post processing step as above  
for index, row in debug_df_final.iterrows():
    if row['predicted_label'] == 1:
        for entry in football_terms:
            if entry in row['string'].lower():
                debug_df_final.loc[index, 'predicted_label'] = 0
                
# Compute Final statistics on Test Set J
# false positives
false_pos = debug_df_final[(debug_df_final['predicted_label'] == 1) & (debug_df_final['actual_label'] == 0)]
# false negatives
false_neg = debug_df_final[(debug_df_final['predicted_label'] == 0) & (debug_df_final['actual_label'] == 1)]
# true positive
true_pos = debug_df_final[(debug_df_final['predicted_label'] == 1) & (debug_df_final['actual_label'] == 1)]
# true negative
true_neg = debug_df_final[(debug_df_final['predicted_label'] == 0) & (debug_df_final['actual_label'] == 0)]

# Precision
print "Final Precision on Test Set J: " + str(100.0*true_pos.shape[0]/(true_pos.shape[0]+false_pos.shape[0]))

# Recall
print "Final Recall on Test Set J: " + str(100.0*true_pos.shape[0]/(true_pos.shape[0]+false_neg.shape[0]))

#F1 Score
P = true_pos.shape[0]*1.0/(true_pos.shape[0]+false_pos.shape[0])
R = true_pos.shape[0]*1.0/(true_pos.shape[0]+false_neg.shape[0])
F1 = 2*(P*R)/(P+R)
print "F1 Score before post processing " + str(F1)

Final Precision on Test Set J: 91.5966386555
Final Recall on Test Set J: 81.8864774624
F1 Score before post processing 0.864698104892


In [10]:
# These results meet the expected Project requirements of at least 90% Precision and 60% Recall