In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.feature_extraction import text
import nltk
import re
import math
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
from sklearn.model_selection import KFold
from sklearn.metrics import balanced_accuracy_score, accuracy_score, precision_score, recall_score

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/christinefang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/christinefang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [102]:
df = pd.read_csv('../../../sample_events.csv')
##satp = satp[["description", "relevant_event"]]
df = df[["description", "relevant_event", "internal_conflict", "multiple_events", "naxalite_maoist"]]
##df = df.replace()
df = df.drop_duplicates(subset=["description"]).dropna().reset_index(drop=True)
df["relevant_event"] = df["relevant_event"].replace(r'[2-9]{1,4}', np.nan, regex=True).fillna(0)
df["internal_conflict"] = df["internal_conflict"].replace(r'[2-9]{1,4}', np.nan, regex=True).fillna(0)
df["multiple_events"] = df["multiple_events"].replace(r'[2-9]{1,4}', np.nan, regex=True).fillna(0)
df["naxalite_maoist"] = df["naxalite_maoist"].replace(r'[2-9]{1,4}', np.nan, regex=True).fillna(0)

df.index = range(len(df))
df.head(10)

Unnamed: 0,description,relevant_event,internal_conflict,multiple_events,naxalite_maoist
0,Nagaland Post reports that a National Socialis...,1.0,1.0,0.0,0.0
1,"On February 22, Minister of State for Home, S....",0.0,0.0,0.0,0.0
2,Kanglaonline reports on June 21 that a militan...,1.0,1.0,0.0,0.0
3,Two security personnel were seriously injured ...,1.0,1.0,0.0,0.0
4,The Union Ministry of Home Affairs (UMHA) has ...,1.0,1.0,0.0,1.0
5,A woman Communist Party of India-Maoist (CPI-M...,1.0,1.0,0.0,1.0
6,Militants lobbed a hand grenade targeting comp...,1.0,1.0,0.0,0.0
7,Students under Kangleipak Students' Associatio...,1.0,1.0,0.0,0.0
8,"On May 5, an IED blast was reported in front o...",1.0,1.0,0.0,0.0
9,11 civilians were injured as unidentified terr...,1.0,1.0,0.0,0.0


# Cleaning and Tokenizing

In [103]:
data_stripped = []
data_tokenized_words = []

wl = WordNetLemmatizer()

for i in range(len(df)):
    # removes unnecessary characters
    row = re.sub('[^a-zA-Z0-9. ]', '', df['description'][i])
    row = row.lower()
    # tokenization
    row = row.split()
    # lemmatization and stop words removal
    row = [wl.lemmatize(word) for word in row if not word in set(stopwords.words('english'))]
    row2 = re.sub('[^a-zA-Z0-9 ]', '', ' '.join(row))
    row2 = row2.split(' ')
    row3 = ' '.join(row).split('.')
    
    
    data_stripped.append(' '.join(row))
    data_tokenized_words.append(row2)


df['stripped_description'] = data_stripped
df['word_tokenized_description'] = data_tokenized_words

In [104]:
# Define function for cross-validation
def cross_validation(folds, Y, accuracy_metric, a):
    # Split dataset into folds
    kf = KFold(n_splits=folds, shuffle=True)
    accuracy = []
    # SVM model
    nb = MultinomialNB(alpha = a)
    vectorizer = text.TfidfVectorizer(analyzer=lambda x: x,)
    X = vectorizer.fit_transform(df["word_tokenized_description"])

    # Cross-validation
    for train_idx, valid_idx in kf.split(X):
        X_train, X_valid = X[train_idx, :], X[valid_idx, :]
        Y_train, Y_valid = Y[train_idx], Y[valid_idx]
        nb.fit(X_train,Y_train)
        predictions = nb.predict(X_valid)
        accuracy.append(accuracy_metric(predictions, Y_valid))
    return np.mean(accuracy)

In [95]:
def precision_score_zero_div(predictions, Y_valid):
    return precision_score(predictions, Y_valid, zero_division=np.nan)

def recall_score_zero_div(predictions, Y_valid):
    return recall_score(predictions, Y_valid, zero_division=np.nan)

def accuracies(Y, alpha):
        accuracy_cv = pd.DataFrame(columns = ['folds', 'validation_acc', 'balanced_acc', 'precision', 'recall'])

        metrics = [accuracy_score, balanced_accuracy_score, precision_score_zero_div, recall_score_zero_div]

        for folds in range(5,21,5):
                accuracies = [folds]
                for i in range(len(metrics)):
                        mean_accuracy = cross_validation(folds, Y, metrics[i], alpha)
                        accuracies.append(mean_accuracy)
                accuracy_cv = pd.concat([pd.DataFrame([accuracies], columns=accuracy_cv.columns), accuracy_cv], ignore_index=True)

        accuracy_cv['folds'] = accuracy_cv['folds'].astype('int')
        accuracy_cv.set_index('folds')
        return accuracy_cv

In [105]:
accuracies(df["internal_conflict"], 1)
accuracies(df["multiple_events"], 1)
accuracies(df["naxalite_maoist"], 1)
accuracies(df["relevant_event"], 1)


  accuracy_cv = pd.concat([pd.DataFrame([accuracies], columns=accuracy_cv.columns), accuracy_cv], ignore_index=True)
  accuracy_cv = pd.concat([pd.DataFrame([accuracies], columns=accuracy_cv.columns), accuracy_cv], ignore_index=True)
  accuracy_cv = pd.concat([pd.DataFrame([accuracies], columns=accuracy_cv.columns), accuracy_cv], ignore_index=True)
  accuracy_cv = pd.concat([pd.DataFrame([accuracies], columns=accuracy_cv.columns), accuracy_cv], ignore_index=True)


Unnamed: 0,folds,validation_acc,balanced_acc,precision,recall
0,20,0.913859,0.914221,1.0,0.914221
1,15,0.913978,0.913978,1.0,0.913978
2,10,0.91383,0.9142,1.0,0.913969
3,5,0.913978,0.913978,1.0,0.913978


# Hyperparameter (alpha) tuning

In [112]:
def alpha_accuracies(col):
    accuracy = []
    balanced= []
    recall = []
    precision =  []

    metrics = [accuracy_score, balanced_accuracy_score, precision_score_zero_div, recall_score_zero_div]

    for i in [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100,1000]:
        accuracies = []
        for j in range(len(metrics)):
            accuracies.append(cross_validation(5, col, metrics[j], i))
        accuracy.append(accuracies[0])
        balanced.append(accuracies[1])
        recall.append(accuracies[2])
        precision.append(accuracies[3])

    data = {'alpha_param': [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100,1000], 
            'accuracy_score': accuracy,
            'balanced_accuracy': balanced,
            'precision': precision,
            'recall': recall} 

    return data

"UserWarning: y_pred contains classes not in y_true" means predicted labels did not contain 0

In [113]:
data = alpha_accuracies(df["relevant_event"])
pd.DataFrame(data)



Unnamed: 0,alpha_param,accuracy_score,balanced_accuracy,precision,recall
0,1e-05,0.911828,0.660499,0.923456,0.988398
1,0.0001,0.909677,0.723485,0.934833,0.976781
2,0.001,0.905376,0.640664,0.944324,0.940806
3,0.1,0.909677,0.705862,0.929887,0.967521
4,1.0,0.913978,0.913978,0.913978,1.0
5,10.0,0.913978,0.913978,0.913978,1.0
6,100.0,0.913978,0.913978,0.913978,1.0
7,1000.0,0.913978,0.913978,0.913978,1.0


In [114]:
data = alpha_accuracies(df["internal_conflict"])
pd.DataFrame(data)



Unnamed: 0,alpha_param,accuracy_score,balanced_accuracy,precision,recall
0,1e-05,0.877419,0.685284,0.898497,0.975901
1,0.0001,0.877419,0.759234,0.901251,0.963215
2,0.001,0.870968,0.716978,0.914906,0.934514
3,0.1,0.886022,0.690019,0.913545,0.958472
4,1.0,0.883871,0.883871,0.883871,1.0
5,10.0,0.883871,0.883871,0.883871,1.0
6,100.0,0.883871,0.883871,0.883871,1.0
7,1000.0,0.883871,0.883871,0.883871,1.0


In [101]:
data = alpha_accuracies(df["naxalite_maoist"])
pd.DataFrame(data)



Unnamed: 0,alpha_param,accuracy_score,balanced_accuracy,precision,recall
0,1e-05,0.890323,0.878797,0.84408,0.690985
1,0.0001,0.883871,0.86906,0.807265,0.705998
2,0.001,0.883871,0.866566,0.792712,0.779193
3,0.1,0.898925,0.883742,0.843208,0.857237
4,1.0,0.746237,0.870952,0.933333,0.116948
5,10.0,0.722581,0.722581,,0.0
6,100.0,0.722581,0.722581,,0.0
7,1000.0,0.722581,0.722581,,0.0
