# <center> CSCI 6515 - Assignment - B00825788 </center>

In [1]:
#importing all the required libraries

import pandas as pd
import numpy as np
from zipfile import ZipFile
import os
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.metrics import f1_score
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

# Question 1,2)

In [0]:
#extracting zip files

def extractZipFiles(rootdir='./Data/', target_dir='./extracted_files/'):
    """Extract every ``.zip`` archive found directly inside ``rootdir``.

    Parameters
    ----------
    rootdir : str, default './Data/'
        Directory that is scanned (non-recursively) for ``.zip`` files.
    target_dir : str, default './extracted_files/'
        Directory that receives the contents of all archives. It is created
        by ``ZipFile.extractall`` if it does not exist yet.

    Raises
    ------
    FileNotFoundError
        If ``rootdir`` does not exist.
    """
    # sorted() makes the extraction order deterministic; os.listdir() order is arbitrary
    for entry in sorted(os.listdir(rootdir)):
        if not entry.endswith(".zip"):
            continue
        # `archive` instead of `zip` so the builtin zip() is not shadowed
        with ZipFile(os.path.join(rootdir, entry), 'r') as archive:
            archive.extractall(target_dir)

# Run the extraction once so the archive contents exist on disk for the loading step
extractZipFiles()

# Question 3)

In [5]:
# loading the dataframe with content extracted from xml files

def loadDataFrame(rootdir='./extracted_files/'):
    """Parse every ``.xml`` news item in ``rootdir`` into a single DataFrame.

    Parameters
    ----------
    rootdir : str, default './extracted_files/'
        Directory that is scanned (non-recursively) for ``.xml`` files.

    Returns
    -------
    df : pandas.DataFrame
        One row per XML file with the columns ``Headline``, ``Text``,
        ``bip:topics`` (the first topic code of the item, or ``""`` when the
        item has none), ``dc.date.published``, ``itemid`` and ``XMLfilename``.
    unique_bip_topics : set of str
        Every distinct topic code seen in any file.
    """
    columns = ['Headline', 'Text', 'bip:topics', 'dc.date.published', 'itemid', 'XMLfilename']
    # Rows are collected in a list and turned into a frame once at the end:
    # DataFrame.append copied the whole frame per row and no longer exists in pandas 2.x.
    records = []
    unique_bip_topics = set()

    for fileName in os.listdir(rootdir):
        if not fileName.endswith(".xml"):
            continue

        # `with` closes the file handle, which the original open(...).read() leaked
        with open(os.path.join(rootdir, fileName)) as xml_file:
            # NOTE(review): no parser is named, so BeautifulSoup uses whichever one is
            # installed; left unchanged to keep the parse results identical.
            file_contents = BeautifulSoup(xml_file.read())

        topic_blocks = file_contents.findAll(attrs={"class": "bip:topics:1.0"})
        code = ""
        if len(topic_blocks) > 0:
            topic_codes = topic_blocks[0].findAll("code")
            for val in topic_codes:
                unique_bip_topics.add(val['code'])
            # Only the first code becomes the label; guard against an empty topics block
            if len(topic_codes) > 0:
                code = topic_codes[0]['code']

        records.append({
            'Headline': file_contents.headline.text,
            'Text': file_contents.findAll("text")[0].text,
            'bip:topics': code,
            'dc.date.published': file_contents.findAll(attrs={"element": "dc.date.published"})[0]['value'],
            'itemid': file_contents.newsitem["itemid"],
            'XMLfilename': fileName,
        })

    df = pd.DataFrame(records, columns=columns)
    return df, unique_bip_topics
    

# Question 4)

In [6]:
# function to find all the possible values for bip:topics.

def getUniqueBipTopics():
    """Load the XML corpus and return ``(dataframe, set_of_topic_codes)``.

    Thin wrapper around ``loadDataFrame()`` that hands back its two results
    unchanged.
    """
    # loadDataFrame() yields the populated frame plus every distinct topic code
    frame, topic_codes = loadDataFrame()
    return frame, topic_codes

In [7]:
# Load the corpus once; keep the frame and the set of topic codes at notebook scope

output = getUniqueBipTopics()
df, unique_bip_topics = output
print(unique_bip_topics)

{'MCAT', 'CCAT', 'C16', 'C23', 'GPRO', 'G159', 'GWEA', 'E71', 'E11', 'GHEA', 'E143', 'GCAT', 'E51', 'GDIP', 'E411', 'GCRIM', 'C171', 'E511', 'C21', 'E21', 'C32', 'GENV', 'E41', 'E142', 'E512', 'C34', 'C331', 'E61', 'GTOUR', 'C183', 'E311', 'GSPO', 'G15', 'M13', 'M11', 'E131', 'E313', 'GPOL', 'C31', 'GODD', 'GOBIT', 'M142', 'G157', 'C312', 'E12', 'C33', 'GREL', 'GDIS', 'C17', 'GENT', 'C1511', 'G151', 'G155', 'E14', 'GDEF', 'C22', 'C311', 'GVOTE', 'M14', 'C12', 'G156', 'C11', 'GJOB', 'GWELF', 'C313', 'C174', 'C13', 'E121', 'C151', 'C411', 'M141', 'M131', 'C182', 'G153', 'C42', 'ECAT', 'E513', 'C173', 'M132', 'E141', 'G158', 'C41', 'E312', 'E13', 'C172', 'E31', 'GSCI', 'C181', 'E132', 'G152', 'GVIO', 'G154', 'GFAS', 'C18', 'E212', 'M143', 'C15', 'C152', 'C24', 'C14', 'M12', 'E211'}


In [8]:
# printing the number of unique bip topics (distinct codes collected while loading the XML files)

print(len(unique_bip_topics))

102


# Question 5)

In [3]:
# function to preprocess the text column of dataframe df

def textPreprocessing(param_df, rare_word_max_freq=5):
    """Clean the ``Text`` column of ``param_df`` and return a new DataFrame.

    Steps, in order:
      1. drop rows containing null values;
      2. lower-case the text and replace every non-letter with a space;
      3. keep only tokens that NLTK tags as nouns (NN, NNS, NNP, NNPS);
      4. remove English stop words and lemmatize the remaining words;
      5. remove words whose corpus-wide frequency is <= ``rare_word_max_freq``.

    Parameters
    ----------
    param_df : pandas.DataFrame
        Frame with a string column named ``Text``. It is not modified.
    rare_word_max_freq : int, default 5
        Words occurring this many times or fewer across all rows are dropped.

    Returns
    -------
    pandas.DataFrame
        Copy of the non-null rows of ``param_df`` with ``Text`` replaced by
        the cleaned text.
    """
    # .copy() so the caller's frame is never touched and pandas raises no
    # SettingWithCopyWarning on the assignment at the end
    cleaned = param_df.dropna().copy()

    # remove special characters and numbers
    text = cleaned['Text'].apply(lambda x: re.sub(r'[^a-z]', ' ', x.lower()))

    # remove everything except nouns
    accepted_tags = {'NN', 'NNS', 'NNP', 'NNPS'}
    text = text.apply(lambda x: " ".join(
        token for token, tag in nltk.pos_tag(word_tokenize(x)) if tag in accepted_tags))

    # remove stop words and lemmatize
    stopwords_list = set(stopwords.words("english"))
    lem = WordNetLemmatizer()
    text = text.apply(lambda x: " ".join(
        lem.lemmatize(word) for word in x.split() if word not in stopwords_list))

    # Remove rare words. The previous version read and overwrote the global `df`
    # instead of the frame being processed, and built its filter set from the
    # counts rather than from the words, so nothing was ever removed.
    word_counts = pd.Series(' '.join(text).split(), dtype=object).value_counts()
    rare_words = set(word_counts[word_counts <= rare_word_max_freq].index)
    text = text.apply(lambda x: " ".join(word for word in x.split() if word not in rare_words))

    cleaned['Text'] = text
    return cleaned

# Question 6)

In [4]:
# function to take the input dataframe and -
                  # call the preprocessing method to preprocess the text column of dataframe df 
                  # extract features from the text 
                  # and generate a new dataframe of features and labels

def extractFeatures(param):
    """Turn the raw news DataFrame into a numeric feature/label DataFrame.

    The ``Text`` column is cleaned with ``textPreprocessing`` and vectorized
    with TF-IDF (at most 1000 terms); the ``bip:topics`` column is
    label-encoded.

    Parameters
    ----------
    param : pandas.DataFrame
        Frame with at least the columns ``Text`` and ``bip:topics``.

    Returns
    -------
    pandas.DataFrame
        One column per TF-IDF term plus the integer label column
        ``bip:topics:encoded``.
    """
    preprocessed_df = textPreprocessing(param)

    # features
    # NOTE(review): the vectorizer is fitted on the full corpus, i.e. before the
    # train/test split made in getParameters(), so IDF weights see the test rows.
    vectorizer = TfidfVectorizer(max_features = 1000)
    tfidf_matrix = vectorizer.fit_transform(preprocessed_df.Text)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back for older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    model_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

    # labels (fit_transform returns a plain array, so rows are matched by position)
    label_encoder = preprocessing.LabelEncoder()
    model_df["bip:topics:encoded"] = label_encoder.fit_transform(preprocessed_df['bip:topics'].astype(str))

    return model_df

# Question 7)



### Reason behind the method used to divide the data into a training and test set:

* To validate the performance of the model on unseen data, I used both hold-out validation (a train-test split) and K-fold cross validation (with 5 folds). Both approaches split the entire data so that the test data is kept separate from the training data.


* Cross validation with 5 folds divides my whole data into 5 folds in which one fold is considered as test data and K-1 folds (4 folds) are considered as training data. This whole process is iterated 5 times in which, in each iteration, each fold gets to be taken as test data and remaining K-1 folds as training data. This way I could make sure that my models are not biased to some specific portion of data.


* Therefore, I initially checked the performance of the model on the training data and compared it with the performance on unseen (test) data. Even if the model performs well on that unseen data, there is still a chance that the result is biased by the particular split. To ensure the model is not biased, I performed cross validation with 5 folds at a later stage.


In [5]:
#splitting the data into training and testing data

def getParameters(new_df):
    """Split the feature/label frame into a 70/30 train/test partition.

    Every column except ``bip:topics:encoded`` is treated as a feature; that
    column is the target. The split uses a fixed ``random_state`` of 42.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``
    """
    label_column = 'bip:topics:encoded'
    feature_mask = new_df.columns != label_column
    features = new_df.loc[:, feature_mask]
    labels = new_df[label_column]
    return tuple(train_test_split(features, labels, test_size=0.3, random_state=42))
    

# Question 8)

In [6]:
# function to - 
#        - extract features from the dataframe, 
#        - get the required parameters to train a model and 
#        - return the trained model 

def getTrainedModel(model):
    """Fit ``model`` on the training split of the extracted features.

    Features are extracted from the notebook-level ``df`` only when no
    notebook-level ``new_df`` exists yet; otherwise the existing ``new_df`` is
    reused. Delete ``new_df`` (``del new_df``) to force a re-extraction after
    ``df`` changes.

    Parameters
    ----------
    model : estimator
        Any object exposing ``fit(X, y)``.

    Returns
    -------
    tuple
        ``(trained_model, X_test, y_test, new_df)`` where ``new_df`` is the
        feature/label frame that was split.
    """
    # The original probed a *local* `new_df` inside try/except; a local that is
    # assigned later in the function is always unbound at that point, so the
    # features were re-extracted on every call. Its bare `except` and the
    # `return` inside `finally` also hid any real error. Look the cached frame
    # up in the notebook namespace explicitly instead.
    features_df = globals().get('new_df')
    if features_df is None:
        # extracting features only the first time
        features_df = extractFeatures(df)

    X_train, X_test, y_train, y_test = getParameters(features_df)

    trained_model = model.fit(X_train, y_train)
    return trained_model, X_test, y_test, features_df

#### Naive Bayes 

In [29]:
# Naive Bayes algorithm: fit on the training split, then score on the held-out split

multinomialNB=MultinomialNB()
output = getTrainedModel(multinomialNB)
trained_model, X_test, y_test, new_df = output
y_pred = trained_model.predict(X_test)
# The micro-averaged F1 score is computed inline because evaluateClassifier() is
# only defined further down (Question 9); calling it here raised NameError when
# the notebook was run top to bottom on a fresh kernel.
print("Naive Baye's classifier's Accuracy:", f1_score(y_test, y_pred, average="micro"))

Naive Baye's classifier's Accuracy: 0.7058986047796657


#### Naive Bayes - cross validation

In [22]:
# Evaluation - 5-fold cross validation over the full feature/label frame

multinomialNB = MultinomialNB()
cv_nb = cross_val_score(
    multinomialNB,
    new_df.drop(columns='bip:topics:encoded'),
    new_df['bip:topics:encoded'],
    cv=5,
)
print("Cross validation score: ", cv_nb.mean())



Cross validation score:  0.7057379369196001


# Question 9)

### Evaluating the quality of your classifier:

* I have chosen Mean Squared Error (MSE) to evaluate the performance of the Linear Regression model. MSE is calculated by taking the average of the squared differences between the target values and the predictions. Because the differences are squared, positive and negative errors cannot cancel each other out, and large errors are penalized much more heavily than they are by Mean Absolute Error (MAE), which is why I preferred it over MAE.

* For the classification models - SVM, Decision Tree, Random Forest, and Neural Networks - I have chosen the F1 score to evaluate performance. The F1 score is the harmonic mean of recall and precision and is better at evaluating the incorrectly classified cases than the accuracy metric. Because a harmonic mean is dominated by the smaller of the two values, it penalizes these cases heavily whenever either precision or recall is poor. Hence, I chose the F1 score since it is better at evaluating False Negative cases compared to other metrics. Note that the code uses the micro-averaged F1 score, which for single-label multi-class predictions is numerically equal to accuracy.

In [7]:
# function to evaluate the classifier using F1 score

def evaluateClassifier(y_test, y_pred):
    """Return the micro-averaged F1 score of ``y_pred`` against ``y_test``."""
    score = f1_score(y_true=y_test, y_pred=y_pred, average="micro")
    return score

# Question 10)

## SVM 

In [8]:
# Linear-kernel SVM: fit on the training split, then score on the held-out split

SVM_clf = SVC(kernel='linear')
output = getTrainedModel(SVM_clf)
trained_model, X_test, y_test, new_df = output
y_pred = trained_model.predict(X_test)
print("SVM classifier's Accuracy:", evaluateClassifier(y_test, y_pred))

SVM classifier's Accuracy: 0.7425749412902335


#### SVM - cross validation

In [9]:
# Evaluation - 5-fold cross validation over the full feature/label frame

SVM_clf = SVC(kernel='linear')
cv_svm = cross_val_score(
    SVM_clf,
    new_df.drop(columns='bip:topics:encoded'),
    new_df['bip:topics:encoded'],
    cv=5,
)
print("Cross validation score: ", cv_svm.mean())



Cross validation score:  0.7282876782788512


#### SVM - Grid Search

In [10]:
# Grid Search over the SVM penalty parameter C (3-fold CV on the training split)

# getTrainedModel() only hands back the test split, so X_train / y_train were
# never defined at notebook scope. Recreate the split here; it is deterministic
# (fixed random_state), so X_test / y_test keep the same values.
X_train, X_test, y_train, y_test = getParameters(new_df)

parameter_space = {
    'C' : [10, 100, 1000]
}
svc = SVC(kernel='linear')
gs_svc = GridSearchCV(svc, parameter_space, n_jobs=-1, cv=3)
gs_svc.fit(X_train, y_train)
print('Best parameters found:\n', gs_svc.best_params_)



Best parameters found:
 {'C': 10}


In [12]:
# Mean cross-validated score of the best C found by the SVM grid search
gs_svc.best_score_

0.7492827496373486


## Decision Tree Classifier

In [22]:
# Decision tree: fit on the training split, then score on the held-out split

dt = DecisionTreeClassifier()
output = getTrainedModel(dt)
trained_model, X_test, y_test, new_df = output
y_pred = trained_model.predict(X_test)
print("Decision Tree classifier's Accuracy:", evaluateClassifier(y_test, y_pred))

Decision Tree classifier's Accuracy: 0.6301936195135396


#### Decision Tree - cross validation

In [23]:
# Evaluation - 5-fold cross validation over the full feature/label frame

cv_dt = cross_val_score(
    dt,
    new_df.drop(columns='bip:topics:encoded'),
    new_df['bip:topics:encoded'],
    cv=5,
)
print("Cross validation score: ", cv_dt.mean())



Cross validation score:  0.6118990136990028


#### Decision Tree - Grid Search

In [25]:
# getTrainedModel() only hands back the test split, so X_train / y_train were
# never defined at notebook scope. Recreate the split here; it is deterministic
# (fixed random_state), so X_test / y_test keep the same values.
X_train, X_test, y_train, y_test = getParameters(new_df)

param_grid = {
"criterion":['gini','entropy'],
"min_samples_leaf" : [1, 2, 4]
}
# Create a base model (named for what it is; it was previously called `rf`)
tree_clf = DecisionTreeClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = tree_clf, param_grid = param_grid, cv = 3, n_jobs = 2, verbose = 2)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  18 out of  18 | elapsed:  2.7min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=2,
       param_grid={'criterion': ['gini', 'entropy'], 'min_samples_leaf': [1, 2, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [26]:
# Parameter combination selected by the decision-tree grid search
grid_search.best_params_

{'criterion': 'gini', 'min_samples_leaf': 1}

In [27]:
# Mean cross-validated score of that best decision-tree parameter combination
grid_search.best_score_

0.615970704624653

## Random Forest Classifier

In [17]:
# Random forest (100 trees): fit on the training split, then score on the held-out split

clf = RandomForestClassifier(n_estimators=100)
output = getTrainedModel(clf)
trained_model, X_test, y_test, new_df = output
y_pred = trained_model.predict(X_test)
print("Random Forest classifier's Accuracy:", evaluateClassifier(y_test, y_pred))

Random Forest classifier's Accuracy: 0.739268242265555


#### Random Forest - cross validation

In [18]:
# Evaluation - 5-fold cross validation over the full feature/label frame

cv_rf = cross_val_score(
    clf,
    new_df.drop(columns='bip:topics:encoded'),
    new_df['bip:topics:encoded'],
    cv=5,
)
print("Cross validation score: ", cv_rf.mean())



Cross validation score:  0.7335551261914275


#### Random Forest - Grid Search

In [19]:
# getTrainedModel() only hands back the test split, so X_train / y_train were
# never defined at notebook scope. Recreate the split here; it is deterministic
# (fixed random_state), so X_test / y_test keep the same values.
X_train, X_test, y_train, y_test = getParameters(new_df)

param_grid = {
    'n_estimators':[70, 150],
    'random_state':[42]
}
# Create a base model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = 2, verbose = 2)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   6 out of   6 | elapsed:  2.0min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=2,
       param_grid={'n_estimators': [70, 150], 'random_state': [42]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [20]:
# Parameter combination selected by the random-forest grid search
grid_search.best_params_

{'n_estimators': 150, 'random_state': 42}

In [21]:
# Mean cross-validated score of that best random-forest parameter combination
grid_search.best_score_

0.7340086232354852

## Neural Networks

In [23]:
# Neural network (MLP, hidden layers of 5 and 2 units): fit on the training split, score on the held-out split

Mlp_clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
output = getTrainedModel(Mlp_clf)
trained_model, X_test, y_test, new_df = output
y_pred = trained_model.predict(X_test)
print("Neural networks classifier's Accuracy:", evaluateClassifier(y_test, y_pred))

Neural networks classifier's Accuracy: 0.4446056085094626


#### Neural Networks - cross validation

In [24]:
# Evaluation - 5-fold cross validation over the full feature/label frame

cv_nn = cross_val_score(
    Mlp_clf,
    new_df.drop(columns='bip:topics:encoded'),
    new_df['bip:topics:encoded'],
    cv=5,
)
print("Cross validation score: ", cv_nn.mean())



Cross validation score:  0.4486571870075339


#### Neural Networks - Grid Search

In [29]:
# getTrainedModel() only hands back the test split, so X_train / y_train were
# never defined at notebook scope. Recreate the split here; it is deterministic
# (fixed random_state), so X_test / y_test keep the same values.
X_train, X_test, y_train, y_test = getParameters(new_df)

# Single hidden layer of 5..9 units, adam solver, fixed L2 penalty
parameter_space = {
    'hidden_layer_sizes': np.arange(5, 10),
    'solver': ['adam'],
    'alpha': [0.2]
}
mlp = MLPClassifier()
# NOTE: this rebinds `clf`, which held the random forest classifier in the earlier cells
clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3)
clf.fit(X_train, y_train)
print('Best parameters found:\n', clf.best_params_)



Best parameters found:
 {'alpha': 0.2, 'hidden_layer_sizes': 9, 'solver': 'adam'}




In [30]:
# Mean cross-validated score of the best MLP configuration (`clf` is the grid search here)
clf.best_score_

0.7312531454453951

## Linear Regression

In [31]:
# Linear Regression: fit on the training split and report MSE on the held-out split
# NOTE(review): the target is the label-encoded topic id, so the regression treats class ids as ordered numbers

regressor = LinearRegression()
output = getTrainedModel(regressor)
trained_model, X_test, y_test, new_df = output
y_pred = trained_model.predict(X_test)
print("Linear Regressor's Mean Squared Error:", metrics.mean_squared_error(y_test, y_pred))

Linear Regressor's Mean Squared Error: 188.4527022608793


#### Linear Regression - cross validation

In [26]:
# Evaluation - 5-fold cross validation over the full feature/label frame
# (no `scoring` is passed, so the regressor's own score method is used, not MSE)

cv_lr = cross_val_score(
    regressor,
    new_df.drop(columns='bip:topics:encoded'),
    new_df['bip:topics:encoded'],
    cv=5,
)
print("Cross validation score: ", cv_lr.mean())

Cross validation score:  0.6802368053742873


#### Linear Regression - Grid Search

In [27]:
# getTrainedModel() only hands back the test split, so X_train / y_train were
# never defined at notebook scope. Recreate the split here; it is deterministic
# (fixed random_state), so X_test / y_test keep the same values.
X_train, X_test, y_train, y_test = getParameters(new_df)

# NOTE(review): `normalize` was removed from LinearRegression in scikit-learn 1.2;
# the grid is kept as recorded, so drop that key when running on a newer release.
parameter_space = {
    'normalize':[True,False],
    'copy_X':[True,False],
    'fit_intercept':[True,False]
}
lr = LinearRegression()
gs_lr = GridSearchCV(lr, parameter_space, n_jobs=-1, cv=3)
gs_lr.fit(X_train, y_train)
print('Best parameters found:\n', gs_lr.best_params_)

Best parameters found:
 {'copy_X': True, 'fit_intercept': True, 'normalize': True}


In [28]:
# Mean cross-validated score of the best linear-regression configuration
gs_lr.best_score_

0.6829329411677455

# Best Classifier - SVM:

Among all the five models - Linear Regression, SVM, Decision Trees, Random Forest and Neural Networks - SVM with a linear kernel is found to be the best classifier.

* <b> Before Grid Search </b> :
    Initially, the train-test-split method resulted in an F1 score of 0.742, whereas with the cross validation method the cross validation score was found to be 0.728. I tried to improve the results using the Hyper Parameter Tuning technique. 

* <b> After Grid Search: </b>
    I used the Grid Search technique to find the optimal values for the hyper parameters that give the most accurate results. With this approach, I tried various values of C, which is the penalty parameter of the error term. The best_params_ attribute of the grid search reports the best value found for the parameter C, and its "best_score_" attribute provides the mean cross-validated score. Here, I got a cross-validated score of 0.749, which shows a minor improvement of about 2 percentage points over the cross-validation score of the SVM classifier before Grid Search (0.728). 

Moreover, since SVM with a linear kernel performs well in most text classification problems, especially when the data distribution is imbalanced, and considering its performance on our data, I would choose SVM as the best classifier irrespective of the model training time. 

However, if training times are to be considered and a larger number of features is to be included in the future to train the model on, the Random Forest classifier will also be a good choice since its training times are comparatively faster than those of SVM.

# References

[1] Python, A. (2019). A Comprehensive Guide to Understand and Implement Text Classification in Python. [online] Analytics Vidhya. Available at: https://www.analyticsvidhya.com/blog/2018/04/a-comprehensive-guide-to-understand-and-implement-text-classification-in-python/ [Accessed 24 Oct. 2019].

[2] Hintermeier, A. and Buffalo, N. (2019). Feature Extraction from Text. [online] Home. Available at: https://andhint.github.io/machine-learning/nlp/Feature-Extraction-From-Text/ [Accessed 24 Oct. 2019].

[3] K. N. K. Nguyen, “Pandas How to filter a Series,” Stack Overflow, 01-May-1965. [Online]. Available: https://stackoverflow.com/questions/28272137/pandas-how-to-filter-a-series. [Accessed: 24-Oct-2019].

[4] A. Verma, “Feature Extraction from Text (text data preprocessing),” Medium, 06-Jun-2019. [Online]. Available: https://medium.com/100-days-of-ml-and-code/feature-extraction-from-text-text-data-preprocessing-594b11af19f5. [Accessed: 24-Oct-2019].

[5] “5.2. Feature extraction¶,” scikit. [Online]. Available: https://scikit-learn.org/stable/modules/feature_extraction.html. [Accessed: 24-Oct-2019].

[6] user1599325 and Merlin, “IOPub data rate exceeded in Jupyter notebook (Version 5.4.0),” Stack Overflow, 01-Nov-1968. [Online]. Available: https://stackoverflow.com/questions/51697516/iopub-data-rate-exceeded-in-jupyter-notebook-version-5-4-0. [Accessed: 24-Oct-2019].

[7] freeCodeCamp.org, “How to extract keywords from text with TF-IDF and Python's Scikit-Learn,” freeCodeCamp.org, 28-Feb-2019. [Online]. Available: https://www.freecodecamp.org/news/how-to-extract-keywords-from-text-with-tf-idf-and-pythons-scikit-learn-b2a0f3d7e667/. [Accessed: 24-Oct-2019].

[8] “Text Analytics for Beginners using NLTK,” DataCamp Community. [Online]. Available: https://www.datacamp.com/community/tutorials/text-analytics-beginners-nltk. [Accessed: 24-Oct-2019].

[9] “Stemming and Lemmatization in Python,” DataCamp Community. [Online]. Available: https://www.datacamp.com/community/tutorials/stemming-lemmatization-python. [Accessed: 24-Oct-2019].