## Author : Jasim Ahmed

## Importing Necessary Libraries

In [None]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
import nltk
import nltk as nk
from nltk import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk import FreqDist
from nltk.corpus import stopwords
import re
from mlxtend.evaluate import confusion_matrix
import mlxtend

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn import metrics 

# Accessing Files and Folders
# import os

### Installing NLTK and mlxtend

In [None]:
# ! pip install -U nltk
# !pip install mlxtend

# nltk.download('punkt')
# nltk.download("popular")

### Reading JSON and removing null instances in rows

### Downloading the Dataset

In [None]:
# ! git clone https://lfs.aminer.cn/misc/dblp.v11.zip

### Reading the Dataset and dropping rows on the basis of NaN value occurrence

In [None]:
# Load the line-delimited JSON dump and keep only the rows that have no missing value in any column.
data = pd.read_json("dblp.v11.json", lines=True).dropna(axis=0, how="any")

### Basic Statistics

In [None]:
# Summary statistics for every column, not only the numeric ones (include="all").
data.describe(include="all")

### Verify whether all the venues are consistent with two values (id and raw)

In [None]:
# Positional offsets of every row whose venue dictionary has no "raw" key.
rows_missing_raw = [
    position
    for position, entry in enumerate(data["venue"])
    if "raw" not in entry
]
for position in rows_missing_raw:
    print(position)

# Kept under its old name: offset of the last offending row (0 when there is none).
venRaw = rows_missing_raw[-1] if rows_missing_raw else 0

# Drop all offending rows, and nothing at all when every venue is consistent
# (previously row 0 was dropped unconditionally in that case).
if rows_missing_raw:
    data.drop(index=data.index[rows_missing_raw], inplace=True)

In [None]:
# get tokenized form of the abstract(tokenization at word level in a sentence) and the original abstract.
def Tokenized_and_OriginalAbstract(data):
    """
        Input data: iterable of entries, each indexable by "InvertedIndex" (a mapping)
        return: tuple (list of key lists, list of strings), where each string is the
                keys of one entry joined by a single space

        NOTE(review): the joined string holds every key once, in the mapping's
        iteration order; it is not rebuilt from word positions - confirm this is
        what "original abstract" is meant to be.
    """
    tokenized_abstract = []
    original_abstract = []
    for record in data:
        tokens = list(record["InvertedIndex"].keys())  # keys of the inverted index
        tokenized_abstract.append(tokens)
        original_abstract.append(" ".join(tokens))
    return (tokenized_abstract, original_abstract)

# get venues from the dataset
def Venues(data):
    """
        Input data: iterable of dictionaries that each hold a "raw" entry
        return: list with the "raw" value of every dictionary, in input order
    """
    raw_names = []
    for venue_info in data:
        raw_names.append(venue_info["raw"])
    return raw_names

# Authors and Field of Study(Keywords)
def Author_and_FOS_Values(data):
    """
        Input data: iterable of entries, each an iterable of dictionaries with a "name" key
        return: list of strings; each string is the "name" values of one entry joined by ", "
    """
    return [
        ", ".join(item["name"] for item in entry)
        for entry in data
    ]

### Initializations and value setting
title = data["title"]  # title of the research paper
year = data["year"]  # year in which the paper was published
venue = Venues(data["venue"])  # "raw" venue value of every paper
authors = Author_and_FOS_Values(data["authors"])  # comma-joined author names of every paper
keywords = Author_and_FOS_Values(data["fos"])  # comma-joined field-of-study names of every paper
abstract = Tokenized_and_OriginalAbstract(data["indexed_abstract"])  # (token lists, joined abstract strings)

# Rebuild `data` as a flat table with one row per paper; only the joined abstract string (abstract[1]) is kept.
data = pd.DataFrame(
    data=list(zip(authors, keywords, abstract[1], title, year, venue)),
    columns=["authors", "fos", "abstract", "title", "year", "venue"],
)

#### In order to save the data for later use because of its volume and variety

In [None]:
# Write the flattened table to disk without the index column.
data.to_csv("dblp_paper_total.csv",index=False)

### Loading the saved data as CSV, merely a checkpoint

In [None]:
#### pd.read_csv("dblp_paper_total.csv",error_bad_lines=False) # if bad lines occurs
# data = pd.read_csv("dblp_paper_total.csv")

In [None]:
# Display the current contents of `data`.
data

# Pre-Processing

### Using the venues as a center attribute to group and extract papers that are in a particular venue

In [None]:
# Number of papers per venue, largest first; keep the 200 biggest venues for plotting.
papers_per_venue = data.groupby(by="venue").size()
topVenues = papers_per_venue.sort_values(ascending=False).iloc[:200]
# Names of the 60 biggest venues; these are the venues extracted further down.
venues_To_Extract = topVenues.index[:60].tolist()

#### According to the plot, the elbow of the venue curve is somewhere between 35 and 55. This value will help later on in classifying the papers in a particular venue.

In [None]:
# Line plot of the paper counts stored in topVenues (x axis = position in topVenues).
plt.plot(topVenues.values)
plt.ylabel('Number of Published Papers')
plt.xlabel('Top Venues')
plt.show()

#### Saving the optimal venue 

In [None]:
# venueList = pd.read_csv("venueList_To_Extract.csv")
# venues_To_Extract = ["".join(entry) for entry in venueList.values.tolist()]

In [None]:
# One frame per selected venue, kept in the order of venues_To_Extract.
# Rows are picked with a boolean mask, so the result does not depend on the
# index labels of `data` happening to equal row positions.
frames_per_venue = [
    data[data["venue"] == venue_name]
    for venue_name in venues_To_Extract
]

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to stack frames.
# pd.concat raises on an empty list, so fall back to an empty frame in that case.
if frames_per_venue:
    all_data = pd.concat(frames_per_venue, ignore_index=True, sort=False)
else:
    all_data = pd.DataFrame()

# Independent copy, so later changes to cummulative_data leave all_data untouched.
cummulative_data = all_data.copy()

#### randomly shuffle the data and reset index

In [None]:
# Shuffle all rows (sample with frac=1) and renumber the index from 0.
# NOTE(review): no random_state is passed, so the row order differs between runs - confirm this is intended.
cummulative_data = cummulative_data.sample(frac=1).reset_index(drop=True)
cummulative_data

#### Extracting and saving all the research papers whose venue is in the list venues_To_Extract, so that our data is not skewed and contains no outliers (records of research papers) from any venue outside that list.

In [None]:
# save_cummulative_data = pd.DataFrame(data=list(zip(cummulative_data["authors"],cummulative_data["fos"],cummulative_data["abstract"],cummulative_data["title"],cummulative_data["year"],cummulative_data["venue"])),columns=["authors","fos","abstract","title","year","venue"])
# save_cummulative_data.to_csv("cummulative_data_150k.csv",index=False)

In [None]:
# check whether venues exist in that number or not
# all_data.iloc[all_data[all_data["venue"] == "international conference on parallel processing"].index.tolist()]

# # check whether the collection of the data from the dataset is right
# cummulative_data.groupby(by="venue").size().sort_values(ascending=False)

### Basic Statistics on the subset of the dataset that is normalized

In [None]:
# Check the original data as well as the cumulative data for inconsistencies and NaN values.
# A boolean mask selects the rows directly, instead of feeding index labels to iloc as positions.
data[data["venue"] == "international conference on parallel processing"].reset_index(drop=True)
# data.iloc[data.loc[pd.isnull(data).any(1), :].index.values]

In [None]:
# Rows of cummulative_data that contain at least one missing value.
# `axis` is passed by keyword because pandas 2.x no longer accepts it positionally.
cummulative_data[cummulative_data.isnull().any(axis=1)]

#### Basic statistics on the categorical data to find the unique and most occurred values

In [None]:
# Summary statistics restricted to the object-dtype columns (include="object").
cummulative_data.describe(include="object")

### Distribution of data over the year.

In [None]:
plt.figure(figsize=(10,6))
# Line plot of the "year" column in row order; the trailing ";" hides the returned object in the notebook output.
cummulative_data['year'].plot(linewidth=1.5);

In [None]:
plt.hist(cummulative_data["year"]) # distribution of data over the year using histogram

In [None]:
# Box plot of cummulative_data, used here to see in which window of years the outliers fall.
cummulative_data.boxplot() # check whether the outliers occur in which particular window(years) using Boxplot

#### Number of research papers in a particular venue

In [None]:
# Paper count per venue over the full `data` table, largest first, limited to the first 100 entries.
venuesList = data.groupby(by="venue").size().sort_values(ascending=False)[:100]
venuesList

#### Which year had which papers and at which venues

In [None]:
# Paper count per (year, venue) pair; [::-1] reverses the group order and [:20] keeps the first 20 of the reversed result.
data.groupby(by=["year","venue"]).size()[::-1][:20]

#### How many papers were published in a particular venue in a given year

In [None]:
# Paper count per (year, venue) pair, largest count first.
data.groupby(by=["year","venue"]).size().sort_values(ascending=False)

In [None]:
def NumberOfPapersInVenue_OR_FOS(fromTime, toTime, column="venue", top=15):
    """
        Description: Count the research papers of `cummulative_data` published strictly
        between fromTime and toTime (both bounds excluded), grouped by the values of
        one column - the venue by default, or e.g. "fos" for the field of study.
        Input fromTime and toTime: Number Int (exclusive lower / upper year)
        Input column: name of the column of `cummulative_data` to count (default "venue")
        Input top: how many of the most frequent values to return (default 15)
        return: pandas Series mapping each value to its count, most frequent first
    """
    years = cummulative_data["year"]
    # Vectorised comparison instead of a Python-level lambda applied per row.
    in_window = (years > fromTime) & (years < toTime)
    return cummulative_data.loc[in_window, column].value_counts().head(top)

In [None]:
# Most frequent venues among papers with 1903 < year < 2008.
NumberOfPapersInVenue_OR_FOS(1903,2008)

In [None]:
plt.figure(figsize=(12,9))
# Horizontal bar chart of the most frequent venues among papers with 1900 < year < 2000.
NumberOfPapersInVenue_OR_FOS(1900,2000).plot(kind="barh",title="Number Of Papers Published In a particular Venue") #plot

####  Authors with the most number of published papers

In [None]:
plt.figure(figsize=(14,6))
# Ten most frequent values of the "authors" column as a horizontal bar chart.
# NOTE(review): "authors" holds the comma-joined author list produced by Author_and_FOS_Values,
# so the counts are per exact author list, not per individual author - confirm this is intended.
data["authors"].value_counts()[:10].plot(kind="barh",title="Authors having the most number of published papers") #plot
## another way to do this
# data.groupby(by="authors").size().sort_values(ascending=False)

#### Time and the venue of particular authors who were the most active

In [None]:
# When and at which venue the papers of the selected author were published.
# Series.isin replaces np.in1d, which NumPy has deprecated; .to_numpy() keeps `mask` a plain boolean array.
mask = cummulative_data["authors"].isin(["John K. Debenham"]).to_numpy()
entries = cummulative_data.index[mask]
# `entries` holds index labels, so select with .loc rather than the positional .iloc.
cummulative_data.loc[entries]
## or in one line
# df = data[data["authors"] == "John K. Debenham"]

#### Most research papers published in a particular field of study (fos)

In [None]:
# Eleven most frequent values of the "fos" column in the full `data` table.
data["fos"].value_counts()[:11]

In [None]:
plt.figure(figsize=(12,7))
# Horizontal bar chart of the eleven most frequent "fos" values in cummulative_data.
cummulative_data["fos"].value_counts()[:11].plot(kind="barh",title="Most papers published in a particular field of study")

#### Venues of the papers published in the field of study "Multimedia, Human–computer interaction, Computer science"

In [None]:
# Boolean mask of the papers whose "fos" string is exactly the value below.
# NOTE(review): the names `philosophy` / `philo` do not match the field-of-study string being filtered - consider renaming.
philosophy = (data["fos"] == "Multimedia, Human–computer interaction, Computer science")
philo = data[philosophy]
# Five most frequent venues among the selected papers.
philo["venue"].value_counts()[:5]

## Data preparation - Feature Extraction

In [None]:
lemmatizer = WordNetLemmatizer() # For word lemmatization (used by Lemmatize_Sentence)
stemmer = PorterStemmer() # For word stemming (used by Stem_Sentence)
# Raw strings keep the regex escapes (\[, \], \|, \W, \d) from being read as invalid
# string escape sequences; the compiled patterns are unchanged.
REPLACE_BY_SPACE = re.compile(r'[/(){}\[\]\|@,;]')  # punctuation that Clean_Text turns into a space
BAD_SYMBOLS = re.compile(r'[^0-9a-z #+_]')  # everything except digits, lowercase letters, space, '#', '+' and '_'
REMOVING_NUMBERS = re.compile(r"(^|\W)\d+")  # a run of digits at the start of the text or right after a non-word character
# English stop words from NLTK, held in a set; Clean_Text tests every word against it.
STOPWORDS = set(stopwords.words('english'))

# First letter of a POS tag -> WordNet part-of-speech letter
# (adjective, verb, noun, adverb). The letters are written out instead of being read
# from `wordnet.wordnet.*`, which only resolves when nltk.stem.wordnet happens to
# import the corpus under the name `wordnet`.
_TAG_PREFIX_TO_WORDNET_POS = {"J": "a", "V": "v", "N": "n", "R": "r"}


def Nltk2Word_And_Tag(nltk_tag):
    """
        Input text: a string holding a POS tag (e.g. 'JJ', 'VBD', 'NNS', 'RB')
        return: string tag such as 'a','v','n','r' chosen by the first letter of the
                input, or None when the tag is empty or starts with any other letter
    """
    if not nltk_tag:
        return None
    return _TAG_PREFIX_TO_WORDNET_POS.get(nltk_tag[:1])
    
def Lemmatize_Sentence(sentence):
    """
        Input text: a string
        return: string in which every token that Nltk2Word_And_Tag can map to a
                WordNet tag is lemmatized; all other tokens are kept unchanged
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
    lemmas = []
    for token, pos_tag in tagged_tokens:
        wordnet_pos = Nltk2Word_And_Tag(pos_tag)
        if wordnet_pos is not None:
            token = lemmatizer.lemmatize(token, wordnet_pos)
        lemmas.append(token)
    return " ".join(lemmas)

def Stem_Sentence(sentence):
    """
        Input text: a string
        Description: Can be applied for varying the analysis of Classifier
        return: stemmed string (every token stemmed, tokens joined by a single space)
    """
    tokenize_words = nltk.word_tokenize(sentence)
    # The loop previously read the undefined name `tokenize_word`, so every call raised NameError.
    stem_sents = [stemmer.stem(word) for word in tokenize_words]
    return " ".join(stem_sents)

def Length_Words_Disapproved(sentence,length):
    """
        Input text: a string and an int length limit
        return: string made of the tokens that are strictly longer than `length`
                characters, joined by a single space
    """
    long_enough = [
        token
        for token in nltk.word_tokenize(sentence)
        if len(token) > length
    ]
    return " ".join(long_enough)

def Clean_Text(text,flag):
    """
        Input text: a string and a flag; a truthy flag additionally stems the text
        return: lower-cased text with numbers, symbols and stop words removed,
                restricted to words longer than 3 characters and lemmatized
    """
    cleaned = text.lower()
    # Applied in this order: drop digit runs, blank out disallowed symbols, blank out punctuation.
    for pattern in (REMOVING_NUMBERS, BAD_SYMBOLS, REPLACE_BY_SPACE):
        cleaned = pattern.sub(" ", cleaned)
    kept_words = [word for word in cleaned.split() if word not in STOPWORDS]
    cleaned = ' '.join(kept_words)
    if flag:
        cleaned = Stem_Sentence(cleaned)

    without_short_words = Length_Words_Disapproved(cleaned, 3)
    return Lemmatize_Sentence(without_short_words)

In [None]:
def _clean_without_stemming(text):
    """Run Clean_Text with the stemming flag switched off (flag=0)."""
    return Clean_Text(text, 0)


# Cleaned and lemmatized versions of the three text columns.
lemmatized_abstract = cummulative_data["abstract"].apply(_clean_without_stemming)
lemmatized_title = cummulative_data["title"].apply(_clean_without_stemming)
lemmatized_fos = cummulative_data["fos"].apply(_clean_without_stemming)

# Analysis

### Building Vocabulary

In [None]:
# Tokenize one long string made of "<title> <abstract>" for every paper, all joined by single spaces.
vocabulary = nltk.word_tokenize(
    " ".join(
        " ".join((title_text, abstract_text))
        for title_text, abstract_text in zip(lemmatized_title, lemmatized_abstract)
    )
)

###  Finding most common terms in context

In [None]:
# Frequency distribution over all vocabulary tokens.
fdist = FreqDist(vocabulary)
# Cumulative frequency plot of the 5000 most frequent tokens.
fdist.plot(5000, cumulative=True)

In [None]:
fdist = FreqDist(vocabulary)
words = fdist.most_common(5000)  # (token, count) pairs, most frequent first
# Keep only the tokens that are longer than three characters.
vocabulary_terms = [term for term, _count in words if len(term) > 3]

### Building Documents

In [None]:
# One document per paper: its cleaned title, abstract and field of study separated by single spaces.
document = [
    " ".join(parts)
    for parts in zip(lemmatized_title, lemmatized_abstract, lemmatized_fos)
]

### TF-IDF Vectorizer using Vocabulary and Document

In [None]:
# Restrict the features to the selected vocabulary terms, but learn the IDF weights
# from the documents themselves. Fitting on the term list (one term per "document")
# gave every term a document frequency of 1, i.e. a constant IDF.
vectorizer = TfidfVectorizer(vocabulary=vocabulary_terms)
vectorizer = vectorizer.fit(document)
mat = vectorizer.transform(document)

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when it exists and fall back to the old method otherwise.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# toarray() yields a regular ndarray (todense() returns the legacy np.matrix type).
TFIDF = pd.DataFrame(mat.toarray(), columns=feature_names)
TFIDF

### Feature Extraction using Train Test Split

In [None]:
# Class labels: the venue of every paper, in the row order of cummulative_data.
y_labels = cummulative_data["venue"].tolist()
# Hold out 10% of the rows as the test set; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(TFIDF, y_labels, test_size=0.10, random_state=20)

## Classifier Training and Evaluation

### Linear SVC

In [None]:
# Linear SVM with every hyper-parameter spelled out, one per line.
LinearSVC_clf = LinearSVC(
    penalty='l2',
    loss='squared_hinge',
    dual=True,
    tol=1e-05,
    C=1.0,
    multi_class='ovr',
    fit_intercept=True,
    intercept_scaling=1,
    class_weight=None,
    verbose=0,
    random_state=0,
    max_iter=1000,
)
LinearSVC_clf.fit(X_train, y_train)

In [None]:
# print(LinearSVC_clf.coef_)
# print(LinearSVC_clf.intercept_)
# Predict the venue of every held-out paper and build the confusion matrix from the predictions.
y_predict_LinearSVC = LinearSVC_clf.predict(X_test)
cm = metrics.confusion_matrix(y_test,y_predict_LinearSVC)
print("classification score:", LinearSVC_clf.score(X_test,y_test))

# Alternative metrics, currently disabled:
# print("accuracy:", metrics.accuracy_score(y_test,y_predict_LinearSVC))
# print("precision:", metrics.precision_score(y_test,y_predict_LinearSVC, average="micro"))
# print("recall:", metrics.recall_score(y_test,y_predict_LinearSVC, average="micro"))


#### Confusion Matrix

In [None]:
# Draw the confusion matrix `cm` of the LinearSVC predictions on one figure.
# (The previous cell contained an invalid line, used an undefined `fig`, drew the
# matrix twice and opened a stray empty figure at the end.)
fig, ax = plt.subplots(figsize=(10, 8))
cax = ax.matshow(cm, cmap=plt.cm.afmhot)
fig.colorbar(cax)

classNames = sorted(set(y_labels))
# The classifier plotted here is LinearSVC, not an RBF-kernel SVM.
ax.set_title('Linear SVC Confusion Matrix - Test Data')
ax.set_ylabel('True label')
ax.set_xlabel('Predicted label')
# One tick per row/column of the matrix that is actually drawn.
tick_marks = np.arange(cm.shape[0])
ax.set_xticks(tick_marks)
ax.set_yticks(tick_marks)
ax.tick_params(axis='x', labelrotation=45)
plt.show()


### Using Cross Validation KFold and RepeatedKFold

In [None]:
# Initialization of the classifiers that are compared with cross-validation
classifier_Multinomial = MultinomialNB()
classifier_LogisticRegression = LogisticRegression(solver='lbfgs', multi_class='multinomial', random_state=1)
classifier_RandomForestClassifier = RandomForestClassifier(n_estimators=100, max_depth=2,random_state=0)
classifier_LinearSVC_Classifier = LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=1e-05, verbose=0)

# Setting values for KFold and RepeatedKFold
# shuffle and random_state are keyword-only in current scikit-learn, so KFold(10,True,1) raises TypeError.
kf = KFold(n_splits=10, shuffle=True, random_state=1)
random_state = 300
# rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=None) 
rkf = RepeatedKFold(n_splits=10, n_repeats=2, random_state=random_state)

classifier_List = [classifier_Multinomial,
                   classifier_LogisticRegression,
                   classifier_RandomForestClassifier,
                   classifier_LinearSVC_Classifier]

classifiers_Score=[]
# Metric names passed to cross_validate; results come back under the keys 'test_<name>'.
scoring = ['precision_macro', 'recall_macro','accuracy','f1_macro']

###  Accuracy, Precision, Recall and F1_Score

In [None]:
# (printed label, key in the cross_validate result) for every reported metric, in print order.
metric_lines = [
    ("\n Accuracy : ", 'test_accuracy'),
    (" Precision : ", 'test_precision_macro'),
    (" Recall : ", 'test_recall_macro'),
    (" F1_macro : ", 'test_f1_macro'),
]

for classifier in classifier_List:
    # Cross-validation of one classifier on the full TF-IDF table with the KFold splitter `kf`.
    scores = cross_validate(classifier, TFIDF, y_labels, cv=kf, scoring=scoring)
    separator = "-"*10
    print(separator, classifier, separator)
    # Report the mean of every metric over the folds.
    for label, score_key in metric_lines:
        print(label, np.average(scores[score_key]))
    print("\n")