## Imports

In [11]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
import nltk
import nltk as nk
from nltk import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk import FreqDist
from nltk.corpus import stopwords
import re
from sentistrength import PySentiStr 
import time
from dateutil import parser    


# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


# machine learning
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier

# Accessing Files and Folders
# import os

### Dataset Loading

In [12]:
data1 = pd.read_json('dataset_presentation.json', lines=True)
data2 = pd.read_json('dataset_time.json',lines=True)[:50000] 

## Preprocessing

### Time Module

In [13]:
data2_list = data2.values.tolist()

answerCreationDate = []
questionCreationDate = []
resultantTime = []
for time in data2_list:
    answer_moment = parser.parse(time[0])
    question_moment = parser.parse(time[3])
    answerCreationDate.append(answer_moment.hour*answer_moment.minute*answer_moment.second)
    questionCreationDate.append(question_moment.hour*question_moment.minute*question_moment.second)
    calculatedTimeInSeconds = parser.parse(time[3]) - parser.parse(time[0])
    calculatedTimeInMinutes = calculatedTimeInSeconds.seconds/(60*60)
    if (calculatedTimeInMinutes >= 21.20):
        resultantTime.append(1)
    else:
        resultantTime.append(0) 

### Presentation Quality Module

In [14]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
REPLACE_BY_SPACE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS = re.compile('[^0-9a-z #+_]')
REMOVING_NUMBERS = re.compile("(^|\W)\d+")
STOPWORDS = set(stopwords.words('english'))
REPLACE_Tag= re.compile('<(div|/div|br|p|/p|code|/code|)[^>]{0,}>')
URL_Tag = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

def getBody(text): 
    countUpper = 0
    wordCount = 0 
    word_text = REPLACE_Tag.sub('', text)
    filtered_word = URL_Tag.sub('',word_text)
    for word in filtered_word:
        if (word.isupper()):
            countUpper = countUpper + 1
        wordCount = wordCount + 1
    urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
    code_Snippets= re.findall('<(code|/code)>', text)
    test = lambda x : 1 if len(x) > 1 else 0

    return(len(filtered_word),countUpper/len(filtered_word),len(urls), test(code_Snippets))

### Training Set for Presentation Quality

In [15]:
body = data1["Body"].tolist()
x = [getBody(sents) for sents in body]
data_df = pd.DataFrame(x,columns=['Number_of_Charecters','UpperCase_Ratio','URL_Count','Code_Snippets'])
trainingSet = pd.DataFrame(data=list(zip(data_df['Number_of_Charecters'].tolist(),data_df['UpperCase_Ratio'].tolist(),data_df['URL_Count'].tolist())))

In [16]:
y_labels = data_df['Code_Snippets'].tolist()
X_train, X_test, y_train, y_test = train_test_split(trainingSet,y_labels,test_size=0.30,random_state=30)

### Results

In [17]:
classifier_Multinomial = MultinomialNB()
classifier_BernoulliNB = BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
classifier_LogisticRegression = LogisticRegression(solver='lbfgs', multi_class='multinomial', random_state=1)
classifier_RandomForestClassifier = RandomForestClassifier(n_estimators=100, max_depth=2,random_state=0)
classifier_DecisionTreeClassifier = DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=0)
classifier_LinearSVC_Classifier = LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,intercept_scaling=1, loss='squared_hinge', max_iter=1000,multi_class='ovr', penalty='l2', random_state=0, tol=1e-05, verbose=0)

kf = KFold(30,True,1)
classifier_List = [classifier_Multinomial,classifier_BernoulliNB,
                   classifier_LogisticRegression,classifier_RandomForestClassifier,
                   classifier_DecisionTreeClassifier,classifier_LinearSVC_Classifier]
classifiers_Score=[]
scoring = ['precision_macro', 'recall_macro','accuracy','f1_macro']

### Running cross validation on single classifier

scores = cross_validate(classifier_LogisticRegression, trainingSet, y_labels, cv=kf,scoring=scoring)
print("\n Accuracy : ",np.average(scores['test_accuracy']))
print(" Precision : ",np.average(scores['test_precision_macro']))
print(" Recall : ",np.average(scores['test_recall_macro']))
print(" F1_macro : ",np.average(scores['test_f1_macro']))
print("\n")

### Running cross validation on Multiple classifiers

# for classifier in classifier_List:
#     scores = cross_validate(classifier, trainingSet, y_labels, cv=kf,scoring=scoring)
# #     classifiers_Score.append(scores)
#     print("-"*10,classifier,"-"*10)
#     print("\n Accuracy : ",np.average(scores['test_accuracy']))
#     print(" Precision : ",np.average(scores['test_precision_macro']))
#     print(" Recall : ",np.average(scores['test_recall_macro']))
#     print(" F1_macro : ",np.average(scores['test_f1_macro']))
#     print("\n")


 Accuracy :  0.8052192082591885
 Precision :  0.7452707601695575
 Recall :  0.548322671559243
 F1_macro :  0.5383968136257881




### Training Set for Time

In [18]:
time_df = pd.DataFrame(data=list(zip(answerCreationDate,questionCreationDate)),columns=["answerCreationDate","questionCreationDate"])
y_labels_time = resultantTime
X_train_time, X_test_time, y_train_time, y_test_time = train_test_split(time_df,y_labels_time,test_size=0.30,random_state=30)

### Results

In [20]:
kf_time = KFold(40,True,1)
scores = cross_validate(classifier_LogisticRegression, time_df, y_labels_time, cv=kf_time,scoring=scoring)
print("\n Accuracy : ",np.average(scores['test_accuracy']))
print(" Precision : ",np.average(scores['test_precision_macro']))
print(" Recall : ",np.average(scores['test_recall_macro']))
print(" F1_macro : ",np.average(scores['test_f1_macro']))
print("\n")

### Running cross validation on Multiple classifiers

# for classifier in classifier_List: 
#     scores = cross_validate(classifier, time_df, y_labels_time, cv=kf,scoring=scoring)
#     print("-"*10,classifier,"-"*10)
#     print("\n Accuracy : ",np.average(scores['test_accuracy']))
#     print(" Precision : ",np.average(scores['test_precision_macro']))
#     print(" Recall : ",np.average(scores['test_recall_macro']))
#     print(" F1_macro : ",np.average(scores['test_f1_macro']))
#     print("\n")


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



 Accuracy :  0.72816
 Precision :  0.36408
 Recall :  0.5
 F1_macro :  0.4213258030709686




  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


### Training Set for Reputation

In [21]:
trainingSetReputation = pd.DataFrame(data=list(zip(data_df['Number_of_Charecters'].tolist(),data_df['UpperCase_Ratio'].tolist(),data_df['URL_Count'].tolist(),answerCreationDate,questionCreationDate)))
y_labels_reputation = data2["Reputation"].tolist()
X_train_reputation, X_test_reputation, y_train_reputation, y_test_reputation = train_test_split(trainingSetReputation,y_labels_reputation,test_size=0.30,random_state=30)

### Results

In [22]:
scores = cross_validate(classifier_LogisticRegression, trainingSet, y_labels, cv=kf,scoring=scoring)
print("\n Accuracy : ",np.average(scores['test_accuracy']))
print(" Precision : ",np.average(scores['test_precision_macro']))
print(" Recall : ",np.average(scores['test_recall_macro']))
print(" F1_macro : ",np.average(scores['test_f1_macro']))
print("\n")
    
### Running cross validation on Multiple classifiers

# for classifier in classifier_List: 
#     scores = cross_validate(classifier, time_df, y_labels_time, cv=kf,scoring=scoring)
#     print("-"*10,classifier,"-"*10)
#     print("\n Accuracy : ",np.average(scores['test_accuracy']))
#     print(" Precision : ",np.average(scores['test_precision_macro']))
#     print(" Recall : ",np.average(scores['test_recall_macro']))
#     print(" F1_macro : ",np.average(scores['test_f1_macro']))
#     print("\n")


 Accuracy :  0.8052192082591885
 Precision :  0.7452707601695575
 Recall :  0.548322671559243
 F1_macro :  0.5383968136257881


