In [2]:
##Feature Extraction and model building

In [56]:
#importing required libraries
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import re
import math
!pip install TextBlob
from textblob import TextBlob
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D, Dropout
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split




In [57]:
# load the dataset
train_df = pd.read_csv('python_comments_posts.csv')#dataset for training "python tags"
test_df = pd.read_csv('C:\\Users\\om\\Downloads\\comments_1221.csv')#dataset for testing "java tags" from replicated paper

test_df = test_df.rename(columns={
    'questionID': 'QuestionId',
    'answerID': 'answer_Id',
    'commentID': 'commentId',
    'UserId': 'OwnerUserId',
    'CreationDate': 'CreationDate',
})


In [58]:
#Feature Extraction of Text

In [59]:
#defining features
def extract_features(df):
    
    users = dict(zip(train_df['OwnerUserId'], train_df['Score']))

    # compute the number of comments on each post
    post_comment_count = df.groupby('QuestionId')['commentId'].count().to_dict()

    # compute the score of each post
    Score = dict(zip(df['QuestionId'], train_df['Score']))

    # Extract the comment features
    df['comment_score'] = np.where(df['Score']>0, df['Score'], 0)
    df['comment_order'] = df.groupby('QuestionId')['CreationDate'].rank(ascending=True)

    # Extract the user features
    df['by_asker'] = df['OwnerUserId'] == df['QuestionId']
    df['by_answerer'] = df['OwnerUserId'] == df['answer_Id']
    df['by_not_seen_commenter'] = (df['OwnerUserId'] != df['QuestionId']) & \
                                   (df['OwnerUserId'] != df['answer_Id']) & \
                                   ~(df['OwnerUserId'].isin(df.loc[df['QuestionId']==df['QuestionId'].shift(), 'OwnerUserId'])) & \
                                   ~(df['OwnerUserId'].isin(df.loc[df['answer_Id']==df['answer_Id'].shift(), 'OwnerUserId']))
    df['by_seen_commenter'] = (df['OwnerUserId'] != df['QuestionId']) & \
                               (df['OwnerUserId'] != df['answer_Id']) & \
                               (df['OwnerUserId'].isin(df.loc[df['QuestionId']==df['QuestionId'].shift(), 'OwnerUserId'])) & \
                               ~(df['OwnerUserId'].isin(df.loc[df['answer_Id']==df['answer_Id'].shift(), 'OwnerUserId']))

    # Replace NaN values in OwnerUserId column with -1
    df['OwnerUserId'].fillna(-1, inplace=True)

    # Define a function to map user IDs to reputations
    def get_user_reputation(user_id):
        if user_id == -1:
            return 0 # Or any other default value you prefer
        else:
            return users[user_id]

    # Apply the function to the OwnerUserId column
    #df['user_reputation'] = df['OwnerUserId'].apply(get_user_reputation)
    
    
    # extract the time features

    df['CreationDate'] = pd.to_datetime(df['CreationDate'], infer_datetime_format=True)
    df['CreationDate'].isnull().sum()
    df['QuestionId'].duplicated().sum()

    df['prev_post_edit_time'] = (df['CreationDate'] - df.groupby('QuestionId')['CreationDate'].shift(1)).dt.total_seconds().div(60).apply(lambda x: math.log(x) if x > 0 else 0)
    df['next_post_edit_time'] = (df.groupby('QuestionId')['CreationDate'].shift(-1) - df['CreationDate']).dt.total_seconds().div(60).apply(lambda x: math.log(x) if x > 0 else 0)
    df['prev_comment_time'] = (df['CreationDate'] - df.groupby('QuestionId')['CreationDate'].shift(1)).dt.total_seconds().div(60).apply(lambda x: math.log(x) if x > 0 else 0)
    
    # compute polarity and subjectivity of comment text
    df['polarity'] = df['Text'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
    df['subjectivity'] = df['Text'].apply(lambda x: TextBlob(str(x)).sentiment.subjectivity)
    


    # text length
    df['text_len'] = df['Text'].apply(len)

    # starts with @
    df['starts_with_@'] = df['Text'].str.startswith('@').astype(int)

    # contains question mark
    df['contains_question_mark'] = df['Text'].str.contains('\?').astype(int)

    # contains exclamation mark
    df['contains_exclamation_mark'] = df['Text'].str.contains('\!').astype(int)

    # contains but
    df['contains_but'] = df['Text'].str.contains(' but ').astype(int)

    # contains exception
    df['contains_exception'] = df['Text'].str.contains(' exception').astype(int)

    # contains URL
    url_pattern = r'http\S+|www\.\S+'
    df['contains_url'] = df['Text'].str.contains(url_pattern).astype(int)

    # contains emotions
    emoticon_pattern = r'(?::|;|=)(?:-)?(?:\)|\(|D|P)'
    df['contains_emotions'] = df['Text'].str.contains(emoticon_pattern).astype(int)

    # talks to role
    df['talks_to_role'] = 0
    df.loc[df['Text'].str.startswith('@questioner'), 'talks_to_role'] = 1
    df.loc[df['Text'].str.startswith('@answerer'), 'talks_to_role'] = 2
    df.loc[df['Text'].str.startswith('@'), 'talks_to_role'] = 3
    
    return df[['comment_score', 'comment_order', 'by_asker', 'by_answerer', 
               'by_not_seen_commenter', 'by_seen_commenter', 'prev_post_edit_time', 'next_post_edit_time', 'prev_comment_time','polarity', 'subjectivity','text_len', 'starts_with_@', 'contains_question_mark', 'contains_exclamation_mark', 'contains_but', 'contains_exception', 'contains_url','contains_emotions','talks_to_role']]


#Function call
train_df1 = extract_features(train_df)
test_df2 = extract_features(test_df)

In [60]:
#Data preprocessing for text feature extraction :

In [62]:
# Preprocess train and test text columns
vectorizer = CountVectorizer(stop_words='english', lowercase=True)
X_train = vectorizer.fit_transform(train_df['Text'])
X_test = vectorizer.transform(test_df['Text'])

# Convert categorical variable to binary variable
y_train = train_df['need_update'].apply(lambda x: 1 if x == 'update' else 0)
y_test = test_df['need_update'].apply(lambda x: 1 if x == 'update' else 0)

# Extract features from train and test datasets
train_features = train_df1[['comment_score', 'comment_order', 'by_asker', 'by_answerer','by_not_seen_commenter', 'by_seen_commenter', 'prev_post_edit_time', 'next_post_edit_time', 'prev_comment_time','polarity', 'subjectivity','text_len', 'starts_with_@', 'contains_question_mark', 'contains_exclamation_mark', 'contains_but', 'contains_exception', 'contains_url','contains_emotions','talks_to_role']]

test_features = test_df2[['comment_score', 'comment_order', 'by_asker', 'by_answerer', 'by_not_seen_commenter', 'by_seen_commenter', 'prev_post_edit_time', 'next_post_edit_time', 'prev_comment_time','polarity', 'subjectivity','text_len', 'starts_with_@', 'contains_question_mark', 'contains_exclamation_mark', 'contains_but', 'contains_exception', 'contains_url','contains_emotions','talks_to_role']]


In [63]:
X_train = pd.concat([pd.DataFrame(X_train.toarray()), train_features.reset_index(drop=True)], axis=1)
X_test = pd.concat([pd.DataFrame(X_test.toarray()), test_features.reset_index(drop=True)], axis=1)
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)


In [64]:
#Model buiding for feature extraction of text

In [65]:
#Logistic Regression (Features)
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Testing  model
y_pred = lr.predict(X_test)

# Evaluate  model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Logistic Regression Accuracy (only features): {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 score: {f1:.4f}')

Logistic Regression Accuracy (only features): 0.5799, Precision: 0.7252, Recall: 0.3011, F1 score: 0.4255


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [66]:
#Random Forest (Features)
rm = RandomForestClassifier()
rm.fit(X_train, y_train)

# Testing  model
y_pred = rm.predict(X_test)

# Evaluate logistic regression model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Random Forest Accuracy(only features): {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 score: {f1:.4f}')

Random Forest Accuracy(only features): 0.5807, Precision: 0.7333, Recall: 0.2964, F1 score: 0.4221


In [67]:
#Guassian Bayes (Features)
nb = GaussianNB()
nb.fit(X_train, y_train)

# Testing  model
y_pred = nb.predict(X_test)

# Evaluate  model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f' Guassian Bayes Accuracy(only features): {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 score: {f1:.4f}')

 Guassian Bayes Accuracy(only features): 0.4889, Precision: 0.5063, Recall: 0.4485, F1 score: 0.4756


In [68]:
#data preprocessing for TF-IDF Extraction

In [69]:
# Preprocess train and test text columns
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_df['Text'])
X_test = vectorizer.transform(test_df['Text'])

# Convert categorical variable to binary variable
y_train = train_df['need_update'].apply(lambda x: 1 if x == 'update' else 0)
y_test = test_df['need_update'].apply(lambda x: 1 if x == 'update' else 0)



In [70]:
#Model buiding for tf-idf

In [71]:
#Logistic Regression (TF-IDF)
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Testing  model
y_pred = lr.predict(X_test)

# Evaluate  model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Logistic Regression Accuracy (TF-IDF): {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 score: {f1:.4f}')

Logistic Regression Accuracy (TF-IDF): 0.4898, Precision: 0.5250, Recall: 0.1331, F1 score: 0.2124


In [72]:
#TF-IDF Extraction (TF-IDF)
rm = RandomForestClassifier()
rm.fit(X_train, y_train)

# Testing  model
y_pred = rm.predict(X_test)

# Evaluate logistic regression model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Random Forest Accuracy(TF-IDF): {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 score: {f1:.4f}')

Random Forest Accuracy(TF-IDF): 0.5045, Precision: 0.5586, Recall: 0.1965, F1 score: 0.2907


In [74]:
#Guassian Bayes (TF-IDF)

X_train = X_train.toarray()
X_test= X_test.toarray()

nb = GaussianNB()
nb.fit(X_train, y_train)

# Testing  model
y_pred = nb.predict(X_test)

# Evaluate  model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy Guassian Bayes Accuracy(TF-IDF): {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 score: {f1:.4f}')

Accuracy Guassian Bayes Accuracy(TF-IDF): 0.4930, Precision: 0.5113, Recall: 0.4295, F1 score: 0.4668


In [75]:
#data preprocessing for features+tf-idf extraction

In [77]:
# Preprocess train and test text columns
vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
X_train = vectorizer.fit_transform(train_df['Text'])
X_test = vectorizer.transform(test_df['Text'])

# Convert categorical variable to binary variable
y_train = train_df['need_update'].apply(lambda x: 1 if x == 'update' else 0)
y_test = test_df['need_update'].apply(lambda x: 1 if x == 'update' else 0)

# Extract features from train and test datasets
train_features = train_df1[['comment_score', 'comment_order', 'by_asker', 'by_answerer','by_not_seen_commenter', 'by_seen_commenter', 'prev_post_edit_time', 'next_post_edit_time', 'prev_comment_time','polarity', 'subjectivity','text_len', 'starts_with_@', 'contains_question_mark', 'contains_exclamation_mark', 'contains_but', 'contains_exception', 'contains_url','contains_emotions','talks_to_role']]

test_features = test_df2[['comment_score', 'comment_order', 'by_asker', 'by_answerer', 'by_not_seen_commenter', 'by_seen_commenter', 'prev_post_edit_time', 'next_post_edit_time', 'prev_comment_time','polarity', 'subjectivity','text_len', 'starts_with_@', 'contains_question_mark', 'contains_exclamation_mark', 'contains_but', 'contains_exception', 'contains_url','contains_emotions','talks_to_role']]

X_train = pd.concat([pd.DataFrame(X_train.toarray()), train_features.reset_index(drop=True)], axis=1)
X_test = pd.concat([pd.DataFrame(X_test.toarray()), test_features.reset_index(drop=True)], axis=1)
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

In [78]:
#Model buiding for features + tf-idf

In [79]:
#Logistic Regression (Features+tf-idf)
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Testing  model
y_pred = lr.predict(X_test)

# Evaluate  model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Logistic Regression Accuracy (tf-idf + features): {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 score: {f1:.4f}')

Logistic Regression Accuracy (tf-idf + features): 0.5889, Precision: 0.7345, Recall: 0.3201, F1 score: 0.4459


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [80]:
#Random Forest (Features+tf-idf)
rm = RandomForestClassifier()
rm.fit(X_train, y_train)

# Testing  model
y_pred = rm.predict(X_test)

# Evaluate  model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Random Forest Accuracy (tf-idf + features): {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 score: {f1:.4f}')

Random Forest Accuracy (tf-idf + features): 0.5864, Precision: 0.7351, Recall: 0.3122, F1 score: 0.4383


In [81]:
#Guassian NB (Features+tf-idf)
nb = GaussianNB()
nb.fit(X_train, y_train)

# Testing  model
y_pred = nb.predict(X_test)

# Evaluate  model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Guassian NB Accuracy (tf-idf + features): {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 score: {f1:.4f}')

Guassian NB Accuracy (tf-idf + features): 0.5004, Precision: 0.5214, Recall: 0.4057, F1 score: 0.4563


In [82]:
#Deep learning CNN Model

In [83]:
# Tokenize the text data
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_df['Text'])
X_train = tokenizer.texts_to_sequences(train_df['Text'])
X_test = tokenizer.texts_to_sequences(test_df['Text'])

# Pad the sequences to a fixed length
maxlen = 100
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)


In [84]:
#Model building
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=32, input_length=maxlen))
model.add(Conv1D(32, kernel_size=3, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [85]:
#Evaluation of model
model.fit(X_train, y_train, epochs=19, batch_size=64)
_, accuracy = model.evaluate(X_test, y_test)
print('Accuracy: %.2f' % (accuracy*100))

Epoch 1/19
Epoch 2/19
Epoch 3/19
Epoch 4/19
Epoch 5/19
Epoch 6/19
Epoch 7/19
Epoch 8/19
Epoch 9/19
Epoch 10/19
Epoch 11/19
Epoch 12/19
Epoch 13/19
Epoch 14/19
Epoch 15/19
Epoch 16/19
Epoch 17/19
Epoch 18/19
Epoch 19/19
Accuracy: 50.61
