# Module Import

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score,auc,roc_curve
from sklearn.model_selection import KFold,train_test_split,StratifiedShuffleSplit,StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from gensim.sklearn_api import TfIdfTransformer
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
import ast
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import GridSearchCV

# Define methods

In [9]:
def remove_html(text):
    """Return *text* with every ``<...>`` span deleted.

    The match is non-greedy, so each tag is removed on its own. ``.`` does
    not match a newline here, so a ``<`` whose closing ``>`` is on a later
    line is left untouched.
    """
    return re.sub(r'<.*?>', r'', text)
def remove_emoji(text):
    """Return *text* with every run of characters in the ranges below removed.

    NOTE(review): the last range (U+24C2..U+1F251) is very wide; besides
    emoji it also covers e.g. CJK ideographs, so those are stripped too.
    """
    code_point_ranges = (
        (u"\U0001F600", u"\U0001F64F"),  # emoticons
        (u"\U0001F300", u"\U0001F5FF"),  # symbols & pictographs
        (u"\U0001F680", u"\U0001F6FF"),  # transport pictures
        (u"\U0001F1E0", u"\U0001F1FF"),  # flags
        (u"\U00002702", u"\U000027B0"),
        (u"\U000024C2", u"\U0001F251"),
    )
    # Assemble one character class "[a-bc-d...]+" from the table above.
    char_class = "".join(f"{low}-{high}" for low, high in code_point_ranges)
    matcher = re.compile("[" + char_class + "]+", flags=re.UNICODE)
    return matcher.sub(r'', text)
# Stop-word vocabulary: NLTK's English list plus project-specific extras.
NLTK_stop_words_list = stopwords.words('english')
custom_stop_words_list = ['...']
final_stop_words_list = NLTK_stop_words_list + custom_stop_words_list
# Frozen copy used for O(1) membership tests in remove_stopwords(); it is a
# snapshot, so re-run this cell after changing any of the lists above.
_final_stop_words_set = frozenset(final_stop_words_list)


def remove_stopwords(text):
    """Return *text* with stop words dropped and tokens re-joined by one space.

    *text* is converted with ``str()`` and split on whitespace. Matching is
    exact and case-sensitive: a token is removed only if it equals an entry
    of ``final_stop_words_list`` character for character.
    """
    return " ".join(
        word for word in str(text).split() if word not in _final_stop_words_set
    )
def clean_str(string):
    """Normalise *string* into lower-case, whitespace-collapsed ASCII tokens.

    Steps, in order:
      1. every character outside ``A-Za-z0-9(),.!?'`` and the backtick is
         replaced by a space (this also removes backslashes and double
         quotes, so no later step has to deal with them);
      2. ``'s`` and ``'ve`` are split off from the preceding word;
      3. ``)`` and ``?`` are surrounded by spaces;
      4. runs of whitespace collapse to a single space;
      5. remaining apostrophes are deleted;
      6. the result is stripped and lower-cased.

    The parameter name shadows the ``string`` module inside this function;
    it is kept unchanged so existing keyword callers continue to work.
    """
    string = re.sub(r"[^A-Za-z0-9(),.!?'`]", " ", string)
    string = re.sub(r"'s", " 's", string)
    string = re.sub(r"'ve", " 've", string)
    # Plain replacement text: the previous " \) " / " \? " literals were
    # invalid string escapes and inserted backslashes that then had to be
    # stripped again by a separate substitution.
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    string = string.replace("'", "")
    return string.strip().lower()

# Loading data

In [None]:
!git clone https://github.com/anonymoususr12/MHPurf

In [None]:
%cd MHPurf/data
project = 'pytorch' # select in [tensorflow, pytorch, keras, incubator-mxnet, caffe]
path = f'{project}.csv'
pd_all = pd.read_csv(path)
# Shuffle all rows once with a fixed seed.
pd_all = pd_all.sample(frac=1,random_state=999)

# 'Title+Body' is "<Title>. <Body>" when a body exists, otherwise the title
# alone. Built column-wise: the previous per-row chained assignment
# (pd_all[col].iloc[i] = ...) may write to a temporary copy and is slow.
has_body = pd_all['Body'].notna()
pd_all['Title+Body'] = pd_all['Title'].where(
    ~has_body, pd_all['Title'] + '. ' + pd_all['Body']
)


def _export_text_column(frame, text_column, out_path):
    """Write id/Number/sentiment/text to *out_path*, using *text_column* as text.

    Renames "Unnamed: 0" -> "id", "class" -> "sentiment" and
    *text_column* -> "text", writes those four columns (overwriting
    *out_path*) and returns the renamed frame.
    """
    renamed = frame.rename(
        columns={"Unnamed: 0":"id","class":"sentiment",text_column:"text"}
    )
    renamed.to_csv(out_path, index=False, columns=["id","Number","sentiment","text"], mode="w")
    return renamed


# One CSV per text field; each renamed frame keeps its original variable name.
pd_title = _export_text_column(pd_all, 'Title', 'Title.csv')
pd_body = _export_text_column(pd_all, 'Body', 'Body.csv')
pd_label = _export_text_column(pd_all, 'Labels', 'Labels.csv')
pd_code = _export_text_column(pd_all, 'Codes', 'Codes.csv')
pd_comment = _export_text_column(pd_all, 'Comments', 'Comments.csv')
# The output file is named 'Command.csv' (singular) although the column is 'Commands'.
pd_command = _export_text_column(pd_all, 'Commands', 'Command.csv')
pd_tplusb = _export_text_column(pd_all, 'Title+Body', 'Title+Body.csv')

In [10]:
# Choose the classifier. One flag is expected to be True; if several are
# set, the first in the order NB, SVC, RF, MLP, KNN wins.
NB_FLAG = True
SVC_FLAG = False
RF_FLAG = False
MLP_FLAG = False
KNN_FLAG = False

# Number of repeated random train/test splits
REPEAT = 30

# Resolve the flags into a short tag (used in output file names) and the
# hyper-parameter grid handed to GridSearchCV.
if NB_FLAG:
    clf_name = 'NB'
    params = {'var_smoothing': np.logspace(0,-9, num=20)}
elif SVC_FLAG:
    clf_name = 'SVC'
    params = {
        'C': [math.pow(2,x) for x in [-5,0,5,10]],
        'gamma': [1/(2*math.pow(math.pow(2,x),2)) for x in [-15,-10,-5,0,5]],
    }
elif RF_FLAG:
    clf_name = 'RF'
    params = {'n_estimators': [25,50,75,100]}
elif MLP_FLAG:
    clf_name = 'MLP'
    params = {'hidden_layer_sizes': list(range(10,60,10)), 'beta_1': [0.5]}
elif KNN_FLAG:
    clf_name = 'KNN'
    params = {'n_neighbors': [5, 11, 15, 21, 25, 33]}
else:
    # Without this guard the script failed later with a NameError on `params`.
    raise ValueError(
        'No classifier selected: set one of NB_FLAG, SVC_FLAG, RF_FLAG, '
        'MLP_FLAG or KNN_FLAG to True.'
    )

# Set output file name
out_csv_name = f'../{project}+{clf_name}.csv'


def _make_classifier(name, seed):
    """Return a fresh, unfitted estimator for the classifier tag *name*.

    *seed* is passed as ``random_state`` to SVC and RandomForestClassifier
    only; the other estimators are created with their defaults.
    """
    if name == 'NB':
        return GaussianNB()
    if name == 'SVC':
        return SVC(random_state=seed)
    if name == 'RF':
        return RandomForestClassifier(random_state=seed)
    if name == 'MLP':
        return MLPClassifier()
    return KNeighborsClassifier()


def _mean(values):
    """Arithmetic mean of a non-empty list of numbers."""
    return sum(values) / len(values)


data = pd.read_csv('Title+Body.csv')
data = data.fillna('')
text = 'text'

# Real copy, so the uncleaned rows stay available (previously this was only
# an alias of `data` and was modified together with it).
original_data = data.copy()
for cleaner in (remove_html, remove_emoji, remove_stopwords, clean_str):
    data[text] = data[text].apply(cleaner)

# The TF-IDF matrix does not depend on the split, so it is computed once
# instead of once per repeat. .toarray() yields a plain ndarray; .todense()
# returned an np.matrix, which recent scikit-learn versions reject.
# NOTE(review): the vectorizer is fitted on all rows, including those that
# later land in the test split, so vocabulary/IDF statistics see test text.
# Left unchanged to keep the reported numbers comparable - confirm intent.
tfidf = TfidfVectorizer(ngram_range=(1,1),max_features=1000)
X = tfidf.fit_transform(data[text]).toarray()
indices = np.arange(X.shape[0])

repeated_times = REPEAT
macro = []
micro = []
auc_value = []
for repeated_time in range(repeated_times):
    # 80/20 split, seeded with the repeat number.
    train_index,test_index = train_test_split(indices, test_size=0.2, random_state=repeated_time)
    X_train = X[train_index]
    X_test = X[test_index]
    train_labels = data['sentiment'].iloc[train_index]
    test_labels = data['sentiment'].iloc[test_index]

    clf = _make_classifier(clf_name, repeated_time)

    # 10-fold grid search on the training split. With the default
    # refit=True, best_estimator_ is already fitted on the whole training
    # split, so no second fit() call is needed.
    grid = GridSearchCV(clf,params,cv=10,scoring='roc_auc')
    grid.fit(X_train, train_labels)
    optimised_clf = grid.best_estimator_
    y_pred = optimised_clf.predict(X_test)
    y_true = test_labels

    current_macro = f1_score(y_true,y_pred,average='macro')
    macro.append(current_macro)
    current_micro = f1_score(y_true,y_pred,average='micro')
    micro.append(current_micro)
    # NOTE(review): the ROC curve is built from hard predicted labels, not
    # from scores/probabilities - kept as is, confirm this is intended.
    fpr, tpr, thresholds = roc_curve(y_true,y_pred,pos_label=1)
    current_auc = auc(fpr,tpr)
    auc_value.append(current_auc)

    # Split the test rows of this repeat into misclassified / correctly
    # classified ones, identified by their 'Number' value.
    test_numbers = original_data['Number'].iloc[test_index].to_numpy()
    true_values = y_true.to_numpy()
    is_wrong = true_values != y_pred
    is_right = true_values == y_pred

    df_diff = pd.DataFrame({
        'numbers_diff': test_numbers[is_wrong],
        'ytures_diff': true_values[is_wrong],
        'ypreds_diff': y_pred[is_wrong],
    })
    df_same = pd.DataFrame({
        'numbers_same': test_numbers[is_right],
        'ytures_same': true_values[is_right],
        'ypreds_same': y_pred[is_right],
    })
    # Running means over the repeats finished so far (macro F1, micro F1, AUC).
    results = pd.Series([_mean(macro),_mean(micro),_mean(auc_value)], name='results')
    df_diff = pd.concat([df_diff,results], axis=1)
    df_same = pd.concat([df_same,results], axis=1)
    # Overwritten on every repeat, so the files end up describing the last
    # one. The name carries the selected classifier's tag; it used to say
    # 'MLP' regardless of which classifier was run.
    df_diff.to_csv(f'../df_diff_{clf_name}_{project}.csv')
    df_same.to_csv(f'../df_same_{clf_name}_{project}.csv')

# Append one summary row (no header) to the per-classifier log file.
# DataFrame.append() no longer exists in pandas 2.x, so the row is written
# directly with the same column order.
log_columns = ['repeated_times','cv_list','Macro F1', 'Micro F1', 'AUC']
new_row = pd.DataFrame({'repeated_times':[repeated_times],'cv_list':[str(auc_value)],'Macro F1': [_mean(macro)], 'Micro F1': [_mean(micro)], 'AUC': [_mean(auc_value)]})
df_log = new_row[log_columns]
df_log.to_csv(out_csv_name, mode='a', header=False)