# Import Modules

In [None]:
!pip install fasttext

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score,auc,roc_curve
from sklearn.model_selection import KFold,train_test_split,StratifiedKFold,StratifiedShuffleSplit
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
import ast
from imblearn.over_sampling import RandomOverSampler,SMOTE

# Data loading

In [None]:
!git clone https://github.com/anonymoususr12/MHPurf

In [None]:
%cd MHPurf/data
project = 'tensorflow' # select in [tensorflow, pytorch, keras, incubator-mxnet, caffe]
path = f'{project}.csv'
pd_all = pd.read_csv(path)
# Shuffle the rows once; the fixed seed keeps the order identical between runs.
pd_all = pd_all.sample(frac=1,random_state=999)

# 'Title+Body' is "<Title>. <Body>" when a body exists and just the title
# otherwise.  It is built in a single vectorized step instead of assigning
# through `pd_all[col].iloc[idx] = ...`: that chained form writes into a
# temporary object, which pandas warns about and which does not update
# `pd_all` at all once Copy-on-Write is active.
# `astype(str)` only makes the concatenation safe for non-string cells; rows
# whose Body is missing take the plain-title value of `where`.
body_is_missing = pd_all['Body'].isna()
title_and_body = pd_all['Title'] + '. ' + pd_all['Body'].astype(str)
pd_all['Title+Body'] = pd_all['Title'].where(body_is_missing, title_and_body)

# Column layout shared by every exported CSV.
EXPORT_COLUMNS = ["id", "Number", "sentiment", "text"]


def _export_text_column(source_column, file_name):
    """Write one text column of ``pd_all`` to ``file_name`` in the common layout.

    The frame is renamed so that "Unnamed: 0" becomes "id", "class" becomes
    "sentiment" and ``source_column`` becomes "text"; only ``EXPORT_COLUMNS``
    are written, without the index, overwriting any existing file.

    Returns the renamed DataFrame.
    """
    renamed = pd_all.rename(columns={"Unnamed: 0": "id",
                                     "class": "sentiment",
                                     source_column: "text"})
    renamed.to_csv(file_name, index=False, columns=EXPORT_COLUMNS, mode="w")
    return renamed


pd_title = _export_text_column('Title', 'Title.csv')
pd_body = _export_text_column('Body', 'Body.csv')
pd_label = _export_text_column('Labels', 'Labels.csv')
pd_code = _export_text_column('Codes', 'Codes.csv')
pd_comment = _export_text_column('Comments', 'Comments.csv')
# NOTE: the file is 'Command.csv' (singular) although the column is 'Commands';
# the name is kept as-is because other code may read it under this name.
pd_command = _export_text_column('Commands', 'Command.csv')
pd_tplusb = _export_text_column('Title+Body', 'Title+Body.csv')

In [None]:
%cd all_exc
import pandas as pd
import csv
import numpy as np

# --- Experiment configuration -------------------------------------------
# When True, the result file gets an extra suffix so autotuned runs are
# logged separately.
AUTOTUNE_FLAG = False

# `repeated_range` yields the run ids 0 .. REPEAT-1.
REPEAT = 30
repeated_range = range(REPEAT)

# Input corpus (read from the current directory) and result file (written
# one directory up, named after the selected project).
working_path = 'Title+Body.csv'
autotune_suffix = '+_AUTOTUNE' if AUTOTUNE_FLAG else ''
out_csv_name = f'../fasttext_{project}' + autotune_suffix + '.csv'

# Load the corpus and replace every missing cell with an empty string.
data = pd.read_csv(working_path).fillna('')

# remove html tag
def remove_html(text):
    """Return ``text`` with every ``<...>`` span deleted.

    The match is non-greedy, so each tag is removed on its own and the text
    between tags is kept.  ``.`` does not match a newline, so a span whose
    ``<`` and ``>`` sit on different lines is left untouched.
    """
    tag_pattern = r'<.*?>'
    return re.sub(tag_pattern, '', text)

# Strip tag-like spans from every document in the corpus.
data['text'] = data['text'].apply(remove_html)

# remove emoji
def remove_emoji(text):
    """Return ``text`` with every character in the listed code-point ranges deleted.

    Runs of matching characters are removed outright (nothing is inserted in
    their place).  Note that the last range runs from U+24C2 all the way to
    U+1F251, so besides pictographic symbols it also deletes everything in
    between, including CJK ideographs and Hangul.
    """
    code_point_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport symbols
        "\U0001F1E0-\U0001F1FF",  # flags
        "\U00002702-\U000027B0",  # part of the Dingbats block
        "\U000024C2-\U0001F251",  # very wide catch-all range (see docstring)
    )
    matcher = re.compile("[" + "".join(code_point_ranges) + "]+", flags=re.UNICODE)
    return matcher.sub('', text)
# Delete emoji (and the other characters covered by remove_emoji) from every document.
data['text'] = data['text'].apply(remove_emoji)

# Stop Word Removal
# English stop words from NLTK plus project-specific extras.
NLTK_stop_words_list = stopwords.words('english')
custom_stop_words_list = ['...']
final_stop_words_list = NLTK_stop_words_list + custom_stop_words_list
# Hashed copy of the list above: membership is tested once per token of every
# document, so a set lookup replaces a linear scan of the whole list.
_final_stop_words = frozenset(final_stop_words_list)


def remove_stopwords(text):
    """Return ``text`` with all stop-word tokens dropped.

    ``text`` is converted with ``str()``, split on whitespace, and the tokens
    that are not in the stop-word collection are re-joined with single
    spaces.  The comparison is exact and case-sensitive, so a capitalised
    token such as "The" is kept.
    """
    return " ".join(word for word in str(text).split() if word not in _final_stop_words)


data['text'] = data['text'].apply(remove_stopwords)

# Symbol Removal
def clean_str(string):
    """Normalise ``string`` into lower-cased, space-separated tokens.

    Steps, in order:
      1. every character outside ``A-Za-z0-9(),.!?'`` and the backtick becomes a space
         (this includes whitespace other than a plain space, backslashes and
         double quotes);
      2. the contractions ``'s`` and ``'ve`` are split off from the preceding word;
      3. ``)`` and ``?`` are padded with a space on both sides;
      4. runs of two or more whitespace characters collapse to one space;
      5. remaining backslashes, apostrophes and double quotes are deleted;
      6. the result is stripped and lower-cased.

    Because apostrophes are deleted after the whitespace collapse, a
    stand-alone apostrophe can leave two adjacent spaces in the result.

    Returns the cleaned string.
    """
    string = re.sub(r"[^A-Za-z0-9(),.!?\'\`]", " ", string)
    string = re.sub(r"'s", " 's", string)
    string = re.sub(r"'ve", " 've", string)
    # The replacements used to be written as " \) " / " \? " in non-raw
    # strings.  Those are invalid escape sequences (a SyntaxWarning on recent
    # Python versions) and inserted a literal backslash that was only removed
    # again further down; plain " ) " / " ? " give the same final text.
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    string = re.sub(r"\\", "", string)
    string = re.sub(r"'", "", string)
    string = re.sub(r"\"", "", string)
    return string.strip().lower()

# Run the symbol clean-up and lower-casing over every document.
data['text'] = data['text'].apply(clean_str)

import fasttext

# Prefix every class value with '__label__' (the value is converted with str()).
data['sentiment'] = data['sentiment'].map(lambda label: '__label__' + str(label))

# Second name bound to the same DataFrame object (an alias, not a copy).
original_body = data

# Prefix carried by every value in data['sentiment'] and by fastText's predictions.
LABEL_PREFIX = '__label__'
# Columns of one result row appended to `out_csv_name`.
LOG_COLUMNS = ['repeated_times', 'cv_list', 'Macro F1', 'Micro F1', 'AUC']


def _write_fasttext_file(frame, file_name):
    """Write the 'sentiment' and 'text' columns of ``frame`` as "<label> <text>" lines.

    No header, index or quoting is written; the space separator is also the
    escape character, so spaces inside the text are emitted doubled.
    """
    frame[['sentiment', 'text']].to_csv(file_name,
                                        index=False,
                                        sep=' ',
                                        header=False,
                                        quoting=csv.QUOTE_NONE,
                                        quotechar="",
                                        escapechar=" ")


def _label_to_int(label):
    """Turn '__label__<n>' into the integer <n>.

    Everything after the prefix is parsed, so class ids with more than one
    digit work too (indexing a single character only handled 0-9).
    """
    return int(label[len(LABEL_PREFIX):])


# Per-run scores, in run order.
macro = []
micro = []
auc_value = []
for repeated_time in repeated_range:
    # 80/20 train/test split, seeded with the run id so each run is reproducible.
    indices = np.arange(data.shape[0])
    train_index, test_index = train_test_split(indices, test_size=0.2, random_state=repeated_time)

    train_data = data.iloc[train_index]
    test_data = data.iloc[test_index]

    if AUTOTUNE_FLAG:
        # Hold out 20% of the training part for autotuning.  Seeded like the
        # outer split; it used to be unseeded, which made autotuned runs
        # impossible to reproduce.
        train2_data, valid_data = train_test_split(train_data, test_size=0.2, random_state=repeated_time)

        _write_fasttext_file(train2_data, 'train.txt')
        _write_fasttext_file(valid_data, 'valid.txt')
        _write_fasttext_file(test_data, 'test.txt')

        classifier = fasttext.train_supervised(input="train.txt", autotuneValidationFile="valid.txt")
    else:
        _write_fasttext_file(train_data, 'train.txt')
        classifier = fasttext.train_supervised(input="train.txt")

    # predict() returns (labels, probabilities); [0][0] is the top label.
    y_pred = [_label_to_int(classifier.predict(text)[0][0]) for text in test_data['text']]
    y_true = [_label_to_int(label) for label in test_data['sentiment']]

    current_macro = f1_score(y_true, y_pred, average='macro')
    macro.append(current_macro)
    current_micro = f1_score(y_true, y_pred, average='micro')
    micro.append(current_micro)
    # The ROC curve is built from the hard class predictions, not from scores.
    fpr, tpr, thresholds = roc_curve(y_true, y_pred, pos_label=1)
    current_auc = auc(fpr, tpr)
    auc_value.append(current_auc)

    # 'cv_list' holds the AUC values of all runs so far, as a string.
    # The key is 'repeated_times' to match LOG_COLUMNS; with the former
    # 'repeated_time' key that column was written empty and the run id
    # ended up in an extra trailing column.
    new_row = {'repeated_times': repeated_time,
               'cv_list': str(auc_value),
               'Macro F1': current_macro,
               'Micro F1': current_micro,
               'AUC': current_auc}
    # Build the one-row frame directly: DataFrame.append no longer exists
    # in pandas 2.x.
    df_log = pd.DataFrame([new_row], columns=LOG_COLUMNS)
    df_log.to_csv(out_csv_name, mode='a', header=False)