In [None]:
############################################################################################
############################################################################################
   ###################### Desk Reject Project: Creating the model
############################################################################################
############################################################################################

#################################################
  ###### SQL request to get data; then put data in pandas df.
#################################################
############################
  #### Make a connection to the EM data source and execute the query:
############################
import pymssql
import pandas as pd
# Connection settings for the EM data source (real values are withheld in this copy).
# NOTE(review): credentials are written inline here; consider reading them from
# environment variables or a secrets store instead.
# NOTE: per the original author, passing `database` is not required but seems to
# speed up the query.
em_connection_settings = dict(
    server='*WITHHELD*',
    user='*WITHHELD*',
    password='*WITHHELD*',
    database='*WITHHELD*',
)
conn = pymssql.connect(**em_connection_settings)

# Submissions whose ORIGINAL_SUBMISSION_START_DATE lies between 2012-01-10 and
# 2020-03-31 and that have a PUBDNUMBER, LEFT JOINed to the author/address table
# on DOCUMENTID (so documents without a matching author row are still returned).
submissions_sql = """
    SELECT d.PUBDNUMBER, d.DOCUMENTID, d.DTITLE, d.SHORT_TITLE, d.ABSTRACT_TEXT, d.ORIGINAL_SUBMISSION_START_DATE,
        d.CATEGORY, d.ENTER_KEYWORDS, d.REQUEST_EDITOR, d.SUGGEST_REVIEWERS, d.OPPOSE_REVIEWERS,
        a.DEPARTMENT, a.INSTITUTE, a.COUNTRY, a.PTITLE, a.POSITION, a.ALLAUTHORS, 
        a.ALL_AUTHOR_CONTRIBUTOR_ROLES, a.SECTIONNAME, a.FUNDER_NAME
    FROM pone.dbo.DOCUMENT AS d
    LEFT JOIN pone.dbo.ROLEAUTH_DOC_PEOPLE_ADDR AS a on d.DOCUMENTID = a.DOCUMENTID
    WHERE d.ORIGINAL_SUBMISSION_START_DATE >= '2012-01-10 00:00:00.000' AND d.ORIGINAL_SUBMISSION_START_DATE <= '2020-03-31 00:00:00.000' AND d.PUBDNUMBER IS NOT NULL
    """

# `as_dict=True` makes each fetched row a dict keyed by column name, which is
# what lets the rows be turned into a pandas DataFrame afterwards.
cur = conn.cursor(as_dict=True)
cur.execute(submissions_sql)


############################
  #### Put the data into a pandas df
############################
# Read the whole result set of the query executed on `cur` and turn it into a
# DataFrame (one row per result row, one column per selected field).
#
# Why fetchall(): testing `cur.fetchone() is None` and then calling
# `cur.fetchone()` again to read the row consumes two rows per iteration, so
# every other row of the result set is silently lost. Fetching all rows in a
# single call keeps every row and needs no arbitrary iteration cap.
# NOTE(review): `conn` is left open here; close it once no further queries are needed.
fetched_rows = cur.fetchall()
data = pd.DataFrame(fetched_rows)


############################
  #### De-duplicate DOCUMENTID, taking only the 1st row
############################
# Keep only the first row seen for each DOCUMENTID.
data = data.drop_duplicates(subset=['DOCUMENTID'], keep= 'first')


############################
  #### Import the list of MS numbers that are desk rejects and make dv in main data (1 = desk reject 0 = not):
############################
import numpy as np
dv = pd.read_csv("All desk rejects since 2012.csv")

# One row per manuscript number before merging: a left merge emits one output
# row per matching right-hand row, so repeated manuscript numbers in the CSV
# would duplicate submissions and undo the de-duplication above.
# Rows without a Date_Submit are dropped first because they could never set the
# flag below (the flag is "Date_Submit is present after the merge").
desk_reject_dates = (
    dv[['ManuscriptNumber', 'Date_Submit']]
    .dropna(subset=['Date_Submit'])
    .drop_duplicates(subset=['ManuscriptNumber'], keep='first')
)
data = pd.merge(data, desk_reject_dates, how='left', left_on = ['PUBDNUMBER'], right_on = ['ManuscriptNumber'])

# desk_reject = 1 when the submission matched a desk-reject record, else 0.
# A Series (single brackets) keeps the np.where result one-dimensional.
data['desk_reject'] = np.where(data['Date_Submit'].isnull(), 0, 1)
#### Check it worked:
#data[data['Date_Submit'].notnull()]


############################
  #### Drop unnecessary column from dv:
############################
data = data.drop(['ManuscriptNumber'], axis=1)


############################
  #### You can subset the data by year if you'd like here (e.g. compare model for only after 2016):
############################
################################################################## DISABLED
####### Create date only variable from ORIGINAL_SUBMISSION_START_DATE:
#from datetime import datetime
#temp = []
#for i in range(len(data)):
#    a = datetime.strptime(data['ORIGINAL_SUBMISSION_START_DATE'][i].split(' ')[0], '%Y-%m-%d')
#    temp.append(a)
#data['date'] = temp

#data = data[data['date'] > '2016-01-01'].reset_index(drop = True)



In [None]:
############################
  #### Save a copy of your data so you don't have to re-get the SQL:
############################
# Independent (deep) snapshot of the freshly built frame, so later cells can be
# re-run from this point without repeating the SQL query.
saved_copy_of_data = data.copy(deep=True)

############################
  #### Load the copy of your saved initial data:
############################
#data = saved_copy_of_data

############################
  #### Subset the data by year (e.g. compare model for only after 2017):
############################
####### Create date only variable from ORIGINAL_SUBMISSION_START_DATE:
#from datetime import datetime
#temp = []
#for i in range(len(data)):
#    a = datetime.strptime(str(data['ORIGINAL_SUBMISSION_START_DATE'][i]).split(' ')[0], '%Y-%m-%d')
#    temp.append(a)
#data['date'] = temp

#data = data[data['date'] > '2017-01-01'].reset_index(drop = True)
#data = data[data['date'] < '2020-04-01'].reset_index(drop = True)



## Create and then pare down free-text variables

In [None]:
#################################################
  ###### USING A COUNT VECTORIZER TO CREATE WORD / BI-GRAM / TRI-GRAM FREQUENCIES:
    # NOTE: Counts are better than TF-IDFs in this case: same model quality metrics, but TF-IDFs require re-uploading all other values while word counts do not.
       # See supplemental code: TF-IDFs may be better for other projects.
#################################################
#####################
 # Create a sparse matrix of n-gram counts, variable by variable, for the following variables:
  ## DTITLE
  ## SHORT_TITLE
  ## ABSTRACT_TEXT
  ## DEPARTMENT
  ## INSTITUTE
  ## PTITLE
  ## ALL_AUTHOR_CONTRIBUTOR_ROLES
  ## FUNDER_NAME
#####################
###############
 # The following is a count vectorizer: This gives a frequency of words, bigrams and trigrams (with ngram_range as 1,3)
    # ALSO: a word or bi-gram or tri-gram in less than 2 docs or more than 80% of docs is removed).
    # INCLUDES stopwords
###############
from sklearn.feature_extraction.text import CountVectorizer
def _fit_ngram_counts(text_column):
    """Fit a fresh CountVectorizer on one column of the global `data` frame.

    The column is cast to `str` before vectorizing, so missing entries are
    vectorized as whatever text that cast produces rather than being skipped.
    Settings: 1- to 3-grams, keeping only terms found in at least 2 documents
    and in at most 80% of documents.

    Returns:
        (vectorizer, counts): the fitted vectorizer and the document-term count
        matrix returned by `fit_transform`.
    """
    vectorizer = CountVectorizer(max_df=0.8, min_df=2, ngram_range=(1, 3))
    counts = vectorizer.fit_transform(data[text_column].astype(str))
    return vectorizer, counts


# One vectorizer / count matrix pair per free-text column; later cells rely on
# these exact variable names.
vect, doc_term_matrix = _fit_ngram_counts('DTITLE')
vect_2, doc_term_matrix_2 = _fit_ngram_counts('SHORT_TITLE')
vect_3, doc_term_matrix_3 = _fit_ngram_counts('ABSTRACT_TEXT')
vect_4, doc_term_matrix_4 = _fit_ngram_counts('DEPARTMENT')
vect_5, doc_term_matrix_5 = _fit_ngram_counts('INSTITUTE')
vect_6, doc_term_matrix_6 = _fit_ngram_counts('PTITLE')
vect_7, doc_term_matrix_7 = _fit_ngram_counts('ALL_AUTHOR_CONTRIBUTOR_ROLES')
vect_8, doc_term_matrix_8 = _fit_ngram_counts('FUNDER_NAME')




## Create non-free text variables:

In [None]:
#################################################
  ###### Create variables: Make into dummy variables
#################################################
############################
  #### Not_filled then drop:
############################
####### POSITION
import numpy as np
# POSITION and FUNDER_NAME are each reduced to a single "not filled in" flag
# (1 when the value is null, a single space, or an empty string), after which
# the source column is dropped. 'POSITION_not_filled' must stay the first
# engineered column: a later step slices the feature block starting from it.
for source_col in ('POSITION', 'FUNDER_NAME'):
    values = data[source_col]
    is_blank = values.isnull() | (values == " ") | (values == "")
    data[source_col + '_not_filled'] = np.where(is_blank, 1, 0)
    data = data.drop(source_col, axis=1)


############################
  #### CATEGORY: DUE TO LOW NUMBERS ON MANY OF THEM, I ONLY USED:
      # 'Research Article'
      # 'Research Articles'
      # Either 'Research Article' or 'Research Articles'
############################
# (dummy column, CATEGORY values that set it to 1), in the order the columns are created.
category_dummies = [
    ('CATEGORY_clinical_trial', ['Clinical Trial', 'Clinical trial']),
    ('CATEGORY_Research_Article', ['Research Article']),
    ('CATEGORY_Research_Articles', ['Research Articles']),
    ('CATEGORY_RA_or_RAs', ['Research Article', 'Research Articles']),
]
for dummy_name, accepted_values in category_dummies:
    data[dummy_name] = np.where(data['CATEGORY'].isin(accepted_values), 1, 0)


############################
  #### SECTIONNAME: Create 20 dummy variables OR create specific ones; Also create a 'not_filled'
############################
# Flag rows whose SECTIONNAME is null, a single space, or an empty string.
section_names = data['SECTIONNAME']
data['SECTIONNAME_not_filled'] = np.where(
    section_names.isnull() | (section_names == " ") | (section_names == ""), 1, 0)
#data = pd.get_dummies(data, columns=['SECTIONNAME'])

####### Only using specific SECTIONNAMES with higher/lower importance:
# (dummy column, SECTIONNAME value that sets it to 1), in column-creation order.
section_dummies = [
    ('SECTIONNAME_Life & Social Sciences', 'Life & Social Sciences'),
    ('SECTIONNAME_Life Sciences', 'Life Sciences'),
    ('SECTIONNAME_Other', 'Other'),
    ('SECTIONNAME_Medicine and Health Sciences', 'Medicine and Health Sciences'),
    # The doubled 'SECTIONNAME_' prefix in this column name is kept so existing
    # feature names do not change. The value compared against must NOT carry the
    # prefix: matching on 'SECTIONNAME_Medicine and public health' can never
    # succeed, which would leave this dummy 0 for every row.
    # NOTE(review): assumes the stored section value is 'Medicine and public health' - confirm.
    ('SECTIONNAME_SECTIONNAME_Medicine and public health', 'Medicine and public health'),
    ('SECTIONNAME_Environmental Sciences', 'Environmental Sciences'),
    ('SECTIONNAME_Clinical', 'Clinical'),
    ('SECTIONNAME_Physical sciences and engineering', 'Physical sciences and engineering'),
    ('SECTIONNAME_Clinical Sciences', 'Clinical Sciences'),
    ('SECTIONNAME_Applied Mathematics', 'Applied Mathematics'),
    ('SECTIONNAME_Earth Sciences', 'Earth Sciences'),
    ('SECTIONNAME_Social and Behavioral Sciences', 'Social and Behavioral Sciences'),
]
for dummy_name, section_value in section_dummies:
    data[dummy_name] = np.where(data['SECTIONNAME'] == section_value, 1, 0)




############################
  #### COUNTRY: 'anything filled in vs. not' ... Dummy variables for these: PAKISTAN, ETHIOPIA, INDIA, CHINA, UNITED STATES
    # NOTE: You could also try out EVERY country: just use the disabled code
############################
####### Using ALL countries as separate dummy variables:
#data['COUNTRY_not_filled'] = np.where(data[['COUNTRY']].isnull(), 1, 0)
#data = pd.get_dummies(data, columns=['COUNTRY'])

####### Only using specific countries with higher/lower importance::
# 1 when COUNTRY is null (blank strings are not treated as missing here).
data['COUNTRY_not_filled'] = np.where(data['COUNTRY'].isnull(), 1, 0)

# One dummy per selected country; each column is named 'COUNTRY_<value>' and
# created in the order listed.
selected_countries = [
    'AUSTRALIA', 'BANGLADESH', 'CANADA', 'CHINA', 'EGYPT', 'ETHIOPIA',
    'GERMANY', 'INDIA', 'IRAN, ISLAMIC REPUBLIC OF', 'MALAYSIA',
    'NETHERLANDS', 'NIGERIA', 'PAKISTAN', 'SAUDI ARABIA',
    'UNITED KINGDOM', 'UNITED STATES',
]
for country in selected_countries:
    data['COUNTRY_' + country] = np.where(data['COUNTRY'] == country, 1, 0)




#################################################
  ###### Create variables: Varied:
#################################################
############################
  #### DEPARTMENT: (a) Has only one word (b) not filled in
############################
### One word:
# 1 when the DEPARTMENT text (after casting to str) contains a space, i.e. it
# is longer than one word.
department_text = data['DEPARTMENT'].astype(str)
data['DEPARTMENT_over_one_word'] = np.where(department_text.str.contains(" "), 1, 0)
### Not filled in (null or empty string):
department_raw = data['DEPARTMENT']
data['DEPARTMENT_not_filled'] = np.where(department_raw.isnull() | (department_raw == ""), 1, 0)

####### Number of characters in DEPARTMENT.
# The length is taken of str(value), so a missing value counts the characters
# of its string form rather than 0.
data['DEPARTMENT_num_characters'] = [len(str(value)) for value in data['DEPARTMENT']]



############################
  #### ALLAUTHORS: Count the number of semicolons + 1. Use as ratio but also turn into a binned (dummy) variable
############################
####### Create a ratio variable of the number of authors (minus 1):
# Number of ';' characters in ALLAUTHORS (of str(value), so a missing value gives 0).
# NOTE(review): if authors are ';'-separated this is the author count minus 1,
# which would make the bins below cover 2 authors and 3-4 authors rather than
# what their names suggest - confirm against the ALLAUTHORS format.
data['ALLAUTHORS_ratio_count'] = [str(authors).count(';') for authors in data['ALLAUTHORS']]

####### Some have a ton of authors so bin instead: 1; 2-3; 4-5; 6-7; 8-10; 11-14; 15-20; 21-35; 35+
############# ONLY USE 2-3 AND RATIO
separator_count = data['ALLAUTHORS_ratio_count']
data['ALLAUTHORS_1'] = np.where(separator_count == 1, 1, 0)
data['ALLAUTHORS_2_3'] = np.where(separator_count.isin([2, 3]), 1, 0)


############################
  #### INSTITUTE: Anything filled in vs. not' THEN number of characters (ratio) THEN number of words (ratio) THEN Words over 4 THEN words over 6
############################
####### Create variable: INSTITUTE not filled in:
# 1 when INSTITUTE is null, a single space, or an empty string.
institute = data['INSTITUTE']
institute_blank = institute.isnull() | institute.isin([" ", ""])
data['INSTITUTE_not_filled'] = np.where(institute_blank, 1, 0)

####### Number of characters in INSTITUTE (length of str(value)):
data['INSTITUTE_num_characters'] = [len(str(value)) for value in institute]


############################
  #### ORIGINAL_SUBMISSION_START_DATE: Dummy variables for each day of the week; Dummy variable for each month:
############################
####### Create date only variable from ORIGINAL_SUBMISSION_START_DATE:
from datetime import datetime
# Keep only the 'YYYY-MM-DD' part (the text before the first space) of each
# submission timestamp and parse it into a datetime.
data['date'] = [
    datetime.strptime(str(raw_value).split(' ')[0], '%Y-%m-%d')
    for raw_value in data['ORIGINAL_SUBMISSION_START_DATE']
]
data.tail()

####### Create variables for each day of the week:
  # NOTE: Monday is 0 and Sunday is 6.
# Weekday number of each submission date (Monday is 0, Sunday is 6).
# Only the seven dummy columns are added to `data`; no intermediate
# 'dayofweek' column is left behind.
weekday_numbers = data['date'].dt.weekday
weekday_suffixes = ['mon', 'tue', 'wed', 'thur', 'fri', 'sat', 'sun']
for weekday_number, suffix in enumerate(weekday_suffixes):
    data['dayofweek_' + suffix] = np.where(weekday_numbers == weekday_number, 1, 0)


####### Create dummy variables for each month:
# Calendar month (1-12) of each submission date, expanded into month_1 ... month_12.
# Only the twelve dummy columns are added to `data`; no intermediate 'month'
# column is left behind.
month_numbers = data['date'].dt.month
for month_number in range(1, 13):
    data['month_%d' % month_number] = np.where(month_numbers == month_number, 1, 0)





#################################################
  ###### Combine pandas data of non-free text into the main I.V. list: the 'X_list' sparse matrix)
#################################################
############################
  #### Accumulate all pandas columns to include in analysis:
############################
####### Make sure to keep these 2 pre-created variables:
# Re-create the two pre-existing flags as 0/1 columns at the end of the frame
# so they fall inside the feature slice taken below.
for flag_source in ('REQUEST_EDITOR', 'OPPOSE_REVIEWERS'):
    data[flag_source + '_2'] = np.where(data[flag_source] == 1, 1, 0)

##### The feature block is every column from 'POSITION_not_filled' to the end,
   # once the helper 'date' column has been removed.
data = data.drop(columns=['date'])
first_feature_position = data.columns.get_loc('POSITION_not_filled')
for_x_list = data.iloc[:, first_feature_position:]





In [None]:
#################################################
  ###### Combing ALL free text words and ngrams with the non-free text variables
#################################################
#####################
 # Add the sparse matrices together as one variable list: https://stackoverflow.com/questions/19710602/concatenate-sparse-matrices-in-python-using-scipy-numpy
     # This 'X_list' will be your main list of all independent variables
#####################
####### Combining the sparse matricies:
from scipy.sparse import hstack
# Column order here must match the order in which the feature names are
# concatenated into `full_features` further down.
text_count_matrices = [
    doc_term_matrix,
    doc_term_matrix_2,
    doc_term_matrix_3,
    doc_term_matrix_4,
    doc_term_matrix_5,
    doc_term_matrix_6,
    doc_term_matrix_7,
    doc_term_matrix_8,
]
X_list = hstack(text_count_matrices)


#####################
 # Combine the feature names AND which variable they come from so you know what is what:
#####################
def _vocabulary_frame(vectorizer, source_column):
    """Return the fitted vectorizer's terms as a frame tagged with their source column.

    Uses `get_feature_names_out()` when the vectorizer provides it and falls
    back to the older `get_feature_names()` otherwise, so the cell works on
    scikit-learn versions that offer only one of the two methods.

    Args:
        vectorizer: a fitted CountVectorizer.
        source_column: name of the `data` column the vectorizer was fitted on.

    Returns:
        DataFrame with the term in column 0 and `source_column` in 'main_var'.
    """
    name_getter = getattr(vectorizer, 'get_feature_names_out', None)
    if name_getter is None:
        name_getter = vectorizer.get_feature_names
    frame = pd.DataFrame(list(name_getter()))
    frame['main_var'] = source_column
    return frame


####### Combining all the feature names
# Same order as the matrices stacked into X_list, so row i of `full_features`
# names column i of X_list.
vectorizers_and_sources = [
    (vect, 'DTITLE'),
    (vect_2, 'SHORT_TITLE'),
    (vect_3, 'ABSTRACT_TEXT'),
    (vect_4, 'DEPARTMENT'),
    (vect_5, 'INSTITUTE'),
    (vect_6, 'PTITLE'),
    (vect_7, 'ALL_AUTHOR_CONTRIBUTOR_ROLES'),
    (vect_8, 'FUNDER_NAME'),
]
full_features = pd.concat(
    [_vocabulary_frame(vectorizer, source) for vectorizer, source in vectorizers_and_sources],
    ignore_index=True, sort=False)



############################
  #### Add pandas columns into the sparse matrix:
############################
# Append the engineered (non-text) columns to the right of the n-gram counts.
non_text_values = for_x_list.values
X_list = hstack([X_list, non_text_values])
   # NOTE (original author): category must be an integer not a float.


############################
  #### Add on the pandas column names to the 'full_features' list of feature names
############################
# Name the engineered columns in their own frame instead of overwriting
# `for_x_list`: the hstack earlier in this cell reads `for_x_list` as the
# feature values, so replacing it with a frame of names makes the cell fail
# when it is run a second time.
non_text_feature_names = pd.DataFrame(list(for_x_list.columns))
non_text_feature_names['main_var'] = 'non_text'
full_features = pd.concat([full_features, non_text_feature_names], ignore_index=True, sort=False)





## The following cells:
  ## A: Create test/training data and use SMOTE to correct for imbalanced classification
  ## B: Fit a RandomForest, GradientBoostingClassifier or XGBoost model
  ## C: Print out model metrics.

In [None]:
############################################################################################
############################################################################################
   #################### Desk Reject Project: Modelling 'desk_rejects' with decision trees
############################################################################################
############################################################################################
#################################################
  ###### Set up the test/training data:
#################################################
############################
  #### Set up the test/training data:
############################
####### Separate the DV (desk_reject) and all other variables:
from sklearn.model_selection import train_test_split
# Target: the 0/1 desk-reject flag. Features: a copy of the combined matrix.
y = pd.Series(data['desk_reject'].astype('int64'))
X = X_list.copy()

####### Create the test and train data:
# A fixed seed makes the 80/20 split (and every metric computed from it)
# repeatable between runs; stratifying on y keeps the desk-reject rate the
# same in the training and test parts.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=SPLIT_SEED, stratify=y)



############################
  #### To do train/test/validaion, do a 2nd split: (DISABLED FOR NOW)
############################
### If the first test/train is 0.20, then the following would be: 60% train, 20% test, 20% validation:
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25) # 0.25 x 0.8 = 0.2





#################################################
  ###### Deal with imbalenced classification: SMOTE
#################################################
############################
  #### Deal with imbalanced classification: SMOTE
    # See the following for examples of SMOTE: https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/
    # Either use this or the pure duplication method.
    ## NOTE: Only use SMOTE on the TRAINING data, not the test data. e.g.: https://datascience.stackexchange.com/questions/47228/using-smote-for-synthetic-data-generation-to-improve-performance-on-unbalanced-d
  #### NOTE: The other option is pure duplication: just duplicate the desk reject == 1 rows until the classes are even
############################
####### Sampling on desk_reject == 1 to balance classes:
from imblearn.over_sampling import SMOTE
# Oversample the training data only. The resampling goes through the
# `oversample` instance, so any settings given to it are the ones actually
# applied; the fixed seed makes the synthetic rows repeatable between runs.
SMOTE_SEED = 42
oversample = SMOTE(random_state=SMOTE_SEED)
X_train, y_train = oversample.fit_resample(X_train, y_train)




In [None]:
#################################################
  ###### Different types of classifiers to use:
    # Probably use RandomForestClassifier but check the other models to see what works best for your purposes
#################################################
############################
  #### Using RandomForestClassifier
############################
####### Create the classifier:
from sklearn.ensemble import RandomForestClassifier
#clf=RandomForestClassifier(n_estimators=500) #TRY: max_features = (len(X.columns)-1)
    # NOTE: Due to SMOTE, class_weight = 'balanced' not needed
#clf.fit(X_train,y_train)
####### Get predicted values:
#y_pred=clf.predict(X_test)


############################
  #### Using: GradientBoostingClassifier
    # NOTE: This one can take a very long time (over a day)
############################
####### Create the classifier:
from sklearn.ensemble import GradientBoostingClassifier
#clf=GradientBoostingClassifier()
    ### NOTE: Automatically balances the classes: e.g. answer 2 here: https://stackoverflow.com/questions/35539937/is-there-class-weight-or-alternative-way-for-gradientboostingclassifier-in-skl
#clf.fit(X_train,y_train)
####### Get predicted values:
#y_pred=clf.predict(X_test)


############################
  #### Using: GradientBoostingClassifier WITH xgboost:
    # NOTE: Explaination of xgboost: https://towardsdatascience.com/the-intuition-behind-gradient-boosting-xgboost-6d5eac844920  
  #### Parameters to try out ... e.g see: https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
    # n_estimators increase the number of estimators from default of 100. Use higher number when you have a ton of features (250/1000/5000 etc)
    # parameters to try: 
      # eta default is 0.3 (try 0.1 or 0.2): Lower preforms worse
      # min_child_weight default is 1 (try 2 or 3): Neither is better, just use default.
      # max_depth default is 6 (try 5 or 7/8/9)
      # gamma default is 0 (try 0.1/0.2): lower performs worse
      # subsample default is 1 (try 0.6 or 0.8) Lower preforms worse
      # colsample_bytree default is 1 (try 0.6 or 0.8)    ************************* TRY THIS AGAIN (0.8 PERFORMED BETTER)
############################
####### Create the classifier:
from xgboost import XGBClassifier
#clf = XGBClassifier()
# Per the original author's tuning notes, all defaults plus colsample_bytree = 0.8 worked best.
    ### n_estimators could be raised from the default because of the large number of features.
    ### NOTE(review): the class-balancing remark linked here
    ### (https://stackoverflow.com/questions/35539937/is-there-class-weight-or-alternative-way-for-gradientboostingclassifier-in-skl)
    ### is about sklearn's GradientBoostingClassifier - confirm it also applies to XGBClassifier.
xgb_settings = {'colsample_bytree': 0.8}
clf = XGBClassifier(**xgb_settings)
clf.fit(X_train, y_train)

####### Get predicted values:
y_pred = clf.predict(X_test)







#################################################
  ###### Do several types of accuracy checks:
#################################################
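 # precision (per class): of the rows predicted as that class, the share that really belong to it. Lower precision means more of those predictions are wrong.
 # recall (per class): of the rows that really belong to that class, the share the model predicted as that class. Lower recall means more of them are missed.
    # So the recall of the desk rejects is quite low.
 # f1-score is an overall score combining precision and recall (their harmonic mean).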
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Confusion matrix and per-class report, each followed by a blank line, then overall accuracy.
reports = (
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
)
for report in reports:
    print(report, "\n")
print(accuracy_score(y_test, y_pred))





In [None]:
#################################################
  ###### Examine feature importance scores:
#################################################
####### Print out the features scores in order of score and their column position
# Importance score per feature, highest first. The index of `a` is the
# feature's column position in X, which is also its row in `full_features`.
a = (
    pd.DataFrame(clf.feature_importances_)
    .rename(columns={0: "feature_score"})
    .sort_values(by='feature_score', ascending=False)
)
#a['variable_names'] = list(X.columns)
print(a[0:30])
a[0:250].to_csv(r"feature_importance_raw.csv")                                          # NEW (remove)

####### Print out (and save) the source variable and term for the top 250 features:
to_save = []
for feature_position in list(a.index[0:250]):
    labelled_feature = (
        full_features['main_var'][feature_position],
        " :    ",
        full_features[0][feature_position],
    )
    print(*labelled_feature)
    to_save.append(labelled_feature)   # NEW (remove)
to_save = pd.DataFrame(to_save)                                                         # NEW (remove)
to_save.to_csv(r"feature_importance_var_names.csv")                                     # NEW (remove)



In [None]:
#################################################
  ###### Analyze accuracy metrics after changing the threshold of your predictions:
     ## NOTE: If you go with a different threshold, you'll need to set that up in the associated new-predictions notebook
     ## NOTE: Below doesn't work for decision trees; it works for RandomForest and GradientBoostingClassifier
  ###### MAIN: Changing the threshold can help increase recall of desk rejects, though it will decrease precision.
     # Depends on the business question: do you want to flag potential desk rejects, or reject papers that are sure to be rejected?
#################################################

####### Change around the threshold to optimize the f1-score of 1:
from sklearn.metrics import f1_score, recall_score, precision_score
# Score the test set once: the probabilities do not depend on the threshold,
# so they are computed outside the loop instead of on every iteration.
predicted_proba = clf.predict_proba(X_test)
positive_proba = predicted_proba[:, 1]

# For each threshold 0.03 ... 0.74, store (f1, recall, precision, accuracy)
# keyed by the threshold in hundredths.
# The thresholded predictions use their own name so the model's default
# predictions in `y_pred` are not overwritten by the last threshold tried.
fone = {}
for i in range(3, 75):
    threshold = i / 100  # use this to optimize
    y_pred_at_threshold = (positive_proba >= threshold).astype('int')
    fone[i] = (
        round(f1_score(y_test, y_pred_at_threshold), 3),
        round(recall_score(y_test, y_pred_at_threshold), 3),
        round(precision_score(y_test, y_pred_at_threshold), 3),
        round(accuracy_score(y_test, y_pred_at_threshold), 3),
    )

####### Print out scores of thresholds:
# Sorting on the score tuples orders by f1 first (ties fall through to recall, etc.).
diff_thresh = sorted(fone.items(), key=lambda x: x[1], reverse = True)
print("Threshold,  f1-Score,  Recall,  Precision score, Model Accuracy     ...  Below are sorted by f1 score")
diff_thresh



In [None]:
########### This prints out the confusion matrix for different thresholds:
# Probability cut-off at or above which a test row is tagged as a desk reject.
flag_threshold = 0.70
predicted_proba = clf.predict_proba(X_test)
y_p = (predicted_proba[:, 1] >= flag_threshold).astype('int')
print(confusion_matrix(y_test, y_p), "\n")

# Share of test rows tagged as desk reject (printed as a fraction between 0 and 1).
print("Percent tagged as desk reject:", y_p.sum() / len(y_p))
