# Models tried: Logistic Regression, Naive Bayes, XGBoost, and an ensemble of all three

# Total number of models: 4

In [None]:
import pandas as pd


In [None]:
from nltk.tokenize import word_tokenize
from sklearn import naive_bayes
from sklearn import metrics
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize

In [None]:
## Training data; the cells below read its 'post' and 'label' columns
df = pd.read_csv('train.csv')
## Data to predict on; the cells below read its 'id' and 'post' columns
df_test = pd.read_csv('test.csv')


In [None]:
from nltk.stem import PorterStemmer

## Single stemmer instance shared by stem_post
ps = PorterStemmer()

In [None]:
import itertools
from autocorrect import Speller

In [None]:
def stem_post(row):
    """Stem every whitespace-separated word of a post.

    Args:
        row: Text of a single post (a ``str``).

    Returns:
        The stemmed words joined by single spaces; an empty string when the
        post contains no words.
    """
    # str.join builds the result in one pass and never leaves a trailing
    # separator, so nothing has to be sliced off afterwards.
    return " ".join(ps.stem(word) for word in row.split())

In [None]:
## Stem the training posts and the test posts alike: the prediction cells
## further down read 'stemmed_post' from df_test as well as from df.
df['stemmed_post'] = df['post'].apply(stem_post)
df_test['stemmed_post'] = df_test['post'].apply(stem_post)

In [None]:
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

In [None]:
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

In [None]:
import xgboost as xgb
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import log_loss

In [None]:
## Macro-F1 evaluation metric passed to xgb.train
def f1_eval_mac(predt, d):
    """Return a ("F1_score", value) pair with the macro-averaged F1.

    predt : predictions handed over by xgboost; values above 0.5 count as 1
    d     : data matrix whose ``get_label()`` supplies the true labels
    """
    true_labels = d.get_label()
    hard_labels = (predt > 0.5).astype(int)
    macro_f1 = f1_score(y_true=true_labels, y_pred=hard_labels, average='macro')
    return "F1_score", macro_f1

In [None]:
## train xgb given the train_df,test_df and hyperparameters
def xgbtrain(train_df,test_df,param,count_vec):
    """Train one xgboost booster and score it on a held-out frame.

    Args:
        train_df: Frame with 'stemmed_post' and 'label' columns to fit on.
        test_df: Frame with the same columns, used for evaluation.
        param: Booster parameters plus the extra key "n_round" (number of
            boosting rounds). The dict itself is left unmodified.
        count_vec: Fitted vectorizer turning 'stemmed_post' into features.

    Returns:
        (bst, score): the trained booster and its macro F1 on test_df.

    Raises:
        KeyError: if ``param`` has no "n_round" entry.
    """
    xtrain = count_vec.transform(train_df['stemmed_post'])
    xtest = count_vec.transform(test_df['stemmed_post'])

    dtrain = xgb.DMatrix(xtrain, label=train_df['label'].to_numpy())
    dtest = xgb.DMatrix(xtest, label=test_df['label'].to_numpy())

    # Work on a copy: callers reuse the same dict for every fold, and
    # "n_round" is this notebook's own key rather than a booster parameter,
    # so it is taken out before the dict is handed to xgboost.
    booster_param = dict(param)
    num_round = booster_param.pop("n_round")
    booster_param['eval_metric'] = ['auc']

    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    bst = xgb.train(booster_param, dtrain, num_round, evallist,
                    custom_metric=f1_eval_mac)

    ## Score is the f1 score of this model evaluated on the given test set
    predicted = np.round(bst.predict(dtest))
    score = f1_score(y_true=test_df['label'], y_pred=predicted, average='macro')

    return bst, score

In [None]:
## Vectorizer shared by the xgboost cells: raw term counts (TF rather than
## TF-IDF) over unigrams and bigrams, fitted on the stemmed training posts

count_vec = CountVectorizer(
    ngram_range=(1, 2),
    lowercase=True,
    strip_accents="ascii",
    tokenizer=word_tokenize,
    token_pattern=None,
).fit(df["stemmed_post"])

In [None]:
##  K fold cross validation on xgb for hyperparameter tuning
def k_fold_xgb(df,param,n_fold=10):
    """Run stratified k-fold cross validation of xgboost on ``df``.

    Args:
        df: Frame with 'stemmed_post' and 'label' columns.
        param: Parameter dict forwarded to ``xgbtrain`` for every fold.
        n_fold: Number of stratified folds.

    Returns:
        (bst_list, avg_f1): one trained booster per fold and the average of
        the per-fold macro F1 scores.
    """
    skf = StratifiedKFold(n_splits=n_fold, random_state=1, shuffle=True)
    bst_list = []
    f1_score_list = []
    for train_index, test_index in skf.split(df, df['label']):
        # split() yields positional row numbers, so rows are selected with
        # iloc; label-based loc is only equivalent for a default RangeIndex.
        train_df, test_df = df.iloc[train_index], df.iloc[test_index]
        bst, score = xgbtrain(train_df, test_df, param, count_vec)
        bst_list.append(bst)
        f1_score_list.append(score)

    return bst_list, np.average(f1_score_list)

In [None]:
## Default parameters ("n_round" is the number of boosting rounds read by xgbtrain)
param = {
    "max_depth": 12,
    "eta": 0.1,
    "objective": "binary:logistic",
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "scale_pos_weight": 1.62,
    "alpha": 0.2,
    "min_child_weight": 1,
    "n_round": 400,
}

In [None]:
## Trial run of the cross-validation with the default parameters

## This cell is optional and can be skipped
bst_list, avg_f1_score = k_fold_xgb(df, param, n_fold=10)

In [None]:
## Predict with one trained xgb model on a pd dataframe holding the stemmed posts
def xgb_predict(df,model,count_vec):
    """Return ``model.predict`` output for the 'stemmed_post' column of ``df``.

    df        : frame with a 'stemmed_post' column
    model     : trained xgboost model
    count_vec : fitted vectorizer used to build the feature matrix
    """
    features = count_vec.transform(df['stemmed_post'])
    dmatrix = xgb.DMatrix(features)
    return model.predict(dmatrix)
    

In [None]:
 ## 
def batch_xgb_predict(bst_list,df):
    """Average the predictions of several xgboost models on one frame.

    Args:
        bst_list: Trained xgboost models (e.g. one per cross-validation fold).
        df: Frame with a 'stemmed_post' column to predict on.

    Returns:
        Array holding, per row of ``df``, the mean prediction over all models.
    """
    # Predict on the frame that was passed in rather than on a module-level
    # frame, so the function honours its own argument.
    fold_preds = [xgb_predict(df, bst, count_vec) for bst in bst_list]
    return np.average(np.array(fold_preds), axis=0)

In [None]:
## Logistic regression does not need hyper parameter tuning
from sklearn.ensemble import BaggingClassifier
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection

def logistic_regression(train_df):
    """Train a bagged logistic-regression model on the stemmed posts.

    Args:
        train_df: Frame with 'stemmed_post' (text) and 'label' columns.

    Returns:
        (model, count_vec): the fitted BaggingClassifier and the
        CountVectorizer (unigrams and bigrams) fitted on the same posts.
    """
    count_vec = CountVectorizer(
        tokenizer=word_tokenize,
        strip_accents="ascii", lowercase=True,
        token_pattern=None, ngram_range=(1, 2))
    # The base estimator is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` across scikit-learn releases, while the
    # first positional slot is the same in both.
    model = BaggingClassifier(linear_model.LogisticRegression(max_iter=200),
                              n_estimators=10, random_state=0)
    # Fit the vectorizer and build the training feature matrix in one step.
    xtrain = count_vec.fit_transform(train_df['stemmed_post'])

    model.fit(xtrain, train_df['label'])

    return model, count_vec

## Naive bayes method
def naive_b(train_df):
    """Train a bagged multinomial naive-Bayes model on the stemmed posts.

    Args:
        train_df: Frame with 'stemmed_post' (text) and 'label' columns.

    Returns:
        (clf, count_vec): the fitted BaggingClassifier and the
        CountVectorizer (default n-gram range) fitted on the same posts.
    """
    count_vec = CountVectorizer(
        tokenizer=word_tokenize,
        strip_accents="ascii", lowercase=True,
        token_pattern=None)
    # The base estimator is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` across scikit-learn releases, while the
    # first positional slot is the same in both.
    clf = BaggingClassifier(naive_bayes.MultinomialNB(),
                            n_estimators=10, random_state=0)
    # Fit the vectorizer and build the training feature matrix in one step.
    x_train = count_vec.fit_transform(train_df['stemmed_post'])

    clf.fit(x_train, train_df['label'])

    return clf, count_vec

def model_predict(df,model,count_vec):
    """Return column 1 of the model's ``predict_proba`` output for each post.

    df        : frame with a 'stemmed_post' column
    model     : fitted classifier exposing ``predict_proba``
    count_vec : the CountVectorizer the model was trained with
    """
    features = count_vec.transform(df['stemmed_post'])
    class_proba = model.predict_proba(features)
    return class_proba[:, 1]



## Average the probability predictions of the models (xgboost, logreg and
## naive bayes in this notebook) and score the rounded average

def combine_pred(pred_list,test_df):
    """Score the averaged predictions of several models with macro F1.

    Args:
        pred_list: Sequence of per-model probability arrays, all of the same
            length as ``test_df``.
        test_df: Frame whose 'label' column holds the true labels.

    Returns:
        Macro-averaged F1 of the rounded mean probability.
    """
    # Divide by the actual number of models instead of a fixed 3, so the
    # helper stays correct when a model is added to or removed from the list.
    mean_proba = np.sum(pred_list, axis=0) / len(pred_list)
    return f1_score(y_true=test_df['label'], y_pred=np.round(mean_proba),
                    average='macro')
    
    

In [None]:
## Fit the logistic-regression model (and its vectorizer) on the training frame

logreg, logreg_count_vec = logistic_regression(train_df=df)

In [None]:
## Fit the naive bayes model (and its vectorizer) on the training frame
naive, naive_count_vec = naive_b(train_df=df)

In [None]:
## Logistic-regression predictions on the test frame
## They feed the averaged (ensembled) prediction later
pred1 = model_predict(df=df_test, model=logreg, count_vec=logreg_count_vec)

In [None]:
## Naive bayes predictions on the test frame

## They feed the averaged (ensembled) prediction later
pred2 = model_predict(df=df_test, model=naive, count_vec=naive_count_vec)

# Using wandb for hyperparameter tuning

In [None]:
# Weights & Biases (wandb) is an experiment-tracking tool for machine learning

## Requires `pip install wandb`; it is used by the hyperparameter sweep cells below


import wandb
wandb.login()

In [None]:
## Sweep definition for wandb. Every parameter lists the values to try; the
## keys are read by train() through wandb.config ("lambd" is mapped to the
## xgboost parameter "lambda" there).
sweep_config = {
    "method": "grid", # try grid or random
    "metric": {
      # Must match the key passed to wandb.log in train(), which logs "f1_score"
      "name": "f1_score",
      "goal": "maximize"
    },
    "parameters": {
        "max_depth": {
            "values": [12,15]
        },
        "learning_rate": {
            "values": [0.1]
        },
        "subsample": {
            "values": [0.8]
        },
        "colsample_bytree": {
            "values": [0.8]
        },
        "alpha": {
            "values": [1, 0.5, 0.1,0]
        },
        "gamma": {
            "values": [1, 0.1,0.3,0]
        },
        "n_estimators": {
            "values": [400]
        },
        "lambd": {
            "values": [0,0.1,0.05,1]
        }

    }
}

In [None]:
## Register the sweep under the given wandb project; the returned id is passed to wandb.agent
sweep_id = wandb.sweep(sweep_config, project="XGBoost-sweeps12")

In [None]:
def train():
    """Run one wandb trial: cross-validate xgboost with the trial's config.

    Reads the hyperparameters from ``wandb.config``, runs 10-fold cross
    validation through ``k_fold_xgb`` on the global ``df``, prints the
    average macro F1 and logs it to wandb under the key "f1_score".
    """
    config_defaults = {
        "booster": "gbtree",
        "max_depth": 3,
        "learning_rate": 0.1,
        "subsample": 1,
        "seed": 117,
        "test_size": 0.33,
        # Fallbacks for the remaining keys read below, so the function also
        # runs when the sweep (or a manual call) does not supply them.
        "colsample_bytree": 1,
        "alpha": 0,
        "gamma": 0,
        "lambd": 1,
        "n_estimators": 100,
    }

    wandb.init(config=config_defaults)  # defaults are over-ridden during the sweep
    config = wandb.config

    ## set xgb param from sweep config
    param = {
        "max_depth": config.max_depth,
        "eta": config.learning_rate,
        'objective': 'binary:logistic',
        "subsample": config.subsample,
        "colsample_bytree": config.colsample_bytree,
        "alpha": config.alpha,
        "gamma": config.gamma,
        'scale_pos_weight': 1.62,
        "n_round": config.n_estimators,
        "lambda": config.lambd,
    }

    # Only the averaged score matters here; the per-fold boosters are dropped.
    _, avg_f1_score = k_fold_xgb(df, param, 10)

    print(f"f1_score: {int(avg_f1_score * 100.)}%")
    wandb.log({"f1_score": avg_f1_score})
    

In [None]:
## Start a sweep agent that calls train() for the sweep's trials (count=150 caps the number of runs)
wandb.agent(sweep_id, train, count=150)

In [None]:
## Close the wandb run once the sweep is done
wandb.finish()

In [None]:
## Best parameters found using grid search
param = {
    "max_depth": 12,
    "eta": 0.1,
    "objective": "binary:logistic",
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "scale_pos_weight": 1.62,
    "alpha": 0.1,
    "gamma": 0.3,
    "lambda": 1,
    "min_child_weight": 1,
    "n_round": 450,
}

In [None]:

## Cross-validate with the tuned parameters; keeps one booster per fold
bst_list, avg_f1_score = k_fold_xgb(df, param, n_fold=10)

In [None]:
## xgb predictions on the test frame, averaged over the per-fold boosters
pred3 = batch_xgb_predict(bst_list=bst_list, df=df_test)

In [None]:
## Collect the logreg, naive bayes and xgb predictions, one entry per model
pred_list = np.array([pred1,pred2,pred3])

In [None]:
## Mean prediction across the models, rounded to a 0/1 label
final = np.average(pred_list, axis=0).round()

In [None]:

## Attach the predicted labels, keep only id + label, and write the submission file
df_test["label"] = final.astype(int)
df_final = df_test.drop(columns=["post", "stemmed_post"]).set_index("id")
df_final.to_csv("test_submit.csv")