### Basic imports

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle

plt.style.use('fivethirtyeight')

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix,precision_score,\
recall_score,roc_auc_score,classification_report,fbeta_score,precision_recall_curve,roc_curve,log_loss

In [None]:
import warnings

# Ignore every warning category from here on.
warnings.simplefilter(action='ignore')

# Seed NumPy's global random state.
np.random.seed(seed=1000)

In [None]:
# Load the train/test CSVs.
# The directory defaults to the original location but can be redirected by
# setting the DATA_DIR environment variable, so the notebook is not tied to a
# single machine's absolute path.
DATA_DIR = os.environ.get(
    'DATA_DIR',
    'C:/Users/MBBLABS/Desktop/Python/1. Models/3. Project/Data/less_feature',
)
train = pd.read_csv(f'{DATA_DIR}/train.csv', index_col='Unnamed: 0')
test = pd.read_csv(f'{DATA_DIR}/test.csv', index_col='Unnamed: 0')

### Preprocessing: 
#### 1. Changing data type <br> 2. splitting data <br> 3. assigning label

In [None]:
# Summarise the training frame (dtypes, non-null counts).
# The 'y' column is cast to an integer type in a later cell, not here.
train.info()

In [None]:
# Preview the first two rows of the training frame and of the test frame
display(train.head(2))
test.head(2)

#### Assigning '-1' as the label of the unlabelled rows

In [None]:
# Rows with a missing 'y' get the sentinel label -1; then show the label counts.
labels_filled = train['y'].fillna(-1)
train['y'] = labels_filled
train['y'].value_counts()

In [None]:
# Cast the label column to int32 and re-check the frame summary
train = train.astype({'y': 'int32'})
train.info()

#### -- For training data

In [None]:
# Features are every column except the last; the last column is the label.
X = train.iloc[:, :-1]
y = train.iloc[:, -1]  # labelled and unlabelled (-1) rows together

# Labelled subset only: rows whose 'y' is not the -1 sentinel
labelled_rows = train[train['y'] != -1]
X_lbl = labelled_rows.iloc[:, :-1]
y_lbl = labelled_rows.iloc[:, -1]
X_lbl.shape, y_lbl.shape

#### -- For test data

In [None]:
# Test features are all columns but the last; the last column is taken as the label.
X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]
X_test.shape, y_test.shape

## Logistic Regression

In [None]:
# Empty results table with one column for the algorithm name and one for its ROC AUC
index = ['Algorithm', 'ROC AUC']
results = pd.DataFrame(columns=index)

In [None]:
# Baseline: logistic regression fitted on the labelled rows only.
# class_weight must be the None object (no re-weighting); the string 'None' is
# not an accepted value and recent scikit-learn versions reject it.
logreg = LogisticRegression(random_state=1, class_weight=None)
logreg.fit(X_lbl, y_lbl)

# ROC AUC from the probability of the second class (column 1 of predict_proba).
logreg_auc = roc_auc_score(y_test, logreg.predict_proba(X_test)[:, 1])

# DataFrame.append was removed in pandas 2.0; add the row by position instead.
results.loc[len(results)] = ['Logistic Regression', logreg_auc]

In [None]:
# Show the results table collected so far
display(results)

In [None]:
# Hard class predictions for the test features
pred_test = logreg.predict(X_test)

# F1 score of those predictions
f1_test = f1_score(y_test, pred_test)
print('The f1 score for the testing data:', f1_test)

# Confusion-matrix heatmap: rows are true classes, columns are predictions
cm_ax = sns.heatmap(confusion_matrix(y_test, pred_test), annot=True, fmt='d', cmap='Blues')
cm_ax.set_title('Confusion Matrix', size=15)
cm_ax.set_xlabel('Predictions', size=15)
cm_ax.set_ylabel('True Values', size=15)

In [None]:
# Accuracy, precision and recall of the default predictions (pred_test)
accuracy_score(y_test,pred_test),precision_score(y_test,pred_test),recall_score(y_test,pred_test)

In [None]:
# Log loss is defined on predicted probabilities, not on hard 0/1 predictions,
# so score the probability of the second class (column 1 of predict_proba).
log_loss(y_test, logreg.predict_proba(X_test)[:, 1])

#### Classification Report:

In [None]:
# Per-class precision/recall/F1 for the default predictions
print(classification_report(y_test, pred_test))

#### Threshold tuning:

In [None]:
def to_labels(pos_probs, threshold):
    """Return a boolean mask: True where pos_probs is at or above threshold."""
    return pos_probs >= threshold


# Probability of the second class (column 1 of predict_proba) on the test features
y_prob = logreg.predict_proba(X_test)[:, 1]

# F1 score at every candidate cut-off in [0, 1) with step 0.001.
# NOTE(review): the cut-off is selected on the test set itself.
thresholds = np.arange(0, 1, 0.001)
scores = [f1_score(y_test, to_labels(y_prob, cutoff)) for cutoff in thresholds]

# Index of the cut-off with the highest F1 score
ix = np.argmax(scores)
print('Threshold=%.3f, F1-Score=%.5f' % (thresholds[ix], scores[ix]))

plt.plot(thresholds, scores)
plt.title('F1-score vs Threshold ')
plt.xlabel('threshold')
plt.ylabel('F1-score')
plt.show()

In [None]:
# Apply the cut-off found by the search (thresholds[ix]) rather than a
# hand-copied constant, so this cell stays in sync when the data or model change.
y_pred_tuned = to_labels(y_prob, thresholds[ix])
print(classification_report(y_test, y_pred_tuned))

In [None]:
# Precision, recall and accuracy of the threshold-tuned predictions
pr, re, acc = (
    metric(y_test, y_pred_tuned)
    for metric in (precision_score, recall_score, accuracy_score)
)

pr, re, acc

## Label Propagation

In [None]:
def plot_graphs(params_list,kernel,metric_scores,y_test_temp, y_pred,y_test_temp_best, y_prob_best):
    """Draw a 4x2 report for a label-propagation parameter sweep and save it.

    Parameters
    ----------
    params_list : sequence
        Swept parameter values; used as the x-axis of the six metric panels.
    kernel : str
        Kernel name. Used (upper-cased) in panel titles, to choose the x-axis
        label, and as the prefix of the saved file ``'<kernel>_report.png'``.
    metric_scores : pandas.DataFrame
        Must provide the columns 'ROC-AUC_Score', 'Threshold', 'F1_score',
        'Accuracy', 'Precision' and 'Recall', aligned with ``params_list``.
    y_test_temp, y_pred : array-like
        True labels and predicted labels shown in the confusion-matrix panel.
    y_test_temp_best, y_prob_best : array-like
        True labels and scores used for the precision-recall panel.

    Returns
    -------
    None. The figure is written to ``'<kernel>_report.png'``.
    """
    # The sweep is over gamma for the rbf kernel; otherwise label the axis
    # with the neighbour count instead of the previously hard-coded 'Gamma'.
    param_name = 'Gamma' if kernel == 'rbf' else 'No_of_neighbors'
    kernel_tag = kernel.upper()

    fig, axes = plt.subplots(4, 2, figsize=(20, 30))
    (a1, a2), (b1, b2), (c1, c2), (d1, d2) = axes

    # (axes, metric column, title, legend label or None for no legend)
    panels = [
        (a1, 'ROC-AUC_Score', 'Label Propagation ROC AUC with ' + kernel_tag + ' kernel', None),
        (a2, 'Threshold', param_name + ' vs Threshold with ' + kernel_tag + ' kernel', None),
        (b1, 'F1_score', param_name + ' vs F1_score with ' + kernel_tag + ' kernel', 'F1_Score'),
        (b2, 'Accuracy', param_name + ' vs Accuracy with ' + kernel_tag + ' kernel', 'Accuracy'),
        (c1, 'Precision', param_name + ' vs Precision with ' + kernel_tag + ' kernel', 'Precision'),
        (c2, 'Recall', param_name + ' vs Recall with ' + kernel_tag + ' kernel', 'Recall'),
    ]
    for ax, column, title, legend_label in panels:
        ax.plot(params_list, metric_scores[column], color='steelblue', label=legend_label)
        ax.set_xlabel(param_name)
        ax.set_ylabel(column)
        ax.set_title(title)
        if legend_label is not None:
            ax.legend(loc="lower right")

    # Precision vs recall curve
    precision, recall, _ = precision_recall_curve(y_test_temp_best, y_prob_best)
    d1.step(recall, precision, color='steelblue', where='post')
    d1.fill_between(recall, precision, step='post', color='lightgray')
    d1.set_title('Precision-Recall Tradeoff')
    d1.set_xlabel('Recall')
    d1.set_ylabel('Precision')

    # Confusion matrix, drawn explicitly on d2 instead of the implicit current
    # axes. confusion_matrix puts true classes on rows and predictions on
    # columns, so the heatmap's y-axis is the true value and its x-axis the
    # prediction.
    sns.heatmap(confusion_matrix(y_test_temp, y_pred), annot=True, fmt='d', cmap='Blues', ax=d2)
    d2.set_xlabel('Prediction')
    d2.set_ylabel('True Values')
    d2.set_title('Confusion Matrix')

    fig.savefig(f'{kernel}_report.png')

In [None]:
def _score_label_prop(lp, X_test, y_test, thresholds):
    """Score one fitted model on the test data; return (metrics, y_true, y_prob, y_pred)."""
    pos_prob = lp.predict_proba(X_test)[:, 1]

    # Drop test rows whose predicted probability is NaN, together with their labels.
    keep = ~np.isnan(pos_prob)
    y_prob = pos_prob[keep]
    y_true = np.asarray(y_test)[keep]

    roc = roc_auc_score(y_true, y_prob)

    # Choose the cut-off that maximises F-beta (beta=1.5), then report plain F1 at it.
    fbeta = [fbeta_score(y_true, y_prob >= t, beta=1.5) for t in thresholds]
    t_best = thresholds[np.argmax(fbeta)]
    y_pred = y_prob >= t_best

    metrics = [
        log_loss(y_true, y_prob),
        roc,
        t_best,
        f1_score(y_true, y_pred),
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
    ]
    return metrics, y_true, y_prob, y_pred


def _export_metrics(metric_scores, sheet_name, path='6.V4_less_variable.xlsx'):
    """Write metric_scores to one sheet of the workbook, keeping its other sheets."""
    if os.path.exists(path):
        # Append to the existing workbook and replace only this sheet.
        writer_kwargs = {'mode': 'a', 'if_sheet_exists': 'replace'}
    else:
        writer_kwargs = {'mode': 'w'}
    with pd.ExcelWriter(path, engine='openpyxl', **writer_kwargs) as writer:
        metric_scores.to_excel(writer, sheet_name=sheet_name)


def label_prop_test(kernel, params_list, X_train, X_test, y_train, y_test,sampling):
    """Sweep one LabelPropagation hyper-parameter and report test-set metrics.

    For every value in ``params_list`` a ``LabelPropagation`` model is fitted on
    ``(X_train, y_train)`` and scored on ``(X_test, y_test)``. The per-value
    metrics are displayed, written to a sheet of '6.V4_less_variable.xlsx',
    plotted through ``plot_graphs`` and the best ROC AUC / F1 are printed.

    Parameters
    ----------
    kernel : {'rbf', 'knn'}
        'rbf' sweeps ``gamma``; 'knn' sweeps ``n_neighbors``.
    params_list : sequence
        Values of the swept parameter. Must not be empty.
    X_train, y_train
        Training features and labels passed to ``LabelPropagation.fit``.
    X_test, y_test
        Test features and true labels.
    sampling : str
        Tag for the training-data treatment (e.g. 'Non_Treated',
        'undersampling', 'oversampling', 'smote'); the sheet is named
        ``'<KERNEL>_<sampling>'``.

    Raises
    ------
    ValueError
        If ``kernel`` is not 'rbf' or 'knn', or ``params_list`` is empty.
    """
    if kernel == 'rbf':
        param_name, param_column = 'gamma', 'Gamma'
    elif kernel == 'knn':
        param_name, param_column = 'n_neighbors', 'No_of_neighbors'
    else:
        raise ValueError("kernel must be 'rbf' or 'knn', got {!r}".format(kernel))
    if len(params_list) == 0:
        raise ValueError('params_list must contain at least one value')

    thresholds = np.arange(0, 1, 0.01)

    roc_scores = []
    f1_sc = []
    data = []
    f1_best = 0

    for value in params_list:
        # Pass only the parameter this kernel uses. The other one keeps its
        # library default; a placeholder of 0 for n_neighbors is rejected by
        # recent scikit-learn parameter validation.
        lp = LabelPropagation(kernel=kernel, max_iter=100000, tol=0.0001, n_jobs=-1,
                              **{param_name: value})
        lp.fit(X_train, y_train)

        metrics, y_true, y_prob, y_pred = _score_label_prop(lp, X_test, y_test, thresholds)
        roc_scores.append(metrics[1])
        f1_test = metrics[3]
        f1_sc.append(f1_test)
        data.append([value] + metrics)

        # Remember the outputs of the best-F1 model (ties go to the later value)
        # so the plots describe that model rather than the last one fitted.
        if f1_test >= f1_best:
            f1_best = f1_test
            y_true_best, y_prob_best, y_pred_best = y_true, y_prob, y_pred

    metric_scores = pd.DataFrame(
        data,
        columns=[param_column, 'Log_Loss', 'ROC-AUC_Score', 'Threshold',
                 'F1_score', 'Accuracy', 'Precision', 'Recall'],
    )
    display(metric_scores)

    _export_metrics(metric_scores, '{}_{}'.format(kernel.upper(), sampling))

    plot_graphs(params_list, kernel, metric_scores,
                y_true_best, y_pred_best, y_true_best, y_prob_best)

    print('Best roc_score is {} at {}'.format(max(roc_scores),params_list[np.argmax(roc_scores)]))
    print('-'*40)

    print('Best f1_score is {} at {}'.format(max(f1_sc),params_list[np.argmax(f1_sc)]))
    print('-'*40)


## Without treating imbalance:

In [None]:
# RBF-kernel sweep over these gamma values, tagged 'Non_Treated'
gamma = [1e-5,1e-4,1e-3,1e-2,1e-1,2,4,5,8,10,12]
label_prop_test('rbf',gamma, X, X_test , y, y_test,'Non_Treated')

In [None]:
# KNN-kernel sweep over neighbour counts 5..49, tagged 'Non_Treated'
ns = np.arange(5,50)
label_prop_test('knn',ns, X, X_test , y, y_test,'Non_Treated')

In [None]:
# Re-run the KNN kernel for the single neighbour count 14
ns = [14]
label_prop_test('knn',ns, X, X_test , y, y_test,'Non_Treated')

### Handling Imbalance of the data:
### 1. Undersampling:

In [None]:
df2 = train

# Class counts, looked up by label value. value_counts() is ordered by
# frequency, so unpacking it positionally only works while the classes happen
# to be ordered unlabelled > 0 > 1.
class_counts = df2['y'].value_counts()
count_class_ulbl = class_counts.loc[-1]
count_class_0 = class_counts.loc[0]
count_class_1 = class_counts.loc[1]

# Divide by class
df_class_0 = df2[df2['y'] == 0]
df_class_1 = df2[df2['y'] == 1]
df_class_ulbl = df2[df2['y'] == -1]

In [None]:
# Shapes of the class-0, class-1 and unlabelled frames
df_class_0.shape,df_class_1.shape,df_class_ulbl.shape

In [None]:
# The three class counts computed above
count_class_0, count_class_1,count_class_ulbl

In [None]:
# Draw count_class_1 random rows from class 0, then stack them with the
# class-1 and unlabelled rows.
df_class_0_under = df_class_0.sample(n=count_class_1)
under_parts = [df_class_0_under, df_class_1, df_class_ulbl]
df_test_under = pd.concat(under_parts, axis='index')

print('Random under-sampling:')
print(df_test_under['y'].value_counts())

#### Train data:

In [None]:
# Features / label from the under-sampled frame
X, y = df_test_under.drop(columns='y'), df_test_under['y']
X.shape, y.shape

#### Test data:

In [None]:
# Test features are all columns but the last; the last column is taken as the label.
X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]
X_test.shape, y_test.shape

In [None]:
# Label distribution of the test set
y_test.value_counts()

In [None]:
# RBF-kernel sweep over gamma 1, 2, 3, tagged 'undersampling'
gammas = [1,2,3]
label_prop_test('rbf', gammas, X, X_test, y, y_test,'undersampling')

### 2. Oversampling:

In [None]:
# Oversample class 1 (sampling with replacement, count_class_0 rows) and
# concat the DataFrames of the classes:
df_class_1_over = df_class_1.sample(count_class_0, replace = True)
df_test_over = pd.concat([df_class_0, df_class_1_over,df_class_ulbl], axis=0)

# This cell over-samples; the printed heading previously said "under-sampling".
print('Random over-sampling:')
print(df_test_over.y.value_counts())

In [None]:
# Features / label from the over-sampled frame
X, y = df_test_over.drop(columns='y'), df_test_over['y']
X.shape, y.shape

#### Test data:

In [None]:
# Test features are all columns but the last; the last column is taken as the label.
X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]
X_test.shape, y_test.shape

In [None]:
# Label distribution of the test set
y_test.value_counts()

In [None]:
# RBF-kernel run with gamma = 1, tagged 'oversampling'
gammas = [1]
label_prop_test('rbf', gammas, X, X_test, y, y_test,'oversampling')

In [None]:
# KNN-kernel sweep over neighbour counts 5..9, tagged 'oversampling'
ns = np.arange(5,10)
label_prop_test('knn', ns, X, X_test , y, y_test,'oversampling')

### 3. SMOTE:  Synthetic Minority Oversampling Technique

In [None]:
# The SMOTE section needs the imbalanced-learn package. If it is missing, run:
# %pip install imbalanced-learn

Concatenating the class_0 and class_1 rows (labelled data only)

In [None]:
# Stack the class-0 and class-1 frames row-wise
df_sm = pd.concat([df_class_0,df_class_1],axis=0)

In [None]:
# Features / label of the stacked class-0 and class-1 frame
X, y = df_sm.drop(columns='y'), df_sm['y']

In [None]:
from imblearn.over_sampling import SMOTE

# Resample X, y with SMOTE using the 'minority' sampling strategy
smote = SMOTE(sampling_strategy='minority')
X_sm, y_sm = smote.fit_resample(X, y)

# Label counts after resampling
y_sm.value_counts()

In [None]:
# Put the resampled features and labels back side by side in one frame
df_sm1 = pd.concat([X_sm,y_sm],axis = 1)

In [None]:
# Append the unlabelled rows below the resampled labelled rows
df_smote = pd.concat([df_sm1,df_class_ulbl],axis = 0)

In [None]:
# Label counts of the combined frame
df_smote.y.value_counts()

#### Train data:

In [None]:
# Features / label from the SMOTE-treated frame
X, y = df_smote.drop(columns='y'), df_smote['y']
X.shape, y.shape

#### Test data:

In [None]:
# Test features are all columns but the last; the last column is taken as the label.
X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]
X_test.shape, y_test.shape

In [None]:
# RBF-kernel run with gamma = 1, tagged 'smote'
gammas = [1]
label_prop_test('rbf', gammas, X, X_test, y, y_test,'smote')

In [None]:
# KNN-kernel sweep over neighbour counts 5..49, tagged 'smote'
ns = np.arange(5,50)
label_prop_test('knn', ns, X, X_test , y, y_test,'smote')