### Basic imports

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle

# Apply matplotlib's built-in 'fivethirtyeight' style sheet to the figures drawn below.
plt.style.use('fivethirtyeight')

In [None]:
# Visualization
import plotly.express as px

from sklearn.svm import SVC # for Support Vector Classification baseline model
from sklearn.semi_supervised import SelfTrainingClassifier # for Semi-Supervised learning

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix,precision_score,\
recall_score,roc_auc_score,classification_report,fbeta_score,precision_recall_curve,roc_curve,log_loss

In [None]:
import warnings
# Silence every warning for the rest of the session. This also hides deprecation
# notices from the libraries used below, so re-enable warnings when debugging.
warnings.simplefilter('ignore')
# Seed NumPy's legacy global random generator so NumPy-based randomness is repeatable.
np.random.seed(1000) 

In [None]:
# Importing datasets.
# Both CSV files live in the same folder; keeping that folder in a single constant
# means the notebook can be pointed at another machine by editing one line only.
DATA_DIR = 'C:/Users/MBBLABS/Desktop/Python/1. Models/3. Project/Data/less_feature'

# The 'Unnamed: 0' column becomes the row index of each frame
# (presumably an index written by an earlier to_csv call - confirm).
train = pd.read_csv(f'{DATA_DIR}/train.csv', index_col='Unnamed: 0')
test = pd.read_csv(f'{DATA_DIR}/test.csv', index_col='Unnamed: 0')

### Preprocessing: 
#### 1. Changing data type <br> 2. splitting data <br> 3. assigning label

In [None]:
# Inspect column dtypes and non-null counts. At this point 'y' is of float type
# (per the original note); it is cast to an integer type in a later cell.
train.info()

In [None]:
# Preview the first two rows of each split. display() is needed for `train`
# because only the last expression of a cell is rendered automatically.
display(train.head(2))
test.head(2)

#### Assigning '-1' as the label for the unlabelled rows

In [None]:
# Rows without a label have a missing 'y'; replace those gaps with the -1 sentinel,
# which is how the scikit-learn semi-supervised estimators used below mark unlabelled samples.
y_with_sentinel = train['y'].fillna(-1)
train['y'] = y_with_sentinel

# Row count per label value, including the -1 sentinel.
train['y'].value_counts()

In [None]:
# NOTE(review): scratch calculation - presumably the ratio of two class counts read
# off the value_counts() output above. The literals are hard-coded, so confirm them
# against the current data before relying on the result.
1834/166

In [None]:
# Cast only the target column to int32; every other column keeps its dtype.
train = train.astype({'y': 'int32'})

# Confirm the new dtype of 'y'.
train.info()

#### -- For training data

In [None]:
# Every column except the last is treated as a feature; the last column is the target.
# X / y keep ALL rows, i.e. a mixture of labelled rows and rows carrying the -1 sentinel.
X, y = train.iloc[:, :-1], train.iloc[:, -1]

# Labelled subset only: rows whose 'y' is not the -1 sentinel.
is_labelled = train['y'] != -1
train_lbl = train[is_labelled]
X_lbl, y_lbl = train_lbl.iloc[:, :-1], train_lbl.iloc[:, -1]

X_lbl.shape, y_lbl.shape

#### -- For test data

In [None]:
# Same positional convention as the training split: the last column is the target,
# everything before it is a feature.
X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]

X_test.shape, y_test.shape

## Logistic Regression

In [None]:
# Empty results table with one column per entry of `index`; a later cell adds a
# row holding a model name and its ROC AUC.
index = ['Algorithm', 'ROC AUC']
results = pd.DataFrame(columns=index)

In [None]:
# Supervised baseline: logistic regression fitted on the labelled rows only.
# class_weight must be the None object (no re-weighting); the string 'None' is not
# a valid value and is rejected by scikit-learn.
logreg = LogisticRegression(random_state=1, class_weight=None)
logreg.fit(X_lbl, y_lbl)

# ROC AUC is computed from the predicted probability of the positive class (column 1).
logreg_auc = roc_auc_score(y_test, logreg.predict_proba(X_test)[:, 1])

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add a row.
logreg_row = pd.DataFrame([['Logistic Regression', logreg_auc]], columns=index)
results = pd.concat([results, logreg_row], ignore_index=True)

In [None]:
# Show the results table (algorithm name and ROC AUC) collected so far.
display(results)

In [None]:
# Hard class predictions for the test set from the supervised baseline.
pred_test = logreg.predict(X_test)

# F1 score of those predictions against the true test labels.
f1_test = f1_score(y_test, pred_test)
print('The f1 score for the testing data:', f1_test)

# Confusion matrix drawn as an annotated heatmap with integer cell labels.
cm_test = confusion_matrix(y_test, pred_test)
sns.heatmap(cm_test, annot=True, fmt='d', cmap='Blues')

#### Classification Report:

In [None]:
# Per-class precision / recall / F1 for the baseline predictions stored in pred_test.
print(classification_report(y_test, pred_test))

#### Threshold tuning:

In [None]:
def to_labels(pos_probs, threshold):
    """Turn positive-class probabilities into a boolean prediction mask.

    Each element of the result is True where the corresponding probability is
    greater than or equal to `threshold`, and False otherwise.
    """
    is_positive = pos_probs >= threshold
    return is_positive
 
# Probability of the positive class (column 1 of predict_proba) for each test row.
y_prob = logreg.predict_proba(X_test)[:, 1]

# Candidate cut-offs 0.00, 0.01, ..., 0.99 and the F1 score obtained at each of them.
thresholds = np.arange(0, 1, 0.01)
scores = [f1_score(y_test, to_labels(y_prob, cutoff)) for cutoff in thresholds]

# Position of the highest F1 score (argmax returns the first one on ties).
ix = np.argmax(scores)
print('Threshold=%.3f, F1-Score=%.5f' % (thresholds[ix], scores[ix]))

# F1 score as a function of the decision threshold.
plt.plot(thresholds, scores)
plt.xlabel('threshold')
plt.ylabel('F1-score')
plt.title('F1-score vs Threshold ')
plt.show()

In [None]:
# Use the F1-maximising threshold found by the search above rather than a
# hand-copied literal (previously 0.430), so this cell stays correct whenever the
# data or the model changes.
# NOTE(review): the threshold is selected on the test set itself, so the report
# below is optimistic - prefer tuning on a separate validation split.
y_pred_tuned = to_labels(y_prob, thresholds[ix])
print(classification_report(y_test, y_pred_tuned))

In [None]:
# Precision, recall and accuracy of the threshold-tuned predictions, in that order.
pr, re, acc = (
    metric(y_test, y_pred_tuned)
    for metric in (precision_score, recall_score, accuracy_score)
)

pr, re, acc

## Self-Training Classifier

In [None]:
# Base estimator for this run is a logistic regression with default settings.
model_logreg = LogisticRegression()

# Self-training wrapper around the base estimator:
#   threshold=0.95        - decision threshold used with criterion='threshold'
#   criterion='threshold' - add pseudo-labels whose predicted probability is above the threshold
#                           (the alternative 'k_best' criterion, with its k_best parameter, is not used here)
#   max_iter=500          - upper bound on the number of self-training iterations
#   verbose=True          - report progress while fitting
self_training_model = SelfTrainingClassifier(
    base_estimator=model_logreg,
    threshold=0.95,
    criterion='threshold',
    max_iter=500,
    verbose=True,
)

# Fit on ALL training rows; rows labelled -1 are the unlabelled ones.
clf_ST = self_training_model.fit(X, y)

In [None]:
# Model Evaluation
# Fitted attributes to report, paired with the label printed in front of each one.
# (clf_ST.labeled_iter_, the iteration at which each sample was labelled, is left out on purpose.)
summary_rows = [
    ('Base Estimator: ', clf_ST.base_estimator_),
    ('Classes: ', clf_ST.classes_),
    ('Transduction Labels: ', clf_ST.transduction_),
    ('Number of Features: ', clf_ST.n_features_in_),
    ('Feature Names: ', clf_ST.feature_names_in_),
    ('Number of Iterations: ', clf_ST.n_iter_),
    ('Termination Condition: ', clf_ST.termination_condition_),
]

print('')
print('---------- Self Training Model - Summary ----------')
for label, value in summary_rows:
    print(label, value)
print('')

print('---------- Self Training Model - Evaluation on Test Data ----------')
# score() is evaluated on the held-out test split and reported as the accuracy.
accuracy_score_ST = clf_ST.score(X_test, y_test)
print('Accuracy Score: ', accuracy_score_ST)

# Per-class precision / recall / F1 on the same test split.
y_pred_ST = clf_ST.predict(X_test)
print(classification_report(y_test, y_pred_ST))

In [None]:
# Predict once and reuse the result for all three metrics.
y_pred_ST = clf_ST.predict(X_test)
acc_ST = accuracy_score(y_test, y_pred_ST)
precision_ST = precision_score(y_test, y_pred_ST)
recall_ST = recall_score(y_test, y_pred_ST)

print(f'accuracy score: {acc_ST},\nprecision:{precision_ST},\nrecall: {recall_ST}')

In [None]:
# Base estimator for this run: an RBF-kernel SVC. probability=True is needed so
# predict_proba is available; C and gamma are spelled out at their default values.
model_svc = SVC(kernel='rbf', probability=True, C=1.0, gamma='scale', random_state=0)

# Self-training wrapper around the SVC:
#   threshold=.9          - decision threshold used with criterion='threshold'
#   criterion='threshold' - add pseudo-labels whose predicted probability is above the threshold
#                           (the alternative 'k_best' criterion, with its k_best parameter, is not used here)
#   max_iter=100          - upper bound on the number of self-training iterations
#   verbose=True          - report progress while fitting
self_training_model = SelfTrainingClassifier(
    base_estimator=model_svc,
    threshold=.9,
    criterion='threshold',
    max_iter=100,
    verbose=True,
)

# Fit on ALL training rows; rows labelled -1 are the unlabelled ones.
# Note: this rebinds clf_ST, replacing the classifier fitted in the earlier cell.
clf_ST = self_training_model.fit(X, y)

In [None]:
# Model Evaluation
# Fitted attributes to report, paired with the label printed in front of each one.
# (clf_ST.labeled_iter_, the iteration at which each sample was labelled, is left out on purpose.)
summary_rows = [
    ('Base Estimator: ', clf_ST.base_estimator_),
    ('Classes: ', clf_ST.classes_),
    ('Transduction Labels: ', clf_ST.transduction_),
    ('Number of Features: ', clf_ST.n_features_in_),
    ('Feature Names: ', clf_ST.feature_names_in_),
    ('Number of Iterations: ', clf_ST.n_iter_),
    ('Termination Condition: ', clf_ST.termination_condition_),
]

print('')
print('---------- Self Training Model - Summary ----------')
for label, value in summary_rows:
    print(label, value)
print('')

print('---------- Self Training Model - Evaluation on Test Data ----------')
# score() is evaluated on the held-out test split and reported as the accuracy.
accuracy_score_ST = clf_ST.score(X_test, y_test)
print('Accuracy Score: ', accuracy_score_ST)

# Per-class precision / recall / F1 on the same test split.
y_pred_ST = clf_ST.predict(X_test)
print(classification_report(y_test, y_pred_ST))

In [None]:
# Confusion matrix of the self-training classifier's test-set predictions, drawn as
# an annotated heatmap (confusion_matrix puts true labels on rows, predictions on columns).
sns.heatmap(confusion_matrix(y_test, clf_ST.predict(X_test)),annot=True,fmt='d',cmap = 'Blues')

In [None]:
# Side-by-side confusion matrices: supervised baseline vs. self-training classifier.
fig, (a1, a2) = plt.subplots(1, 2, figsize=(10, 4))

# Left panel: the title announces the threshold-tuned supervised model, so plot the
# tuned predictions (y_pred_tuned) rather than the default-threshold ones (pred_test).
sns.heatmap(confusion_matrix(y_test, y_pred_tuned), annot=True, fmt='d', cmap='Blues', ax=a1)
a1.set_title('Surpervised_threshold_tuned')

# Right panel: predictions of the most recently fitted self-training classifier.
sns.heatmap(confusion_matrix(y_test, clf_ST.predict(X_test)), annot=True, fmt='d', cmap='Blues', ax=a2)
a2.set_title('Self_training_classifier')