### Basic imports

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle

import pickle

# Use matplotlib's 'fivethirtyeight' style sheet for the figures drawn in this notebook.
plt.style.use('fivethirtyeight')

In [None]:
# Visualization
import plotly.express as px

from sklearn.svm import SVC # for Support Vector Classification baseline model
from sklearn.semi_supervised import SelfTrainingClassifier # for Semi-Supervised learning

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix,precision_score,\
recall_score,roc_auc_score,classification_report,fbeta_score,precision_recall_curve,roc_curve,log_loss

In [None]:
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides warnings emitted by the libraries used below.
warnings.simplefilter('ignore')
np.random.seed(1000)  # seed NumPy's global random number generator

In [None]:
# Load the training and test sets.
# The data folder can be overridden through the DATA_DIR environment variable so
# the notebook is not tied to a single machine; the default is the original path.
DATA_DIR = os.environ.get(
    'DATA_DIR',
    'C:/Users/MBBLABS/Desktop/Python/1. Models/3. Project/Data/less_feature',
)
# The 'Unnamed: 0' column of each CSV is used as the row index.
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'), index_col='Unnamed: 0')
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'), index_col='Unnamed: 0')

### Preprocessing: 
#### 1. Changing data type <br> 2. splitting data <br> 3. assigning label

In [None]:
# Inspect the column dtypes and non-null counts of the training data
# (the 'y' column is converted to an integer type in a later cell).
train.info()

In [None]:
# Preview the first two rows of the training set, then of the test set.
display(train.head(2))
test.head(2)

#### Assigning '-1' as the label of the unlabelled rows

In [None]:
# Rows with a missing 'y' get the placeholder label -1; then show the label counts.
train['y'] = train['y'].fillna(-1)
train['y'].value_counts()

In [None]:
# NOTE(review): presumably the ratio between the two labelled class counts shown
# above — confirm, and consider deriving it from value_counts() instead of typing it in.
1834/166

In [None]:
# Convert 'y' to int32 and re-check the dtypes.
train['y'] = train['y'].astype('int32')
train.info()

#### -- For training data

In [None]:
# Every column except the last is a feature; the last column is taken as the target.
# X / y cover all training rows, labelled and unlabelled (-1) alike.
X, y = train.iloc[:, :-1], train.iloc[:, -1]

# Labelled subset only: rows whose 'y' is not the -1 placeholder.
X_lbl = train.loc[train['y'] != -1].iloc[:, :-1]
y_lbl = train.loc[train['y'] != -1].iloc[:, -1]
X_lbl.shape, y_lbl.shape

#### -- For test data

In [None]:
# Same column convention for the test set: all but the last column are features,
# the last column is taken as the target.
X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]
X_test.shape, y_test.shape

## Logistic Regression

In [None]:
# Empty results table with one column for the algorithm name and one for its ROC AUC.
index = ['Algorithm', 'ROC AUC']
results = pd.DataFrame(columns=index)

In [None]:
# Supervised baseline: logistic regression fitted on the labelled rows only.
# class_weight must be the None object (no re-weighting); the string 'None' is
# not an accepted value and makes scikit-learn reject the estimator.
logreg = LogisticRegression(random_state=1, class_weight=None)
logreg.fit(X_lbl, y_lbl)

# ROC AUC on the test set, scored with column 1 of predict_proba.
logreg_auc = roc_auc_score(y_test, logreg.predict_proba(X_test)[:, 1])

# DataFrame.append was removed in pandas 2.0; pd.concat adds the row instead.
results = pd.concat(
    [results, pd.DataFrame([['Logistic Regression', logreg_auc]], columns=index)],
    ignore_index=True,
)

In [None]:
# Show the results table collected so far.
display(results)

In [None]:
# Hard class predictions of the baseline model on the test set.
pred_test = logreg.predict(X_test)

# F1 score of those predictions.
f1_test = f1_score(y_test, pred_test)
print('The f1 score for the testing data:', f1_test)

# Confusion matrix drawn as an annotated heatmap with integer cell counts.
sns.heatmap(
    confusion_matrix(y_test, pred_test),
    annot=True,
    fmt='d',
    cmap='Blues',
)

#### Classification Report:

In [None]:
# Per-class precision / recall / F1 of the baseline predictions on the test set.
print(classification_report(y_test, pred_test))

#### Threshold tuning:

In [None]:
def to_labels(pos_probs, threshold):
    """Turn positive-class probabilities into boolean class decisions.

    Parameters
    ----------
    pos_probs : object supporting ``>=`` (e.g. a NumPy array)
        Predicted probability of the positive class.
    threshold : float
        Cut-off; an entry is flagged positive when its probability is
        greater than or equal to this value.

    Returns
    -------
    The element-wise result of ``pos_probs >= threshold``.
    """
    is_positive = pos_probs >= threshold
    return is_positive
 
# Column 1 of the baseline's predict_proba output on the test set.
y_prob = logreg.predict_proba(X_test)[:, 1]

# Candidate cut-offs 0.00, 0.01, ..., 0.99 and the F1 score each one yields.
# NOTE(review): the cut-off is selected on the test set itself, so the tuned
# score is optimistic — consider a separate validation split.
thresholds = np.arange(0, 1, 0.01)
scores = list(map(lambda t: f1_score(y_test, to_labels(y_prob, t)), thresholds))

# Report the cut-off with the highest F1 (the first one on ties).
ix = np.argmax(scores)
print('Threshold=%.3f, F1-Score=%.5f' % (thresholds[ix], scores[ix]))

# F1 as a function of the cut-off.
plt.plot(thresholds, scores)
plt.title('F1-score vs Threshold ')
plt.xlabel('threshold')
plt.ylabel('F1-score')
plt.show()

In [None]:
# Apply the best cut-off found by the sweep rather than a hard-coded 0.430, so
# this cell stays in sync when the data or the model changes.
y_pred_tuned = to_labels(y_prob, thresholds[ix])
print(classification_report(y_test, y_pred_tuned))

## Self-Training Classifier

In [None]:
# Base learner for the manual self-training below, first fitted on the labelled rows only.
lgs =  LogisticRegression()
lgs.fit(X_lbl,y_lbl)

In [None]:
# Unlabelled rows are the ones carrying the -1 placeholder label. Selecting them
# by label (instead of by the hard-coded row position 2000) stays correct if the
# number or order of labelled rows changes.
X_unl = X[(y == -1).values]

In [None]:
# Class probabilities and hard predicted labels for the unlabelled rows,
# both taken from the current `lgs` model.
probs = lgs.predict_proba(X_unl)
label = lgs.predict(X_unl)

In [None]:
# One row per unlabelled sample: the two probability columns, the predicted
# label, and the larger of the two probabilities ('max').
# Column order is kept as 0-Prob, 1-Prob, label, max because a later cell
# addresses a column of this frame by position.
df = pd.DataFrame(probs, columns=['0-Prob', '1-Prob']).assign(label=label)
df['max'] = df.loc[:, ['0-Prob', '1-Prob']].max(axis=1)

In [None]:
# Summary statistics of the '1-Prob' column.
df['1-Prob'].describe()

In [None]:
# Summary statistics of the '0-Prob' column.
df['0-Prob'].describe()

In [None]:
# How many unlabelled rows were predicted as each class.
df['label'].value_counts()

In [None]:
# Summary statistics of the 'max' column (the larger of the two probabilities per row).
df['max'].describe()

In [None]:
# Display the probability / label frame.
# NOTE(review): this renders the whole frame; df.head() may be enough.
df

In [None]:
# Manual self-training sweep over the confidence cut-off k.
# For each k from 0.5 up to (not including) 1 in steps of 1e-4, the unlabelled
# rows whose 'max' probability exceeds k are appended, with their predicted
# labels, to the labelled data; `lgs` is refitted and scored by F1 on the test
# set. The best-scoring fit (the latest one on ties) is kept pickled in `s`.
# NOTE(review): the cut-off is chosen using the test set, so the reported F1 is
# optimistic — confirm whether a validation split should be used instead.
nc = np.arange(.5, 1, .0001)
f1_max = 0
acc = []  # test F1 for every cut-off, in sweep order
i = 0     # number of cut-offs processed
for k in np.nditer(nc):
    conf_ind = df['max'] > k

    # Labelled rows followed by the confidently pseudo-labelled rows.
    X_train1 = np.concatenate((X_lbl, X_unl[conf_ind.values]), axis=0)
    y_train1 = np.concatenate(
        (np.ravel(y_lbl), np.ravel(df[conf_ind.values]['label']))
    )

    lgs.fit(X_train1, y_train1)
    f1 = f1_score(y_test, lgs.predict(X_test))
    acc.append(f1)

    # Remember the best model so far together with its score and cut-off.
    if f1 >= f1_max:
        f1_max, k_best = f1, k
        s = pickle.dumps(lgs)
    i += 1

In [None]:
# Confidence cut-off that gave the best test F1 in the sweep.
k_best

In [None]:
# Highest F1 score recorded during the sweep.
max(acc)

In [None]:
# Restore the best model from the pickled bytes kept in `s`.
cls = pickle.loads(s)

In [None]:
# Test-set predictions of the restored model and their F1 score.
pred = cls.predict(X_test)
f1_score(y_test,pred)

In [None]:
# Accuracy, precision, recall and F1 of `pred` on the test set, as one tuple.
(
    accuracy_score(y_test, pred),
    precision_score(y_test, pred),
    recall_score(y_test, pred),
    f1_score(y_test, pred),
)

In [None]:
# Annotated confusion-matrix heatmap for `pred`, with a title and axis labels.
sns.heatmap(
    confusion_matrix(y_test, pred),
    annot=True,
    fmt='d',
    cmap='Blues',
)
plt.title('Confusion Matrix', size=15)
plt.xlabel('Predictions', size=15)
plt.ylabel('True Values', size=15)

In [None]:
# Per-class precision / recall / F1 for `pred` on the test set.
print(classification_report(y_test,pred))

In [None]:
# Fit scikit-learn's SelfTrainingClassifier on all training rows; rows labelled
# -1 are treated as unlabelled. No earlier cell defines `clf_ST`, so without
# this fit the cell fails with a NameError on a fresh kernel.
# NOTE(review): the base estimator and the default settings are an assumption —
# the cell that originally built clf_ST is not in the notebook; confirm the setup.
clf_ST = SelfTrainingClassifier(LogisticRegression())
clf_ST.fit(X, y)

# Predict once and reuse the result for all three metrics.
pred_ST = clf_ST.predict(X_test)
print(f'accuracy score: {accuracy_score(y_test,pred_ST)},\nprecision:{precision_score(y_test,pred_ST)},\nrecall: {recall_score(y_test,pred_ST)}')

In [None]:
# Confusion-matrix heatmap of clf_ST's predictions on the test set.
# Requires `clf_ST` to have been fitted before this cell runs.
sns.heatmap(confusion_matrix(y_test, clf_ST.predict(X_test)),annot=True,fmt='d',cmap = 'Blues')

In [None]:
# Side-by-side confusion matrices: supervised baseline (left) vs. self-training (right).
fig, (a1, a2) = plt.subplots(1, 2, figsize=(10, 4))

# The left panel is titled as the threshold-tuned model, so it plots the tuned
# predictions (y_pred_tuned) rather than the default-threshold pred_test.
# to_labels returns booleans; they are cast to int to line up with y_test
# (assumed to hold 0/1 labels).
tuned_cm = confusion_matrix(y_test, np.asarray(y_pred_tuned).astype(int))
sns.heatmap(tuned_cm, annot=True, fmt='d', cmap='Blues', ax=a1)
a1.set_title('Surpervised_threshold_tuned')

# Requires `clf_ST` to have been fitted before this cell runs.
sns.heatmap(confusion_matrix(y_test, clf_ST.predict(X_test)), annot=True, fmt='d', cmap='Blues', ax=a2)
a2.set_title('Self_training_classifier')

#### Experiment: testing by appending only the '1' class

In [None]:
# Experimental sweep over cut-offs k from 0.03 up to (not including) 1 in steps of 0.001.
# NOTE(review): this cell overwrites nc, f1_max, acc, i, k_best and s from the
# earlier sweep, keeps refitting the shared `lgs`, and picks the cut-off by F1
# on the test set.
nc=np.arange(.03,1,.001)
f1_max = 0   # best test F1 seen so far
acc=[]  # test F1 for every cut-off, in sweep order
i=0  # iteration counter; only incremented in this cell
for k in np.nditer(nc):
    # Rows whose '1-Prob' exceeds k ...
    conf_ind= df["1-Prob"]>k
    # ... get column 2 of df set to 1. df is edited in place, so the change
    # persists into later iterations and later cells.
    # NOTE(review): column 2 is presumably 'label' — confirm against how df was built.
    df.iloc[conf_ind.values,2] = 1
    # Labelled data plus those rows, using the 'label' column as their target.
    X_train1 = np.append(X_lbl,X_unl[conf_ind.values],axis=0)
    y_train1 = np.append(y_lbl,df[conf_ind.values]['label'])

    # Then also append the rows whose '0-Prob' exceeds k, again with their
    # current 'label' values.
    # NOTE(review): for small k a row can presumably pass both filters and be
    # appended twice — confirm this is intended.
    conf_ind= df["0-Prob"]>k
    X_train1 = np.append(X_train1,X_unl[conf_ind.values],axis=0)
    y_train1 = np.append(y_train1,df[conf_ind.values]['label'])

    # Refit on the enlarged training set and score on the test set.
    lgs.fit(X_train1, y_train1)
    f1 = f1_score(y_test,lgs.predict(X_test))
    # Keep a pickled copy of the best model so far (ties go to the later cut-off).
    if f1 >= f1_max:
        s = pickle.dumps(lgs)
        f1_max = f1
        k_best = k
    acc.append(f1)
    i = i + 1