In [None]:
import numpy as np
import pandas as pd
import json

def get_features(features_file, targets_file, train_val_test_indices_file):
  """Load features and targets and split them into train/val/test subsets.

  Args:
    features_file: Path (str) to the feature matrix. If the name contains
      '.csv' it is read with pandas and the first two columns are dropped
      (index and title text in the original data layout); otherwise it is
      loaded with ``np.load``.
    targets_file: Path to a file loadable with ``np.load`` holding the labels.
    train_val_test_indices_file: Path to a JSON file with the keys 'train',
      'val' and 'test', each mapping to the row indices of that split.

  Returns:
    Tuple ``(train_feats, train_labels, val_feats, val_labels, test_feats,
    test_labels)`` where every element is selected along axis 0.
  """
  # Load the split indices; the context manager guarantees the file handle
  # is closed even if the JSON is malformed.
  with open(train_val_test_indices_file) as f:
    c_indices = json.load(f)

  def split(data):
    # Pick the rows of `data` belonging to each split, in train/val/test order.
    return tuple(np.take(data, c_indices[part], axis=0)
                 for part in ('train', 'val', 'test'))

  # Load targets file
  c_targets = np.load(targets_file)
  train_labels, val_labels, test_labels = split(c_targets)

  if '.csv' in features_file:   # Filename has .csv
    df = pd.read_csv(features_file)      # Read csv into pandas dataframe
    feats = df.iloc[:, 2:].to_numpy()    # Drop the first two columns, convert to numpy
  else:
    feats = np.load(features_file)

  train_feats, val_feats, test_feats = split(feats)

  return (train_feats,train_labels,val_feats,val_labels,test_feats,test_labels)



In [None]:
def fmtF1Score(f1Score):
  """Return the given score as a string with three digits after the decimal point."""
  return "{:.3f}".format(f1Score)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC 
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score

def train_and_eval(model_name,model_hypers,train_feats,train_labels,val_feats,val_labels):
  """Fit candidate models on the training split and select by validation F1.

  Every candidate is fit on (train_feats, train_labels) and scored with the
  micro-averaged F1 of its predictions on (val_feats, val_labels). Ties keep
  the first candidate evaluated because the comparison is a strict ``>``.

  Args:
    model_name: One of 'knn', 'svm', 'xgboost', 'lightgbm', 'mlp'.
    model_hypers: Dict with the search grid for the chosen model:
      'knn' -> {'k_range': iterable of neighbour counts};
      'svm' -> {'C': [...], 'gamma': [...], 'kernel': [...], 'degree': [...]}
               (C x gamma is searched for 'rbf', degree for 'poly'; any other
               kernel is evaluated once with gamma='auto');
      'mlp' -> {'h': list of lists of hidden-layer sizes, each expressed as a
               fraction of the number of input features}.
      Not read for 'xgboost' / 'lightgbm', which run with library defaults.
    train_feats, train_labels: Training data.
    val_feats, val_labels: Validation data.

  Returns:
    Tuple ``(best_val_scores, best_val_model_params)``. The params are the
    best k for 'knn', a dict holding only the keys relevant to the winning
    kernel for 'svm', the winning entry of 'h' for 'mlp', and the placeholder
    ``[0]`` for 'xgboost' / 'lightgbm'. An unrecognised model_name returns
    ``(0, [0])``.
  """
  def val_f1(model):
    # Fit on the training split, then score micro-averaged F1 on validation.
    model.fit(train_feats, train_labels)
    y_pred = model.predict(val_feats)
    return f1_score(val_labels, y_pred, average='micro')

  best_val_scores = 0
  best_val_model_params = [0]

  if model_name == 'knn':
    bestScore = -1
    bestK = -1
    for k in model_hypers['k_range']:
      f1Score = val_f1(KNeighborsClassifier(n_neighbors=k))
      print("KNN: k="+str(k)+" f1 score="+fmtF1Score(f1Score))
      if f1Score > bestScore:
        bestScore = f1Score
        bestK = k
    best_val_scores = bestScore
    best_val_model_params = bestK
  elif model_name == 'svm':
    C = model_hypers['C']
    gamma = model_hypers['gamma']
    kernels = model_hypers['kernel']
    degree = model_hypers['degree']
    bestScore = -1
    best_params = {}
    for k in kernels:
      if k == 'rbf':
        for c in C:
          for g in gamma:
            f1Score = val_f1(SVC(kernel='rbf', probability=True, C=c, gamma=g))
            # Log the C value under test (lowercase c), not the whole grid.
            print('SVM: kernel='+k+';c='+str(c)+';gamma='+str(g)+';score='+fmtF1Score(f1Score))
            if f1Score > bestScore:
              bestScore = f1Score
              # Build a fresh dict so keys from a previously winning kernel
              # (e.g. 'degree') cannot linger in the result.
              best_params = {'C': c, 'gamma': g, 'kernel': k}
      elif k == 'poly':
        for d in degree:
          f1Score = val_f1(SVC(kernel=k, degree=d))
          print('SVM: kernel='+k+';d='+str(d)+';score='+fmtF1Score(f1Score))
          if f1Score > bestScore:
            bestScore = f1Score
            best_params = {'kernel': k, 'degree': d}
      else:
        # Any other kernel name is evaluated once with gamma='auto'.
        f1Score = val_f1(SVC(kernel=k, gamma='auto'))
        # Report the kernel actually evaluated rather than assuming sigmoid.
        print('SVM: kernel='+k+';score='+fmtF1Score(f1Score))
        if f1Score > bestScore:
          bestScore = f1Score
          best_params = {'kernel': k}

    best_val_scores = bestScore
    best_val_model_params = best_params
  elif model_name == 'xgboost':
    best_val_scores = val_f1(XGBClassifier())
  elif model_name == 'lightgbm':
    best_val_scores = val_f1(LGBMClassifier())
  elif model_name == 'mlp':
    bestScore = -1
    best_params = [0]
    for h in model_hypers['h']:
      # Hidden-layer widths are given as fractions of the input width; scale
      # by the number of feature columns and round up to whole units.
      num_inputs = np.shape(train_feats)[1]
      hnt = tuple(np.ceil(np.array(h) * num_inputs).astype(int))
      model = MLPClassifier(hidden_layer_sizes=hnt, activation='relu', solver='adam', max_iter=500)
      f1Score = val_f1(model)
      print('MLP: params='+str(h)+';score='+fmtF1Score(f1Score))
      if f1Score > bestScore:
        bestScore = f1Score
        best_params = h

    best_val_scores = bestScore
    best_val_model_params = best_params

  return (best_val_scores,best_val_model_params)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC 
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score

def test_on_combined_train_val(model_name,best_val_model_params,train_feats,train_labels,test_feats,test_labels):
  final_test_score = 0
  final_model = []

  #print(train_feats.shape)
  #print(test_feats.shape)
   
  if( model_name == 'knn') :
    k = best_val_model_params
    model = KNeighborsClassifier(n_neighbors=k)  
  elif( model_name == 'svm') :
      if(best_val_model_params['kernel'] == 'rbf'):
          c = best_val_model_params['C']
          g = best_val_model_params['gamma']
          model = SVC(kernel='rbf', probability=True, C=c, gamma=g)  
      elif (best_val_model_params['kernel'] == 'poly'):
        d = best_val_model_params['degree']
        model = SVC(kernel='poly', degree=d)        
      else: #(best_val_model_params['kernel'] == 'sigmoid')
        model = SVC(kernel='sigmoid',gamma='auto')
  elif( model_name == 'xgboost') :
    model = XGBClassifier()  
  elif( model_name == 'lightgbm') :        
    model = LGBMClassifier()  
  elif( model_name == 'mlp') :    
    h = best_val_model_params
    hn = np.array(h)      
    num_inputs = np.shape(test_feats)[1]
    hn = np.ceil(hn * num_inputs)
    hn = hn.astype(int)      
    hnt = tuple(hn)                  
    model = MLPClassifier(hidden_layer_sizes=hnt, activation='relu', solver='adam', max_iter=1000) 

  model.fit(train_feats,train_labels)    
  y_pred = model.predict(test_feats)      
  print(confusion_matrix(test_labels, y_pred))
  final_test_score =  f1_score(test_labels, y_pred,average='micro')     
  final_model = model    
   
  return (final_test_score,final_model)