Importing libraries and train data from google drive

In [50]:
import pandas as pd
import pickle
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import *
from sklearn.svm import SVC

import math
from datetime import datetime

import seaborn as sns
import matplotlib.pyplot as plt



# Preprocessing 

In [None]:
from google.colab import drive
drive.mount('/content/drive/virufy/developers/text/data')

In [20]:
def preProcessing(inFile):
  """
  Preprocess the input file to the standard format. 
  Parameter:
    inFile: string containing the input file name in .csv format
  Returns: target labels value (in 1 or 0) and preprocessed dataframe 
  """
  labels_f = 'trainText.csv'
  labels_df = pd.read_csv(labels_f)
  #dropping the columns unrealted to the text model
  labels_df = labels_df.drop(columns=["date", "cough_filename", 'Unnamed: 8'])

  #removing the columns which does not contain the information on covid tests
  labels_df["corona_test"].fillna("None", inplace = True) 
  labels_df = labels_df.drop(labels_df[labels_df.corona_test == "None"].index)

  #replacing the empty values with None
  labels_df ["smoker"].fillna("None", inplace = True) 
  labels_df ["patient_reported_symptoms"].fillna("None", inplace = True)
  labels_df ['age'].fillna("None", inplace = True)
  labels_df ['gender'].fillna("None", inplace = True)
  labels_df ['medical_history'].fillna("None", inplace = True)


  #CORONA_TEST
  newdf= labels_df.replace(to_replace ="negative", 
                  value =0)
  newdf = newdf.replace(to_replace ="positive", 
                  value =1)
  newdf= newdf.replace(to_replace ="FALSE", 
                  value =0)
  newdf = newdf.replace(to_replace ="TRUE", 
                  value =1)

  #AGE
  newdf['age'] = LabelEncoder().fit_transform(newdf['age'])

  #Gender
  newdf['gender'] = newdf['gender'].str.lower()
  newdf['gender'] = LabelEncoder().fit_transform(newdf['gender'])

  #medical_history
  newdf["medical_history"] = newdf["medical_history"].str.lower()
  med_history = ['None', 'Congestive heart failure','Disease or conditions that make it harder to cough,' \
                      'Asthma or chronic lung disease','pregnancy,', 'Diabetes with complications']
      
  for mh in med_history:
      newdf[mh] = newdf.medical_history.str.contains(mh).astype(int)
      newdf["medical_history"] = newdf.medical_history.str.replace(mh+ ",", "")

  newdf = newdf.drop(columns = ["medical_history"])

  #smoker
  newdf['smoker'] = LabelEncoder().fit_transform(newdf['smoker'])

  #symptoms
  newdf['patient_reported_symptoms'] = newdf['patient_reported_symptoms'].str.lower()

  symptoms = ['Fever, chills, or sweating', 'Shortness of breath', \
                    'New or worsening cough','Sore throat', 'Body aches', \
                    'Loss of taste', 'Loss of smell', 'None']
  for ps in symptoms:
      newdf[ps] = newdf.patient_reported_symptoms.str.contains(ps).astype(int)
      newdf["patient_reported_symptoms"] = newdf.patient_reported_symptoms.str.replace(ps+ ",", "")

  newdf = newdf.drop(columns = ["patient_reported_symptoms"])

  target_labels = newdf["corona_test"] 
  newdf = newdf.drop(columns = ["corona_test"])

  return target_labels,newdf

# **Training**

In [23]:
class trainText():
  """
  This is a class for training the text model data using trainData provided
  Attributes: 
  trainData : string in the form .csv file containg training data.
  """
  def __init__(self,trainData):
    self.trainData = trainData

  def prepData(self):
    """
    Prepare data for text model
    Returns: 
      x_train, y_train is the train data prepared
    """
    targetLabels, processedDf = preProcessing(self.trainData)
    x_train_orig, x_test, y_train_orig, y_test = train_test_split(processedDf,targetLabels, test_size=0.15, shuffle=True)
    x_train, y_train = SMOTE(sampling_strategy='minority').fit_resample(x_train_orig, y_train_orig)
    return x_train, y_train, x_test, y_test

  def trainModel(self):
    """
    Trains the text model using the trainData 
    Returns:
      a string comting saved model of form .sav
    """
    X_train,y_train,x_test, y_test = self.prepData()
    clf = SVC(kernel = "linear",C=1, degree = 2, gamma=0.001,random_state=0)
    clf.fit(X_train,y_train)

    print("\nTraining:")
    print("Accuracy: ",end="")
    pred = clf.predict(X_train)
    accuracy = accuracy_score(y_train, pred)
    print(accuracy)
    conf_mat = confusion_matrix(y_train, pred)
    print(conf_mat)
    print("\nTesting:")
    print("Accuracy: ",end="")
    pred = clf.predict(x_test)
    accuracy = accuracy_score(y_test, pred)
    print(accuracy)
    conf_mat = confusion_matrix(y_test, pred)
    print(conf_mat)
    print(classification_report(y_test, pred))
    print("\n\n")

    filename = 'textModelSVC.sav'
    pickle.dump(clf, open(filename, 'wb'))

    return filename





# Prediction

In [43]:
class predText():
  """
  This is a class for testing the text model data using testData provided
  Attributes: 
  testData : string in the form .csv file containg training data.
  """
  def __init__(self,testData, modelFile):
    self.trainData = testData
    self.modelfile = modelFile

  def prepData(self):
    """
    Prepare data for text model
    Returns: 
      targetLabels, processedDf is the test data prepared for model
    """
    targetLabels, processedDf = preProcessing(self.trainData)
    return targetLabels, processedDf

  def predict(self):
    """
    Predict for covid positive or negative using the saved model 
    Returns: 
      bool value depecting positive and negative covid results 
    """
    y_test, x_test,  = self.prepData()
    loaded_model = pickle.load(open(self.modelfile, 'rb'))
    pred = loaded_model.predict(x_test)
    print(pred)
    accuracy = accuracy_score(y_test,pred)
    print("Accuracy on test dataset : ", accuracy)
    conf_mat = confusion_matrix(y_test,pred)
    print("Confusion matrix :\n", conf_mat)
    print("Classification report \n", classification_report(y_test, pred))
    return pred

In [None]:
#text_fp = '/content/drive/My Drive/virufy/developers/text/data/text-50.csv'
#text_model_save_fp = '/content/drive/My Drive/virufy/developers/text/models/text-model.csv'
#test_fp = "testTrain.csv"
#train_fp = "tarinText.csv"
#if __name__ == "__main__":
#  from google.colab import drive
#  drive.mount('/content/drive')

In [46]:
if __name__ == "__main__":
  #training
  eq = Train("trainText.csv")
  modelFile = eq.trainModel()
  print(modelFile)
  #prediction 
  test_eq = predText("testText.csv","textModelSVC.sav")
  ans = test_eq.predict()


Training:
Accuracy: 0.83298755186722
[[385  97]
 [ 64 418]]

Testing:
Accuracy: 0.8387096774193549
[[75 14]
 [ 1  3]]
              precision    recall  f1-score   support

           0       0.99      0.84      0.91        89
           1       0.18      0.75      0.29         4

    accuracy                           0.84        93
   macro avg       0.58      0.80      0.60        93
weighted avg       0.95      0.84      0.88        93




textModelSVC.sav
[0 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 



# Ensemble

In [47]:
if __name__ == "__main__":
  test_eq = predText("testText.csv","textModelSVC.sav")
  ans = test_eq.predict()

[0 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 