Importing libraries and train data from google drive

In [None]:
import pandas as pd
import pickle
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import *
from sklearn.svm import SVC

import math
from datetime import datetime

import seaborn as sns
import matplotlib.pyplot as plt



  import pandas.util.testing as tm


In [None]:
from google.colab import drive
drive.mount('/content/drive/virufy/developers/text/data')

ValueError: ignored

# **Main class**

In [40]:
class Text():
  """
  This is a class for training , preprcoessing data and testing the text model data using testData provided
  """

  """
  def __init__(self, trainData, testData):
    self.trainData = trainData
    self.testData = testData
  """

  def preProcessing(self,inFile):
    """
    Preprocess the input file to the standard format. 
    Parameter:
      inFile: string containing the input file name in .csv format
    Returns: target labels value (in 1 or 0) and preprocessed dataframe 
    """
    labels_f = inFile
    labels_df = pd.read_csv(labels_f)
    #dropping the columns unrealted to the text model
    labels_df = labels_df.drop(columns=["date", "cough_filename"])

    #removing the columns which does not contain the information on covid tests
    labels_df["corona_test"].fillna("None", inplace = True) 
    labels_df = labels_df.drop(labels_df[labels_df.corona_test == "None"].index)

    #replacing the empty values with None
    labels_df ["smoker"].fillna("None", inplace = True) 
    labels_df ["patient_reported_symptoms"].fillna("None", inplace = True)
    labels_df ['age'].fillna("None", inplace = True)
    labels_df ['gender'].fillna("None", inplace = True)
    labels_df ['medical_history'].fillna("None", inplace = True)


    #CORONA_TEST
    newdf= labels_df.replace(to_replace ="negative", 
                    value =0)
    newdf = newdf.replace(to_replace ="positive", 
                    value =1)
    newdf= newdf.replace(to_replace ="FALSE", 
                    value =0)
    newdf = newdf.replace(to_replace ="TRUE", 
                    value =1)

    #AGE
    newdf['age'] = LabelEncoder().fit_transform(newdf['age'])

    #Gender
    newdf['gender'] = newdf['gender'].str.lower()
    newdf['gender'] = LabelEncoder().fit_transform(newdf['gender'])

    #medical_history
    newdf["medical_history"] = newdf["medical_history"].str.lower()
    med_history = ['None', 'Congestive heart failure','Disease or conditions that make it harder to cough,' \
                        'Asthma or chronic lung disease','pregnancy,', 'Diabetes with complications']
        
    for mh in med_history:
        newdf[mh] = newdf.medical_history.str.contains(mh).astype(int)
        newdf["medical_history"] = newdf.medical_history.str.replace(mh+ ",", "")

    newdf = newdf.drop(columns = ["medical_history"])

    #smoker
    newdf['smoker'] = LabelEncoder().fit_transform(newdf['smoker'])

    #symptoms
    newdf['patient_reported_symptoms'] = newdf['patient_reported_symptoms'].str.lower()

    symptoms = ['Fever, chills, or sweating', 'Shortness of breath', \
                      'New or worsening cough','Sore throat', 'Body aches', \
                      'Loss of taste', 'Loss of smell', 'None']
    for ps in symptoms:
        newdf[ps] = newdf.patient_reported_symptoms.str.contains(ps).astype(int)
        newdf["patient_reported_symptoms"] = newdf.patient_reported_symptoms.str.replace(ps+ ",", "")

    newdf = newdf.drop(columns = ["patient_reported_symptoms"])

    target_labels = newdf["corona_test"] 
    newdf = newdf.drop(columns = ["corona_test"])

    return target_labels, newdf


  def prepDataTrain(self, trainData):
    """
    Prepare data for text model
    Returns: 
      x_train, y_train is the train data prepared
    """
    targetLabels, processedDf = self.preProcessing(trainData)
    x_train_orig, x_test, y_train_orig, y_test = train_test_split(processedDf,targetLabels, test_size=0.15, shuffle=True)
    x_train, y_train = SMOTE(sampling_strategy='minority').fit_resample(x_train_orig, y_train_orig)
    return x_train, y_train, x_test, y_test

  def trainModel(self, trainData):
    """
    Trains the text model using the trainData 
    Returns:
      a string containing saved model of form .sav
    """
    X_train, y_train, x_test, y_test = self.prepDataTrain(trainData)
    clf = SVC(kernel = "linear",C=1, degree = 2, gamma=0.001,random_state=0)
    clf.fit(X_train,y_train)

    print("\nTraining:")
    print("Accuracy: ",end="")
    pred = clf.predict(X_train)
    accuracy = accuracy_score(y_train, pred)
    print(accuracy)
    conf_mat = confusion_matrix(y_train, pred)
    print(conf_mat)
    print("\nTesting:")
    print("Accuracy: ",end="")
    pred = clf.predict(x_test)
    accuracy = accuracy_score(y_test, pred)
    print(accuracy)
    conf_mat = confusion_matrix(y_test, pred)
    print(conf_mat)
    print(classification_report(y_test, pred))
    print("\n\n")

    filename = 'textModelSVC.sav'
    pickle.dump(clf, open(filename, 'wb'))

    return filename
  

  def prepDataTest(self, testData):
    """
    Prepare data for text model
    Returns: 
      targetLabels, processedDf is the test data prepared for model
    """
    targetLabels, processedDf = self.preProcessing(testData)
    return targetLabels, processedDf

  def predict(self, testData, modelFile):
    """
    Predict for covid positive or negative using the saved model 
    Returns: 
      bool value depecting positive and negative covid results 
    """
    y_test, x_test,  = self.prepDataTest(testData)
    loaded_model = pickle.load(open(modelFile, 'rb'))
    pred = loaded_model.predict(x_test)
    print(pred)
    accuracy = accuracy_score(y_test,pred)
    print("Accuracy on test dataset : ", accuracy)
    conf_mat = confusion_matrix(y_test,pred)
    print("Confusion matrix :\n", conf_mat)
    print("Classification report \n", classification_report(y_test, pred))
    return pred

In [42]:
if __name__ == "__main__":
  #training
  eq = Text()
  trainData = "trainText.csv"
  testData = "testText.csv"
  modelFile = eq.trainModel(trainData)
  #prediction 
  test_eq = eq.predict(testData,modelFile)



Training:
Accuracy: 0.7880658436213992
[[388  98]
 [108 378]]

Testing:
Accuracy: 0.8602150537634409
[[72 13]
 [ 0  8]]
              precision    recall  f1-score   support

           0       1.00      0.85      0.92        85
           1       0.38      1.00      0.55         8

    accuracy                           0.86        93
   macro avg       0.69      0.92      0.73        93
weighted avg       0.95      0.86      0.89        93




[0 1 0 0 0 1 1 1 1 1 1 1 1 1 1]
Accuracy on test dataset :  0.4666666666666667
Confusion matrix :
 [[4 8]
 [0 3]]
Classification report 
               precision    recall  f1-score   support

           0       1.00      0.33      0.50        12
           1       0.27      1.00      0.43         3

    accuracy                           0.47        15
   macro avg       0.64      0.67      0.46        15
weighted avg       0.85      0.47      0.49        15





# Ensemble

In [44]:
if __name__ == "__main__":
  text = Text()
  textModel = text.trainModel("trainText.csv")
  prediction = text.predict("testText.csv",textModel)
  


Training:
Accuracy: 0.8167701863354038
[[392  91]
 [ 86 397]]

Testing:
Accuracy: 0.7849462365591398
[[68 20]
 [ 0  5]]
              precision    recall  f1-score   support

           0       1.00      0.77      0.87        88
           1       0.20      1.00      0.33         5

    accuracy                           0.78        93
   macro avg       0.60      0.89      0.60        93
weighted avg       0.96      0.78      0.84        93




[0 1 0 0 0 1 1 1 1 1 1 1 1 1 1]
Accuracy on test dataset :  0.4666666666666667
Confusion matrix :
 [[4 8]
 [0 3]]
Classification report 
               precision    recall  f1-score   support

           0       1.00      0.33      0.50        12
           1       0.27      1.00      0.43         3

    accuracy                           0.47        15
   macro avg       0.64      0.67      0.46        15
weighted avg       0.85      0.47      0.49        15



