In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import xml.etree.ElementTree as ET
import os
import cv2
import tensorflow as tf
from wordcloud import WordCloud
import re
from collections import defaultdict
import itertools
from collections import Counter

In [None]:
# Mount Google Drive at /content/drive; force_remount=True remounts even if it is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

In [None]:
# Switch to the project folder on Drive; later cells read and write files by relative path.
%cd '/content/drive/MyDrive/COMP 576 Final Project'

In [None]:
# Load the image/report table; the path is relative to the current working directory.
dataset = pd.read_csv("Image-Report.csv")

In [None]:
# Remove the 'Unnamed: 0' column. Rebinding the name (rather than inplace=True)
# together with errors='ignore' keeps this cell re-runnable: a second run is a
# no-op instead of a KeyError.
dataset = dataset.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Show the table after the 'Unnamed: 0' column has been dropped.
dataset

In [None]:
# (rows, columns) of the loaded table.
print('Dataset Shape:', dataset.shape)

In [None]:
# Rewrite every Image_path value as 'Scanned Images/<value>.png'.
# The column is assigned as a whole: writing to the rows yielded by iterrows()
# is not guaranteed to update the frame, because each row may be a copy.
# The per-row print was dropped; it emitted one line per row of the table.
# NOTE: not idempotent - run once per load, otherwise the prefix and suffix are added again.
dataset['Image_path'] = 'Scanned Images/' + dataset['Image_path'] + '.png'

In [None]:
# Number of distinct values in the Image_path column.
print('Number of Images:', dataset['Image_path'].nunique())

In [None]:
# Drop every row that has a missing value in any column.
dataset = dataset.dropna(axis=0)

In [None]:
# Per-column count of missing values.
dataset.isnull().sum()

In [None]:
# Current (rows, columns) of the table.
print(dataset.shape)

In [None]:
# Preview the first 12 rows.
dataset.head(12)

In [None]:
# Group the rows by person. The person key is the image path with its last
# '-'-separated component removed (empty string when the path has no '-').
mapping = defaultdict(int)  # person key -> number of rows for that person
find = {}                   # person key -> findings of the last row seen for that person
for imgPath, findings in dataset.values:  # unpacking requires exactly two columns
  personKey = imgPath.rpartition('-')[0]
  mapping[personKey] += 1
  find[personKey] = findings



In [None]:
def splitData(mapping, train_end=3100, cv_end=3200):
    """Split the person ids in `mapping` into train / cv / test lists by position.

    The ids are taken in the dict's key order; no shuffling is performed.

    Parameters
    ----------
    mapping : dict
        Dictionary whose keys are the person ids.
    train_end : int, default 3100
        Ids at positions [0, train_end) form the training split.
    cv_end : int, default 3200
        Ids at positions [train_end, cv_end) form the cross-validation split;
        ids from position cv_end onwards form the test split.

    Returns
    -------
    tuple of (list, list, list)
        persons_train, persons_cv, persons_test.

    Raises
    ------
    ValueError
        If the boundaries do not satisfy 0 <= train_end <= cv_end.
    """
    if not 0 <= train_end <= cv_end:
        raise ValueError(
            f"expected 0 <= train_end <= cv_end, got train_end={train_end}, cv_end={cv_end}"
        )
    persons = list(mapping)
    persons_train = persons[:train_end]
    persons_cv = persons[train_end:cv_end]
    persons_test = persons[cv_end:]
    return persons_train, persons_cv, persons_test

In [None]:
# Person-id lists for the three splits.
# NOTE(review): train / cv / test are rebound to DataFrames by the later
# finalDataset cell, so this cell and the mapImageId cell must run before it.
train, cv, test = splitData(mapping)

In [None]:
def mapImageId(images, paths=None):
  """Collect, for every person id in `images`, the image paths that belong to it.

  A path belongs to an id when the path with its last '-'-separated component
  removed equals the id. Plain substring matching is avoided on purpose: it
  also attaches the paths of any other id that merely contains this id.

  Args:
    images: iterable of person ids.
    paths: iterable of image-path strings to search. Defaults to the
      'Image_path' column of the module-level `dataset`.

  Returns:
    defaultdict(list) mapping each id that owns at least one path to its
    paths, in the order they appear in `paths`. Ids with no path are absent.
  """
  if paths is None:
    paths = dataset['Image_path'].values

  # Index the paths once instead of rescanning all of them for every id.
  pathsByPrefix = defaultdict(list)
  for path in paths:
    pathsByPrefix[path.rpartition('-')[0]].append(path)

  mapDict = defaultdict(list)
  for imageId in images:
    if imageId in pathsByPrefix:  # membership test does not create a key
      mapDict[imageId].extend(pathsByPrefix[imageId])
  return mapDict

In [None]:
# Person id -> image paths lookup for each split, built in train, cv, test order.
trainMap, cvMap, testMap = (mapImageId(personIds) for personIds in (train, cv, test))

In [None]:
# Number of persons in each split (the same operand was listed twice before).
len(trainMap), len(cvMap), len(testMap)

In [None]:
# Number of persons that have exactly one row in the table.
count = sum(1 for totalNum in mapping.values() if totalNum == 1)

print(count)



In [None]:
def finalDataset(data):
  """Build a one-row-per-person frame from an id -> image-path-list mapping.

  Persons with one image get that image in both image columns; persons with
  two images get them in order. Persons with any other number of images are
  left out. The report text is looked up in the module-level `find` dict.

  Args:
    data: mapping of person id -> list of image paths.

  Returns:
    DataFrame with columns 'Person_id', 'Image1', 'Image2', 'Report'.
  """
  records = []
  for personId, paths in data.items():
    if len(paths) not in (1, 2):
      continue
    # paths[-1] is paths[0] for a single image and paths[1] for a pair.
    records.append((personId, paths[0], paths[-1], find[personId]))

  frame = pd.DataFrame()
  frame['Person_id'] = [record[0] for record in records]
  frame['Image1'] = [record[1] for record in records]
  frame['Image2'] = [record[2] for record in records]
  frame['Report'] = [record[3] for record in records]
  return frame



In [None]:
# Replace the id lists with the per-person frames, built in train, test, cv order.
train, test, cv = (finalDataset(idMap) for idMap in (trainMap, testMap, cvMap))

In [None]:
# Show the cross-validation frame.
cv

In [None]:
# Report text of the cross-validation row labelled 0.
cv['Report'][0]

## Text Cleaning

In [None]:
import string
import spacy
import nltk
from nltk.corpus import stopwords
import re
import contractions
#nltk.download('stopwords')
def textPreProcessing(text):
    """Normalise one report string.

    Steps, in order:
      1. lowercase;
      2. expand contractions word by word with `contractions.fix`;
      3. remove all punctuation except '.';
      4. remove numeric characters;
      5. remove placeholder tokens made only of 'x' (two or more, e.g. 'xxxx');
      6. collapse runs of spaces and trim the ends.

    Parameters
    ----------
    text : str
        Raw report text.

    Returns
    -------
    str
        The cleaned text.
    """
    lowered = text.lower()
    expanded = ' '.join(contractions.fix(word) for word in lowered.split())

    # Every punctuation mark except '.' is deleted; "'" is among them, so no
    # separate quote-removal step is needed afterwards.
    removable = string.punctuation.replace('.', '')
    noPunct = expanded.translate(str.maketrans('', '', removable))

    noDigits = ''.join(char for char in noPunct if not char.isnumeric())

    # Only whole tokens consisting of x's are removed. The pattern r'x*' would
    # delete every 'x' in the text and turn e.g. 'thorax' into 'thora'.
    noPlaceholders = re.sub(r'\bx{2,}\b', '', noDigits)

    # Collapse spaces last so gaps left by the removals above do not survive.
    return re.sub(' +', ' ', noPlaceholders).strip()


In [None]:
# Show the punctuation set and derive the same set without the period.
punctuations = string.punctuation
print(punctuations)
newString = punctuations.replace('.', '')

In [None]:
# Clean the report text of every split. The cleaned column is assigned back to
# the frame: writing to the rows yielded by iterrows() is not guaranteed to
# update the frame, because each row may be a copy.
for split in (train, test, cv):
  split['Report'] = split['Report'].apply(textPreProcessing)

In [None]:
# NOTE(review): `contractions` is imported by the text-cleaning cell above, so on a
# fresh kernel this install has to run before that cell.
%pip install contractions

In [None]:
# Inspect the report of the training row labelled 500.
train['Report'][500]

In [None]:
# Word count (whitespace-separated tokens) of every training report.
l = [len(e.split()) for e in train['Report'].values]

In [None]:
# Largest word count among the training reports.
max(l)

In [None]:
# Feed WordCloud the concatenated training reports. `l` holds only the word
# counts per report, so str(l) contains numbers and no report words.
w = WordCloud(height=1500, width=1500).generate(' '.join(train['Report'].values))

In [None]:
# Draw the word cloud on an explicit 12x12 figure.
fig, ax = plt.subplots(figsize=(12,12))
ax.set_title('WordCloud of Reports')
ax.imshow(w)

In [None]:
# Clean every report and wrap it in the 'startseq' / 'endseq' markers. The
# column is assigned back to the frame: writing to the rows yielded by
# iterrows() is not guaranteed to update the frame.
# NOTE: not idempotent - re-running this cell wraps the markers a second time.
for split in (train, test, cv):
  split['Report'] = split['Report'].apply(
      lambda report: 'startseq' + ' ' + textPreProcessing(report) + ' ' + 'endseq')

In [None]:
# Write each split to its own CSV file in the working directory, without the index column.
for splitFrame, csvPath in ((train, 'train_Data.csv'), (test, 'test_Data.csv'), (cv, 'cv_Data.csv')):
  splitFrame.to_csv(csvPath, index=False)

In [None]:
# Inspect the report of the training row labelled 0.
train['Report'][0]

In [None]:
# Show the per-person row counts.
mapping