In [1]:
import warnings
warnings.filterwarnings("ignore")

In [1]:
import pandas as pd
import os
import pypdf
import json

# Task 1

## labeled data

In [2]:
folder = 'anno_data'

In [3]:
df_labeled = pd.read_csv(folder +'labeled_data.csv')

In [4]:
df_labeled.shape

(5000, 2)

## prediction data

In [5]:
df_prediction = pd.read_csv(folder + 'prediction_data.csv')

In [6]:
df_prediction.shape

(45000, 1)

In [7]:
df_prediction.head(3)

Unnamed: 0,id
0,440467
1,416422
2,11040


## read pdf files

In [8]:
# set up folder dirtory
folder = 'anno_data/'

In [11]:
# create an empty list to hold data from all PDF files
data_pdf = []
# set the directory containing the PDF files
pdf_dir = folder + 'pdf'
# Loop over all files in the directory that end with '.pdf'
for files in os.listdir(pdf_dir):
    if files.endswith('.pdf'):
        #Read the pdf data from the file and append it to the 'data' list
        with open(os.path.join(pdf_dir, files), 'rb') as inputfile:
            reader = pypdf.PdfReader(inputfile)
            page = reader.pages[0]
            data_file = page.extract_text()
            data_pdf.append(data_file)
# split each element in the data_pdf into two parts
split_list = [elem.split("\n", 1) for elem in data_pdf]

# create a datafream with two columns: "Id" and "Text"
df_pdf = pd.DataFrame(split_list, columns=["id", "text"])

# covert the "ID" column to integers
df_pdf["id"] = df_pdf["id"].str.strip('Report id:').astype(int)
df_pdf.head()

Unnamed: 0,id,text
0,124347,Information received by ********** indicated t...
1,86976,It was reported that transmitter failed error ...
2,419543,A non−healthcare professional reported that du...
3,258793,"The customer reported to olympus, the sheath o..."
4,101676,It was reported that an altitude alarm occurre...


In [14]:
print(data_file)

Report id: 17904
It was reported that a failure was observed during a
 planned preventive maintenance or recall remediation service event. There was
 no reported patient involvement.
5pCO1xvG i3Q1RGDH BYvMCATs ZIhArKQW CPitL2Zi P8ZPahs6 EmPaS4TE
 zs50gyBm F0K4ztT2 e6bCC5sa


In [16]:
print(split_list)

[['Report id: 124347', 'Information received by ********** indicated that the customer had a\n loss of communication issue between the insulin pump and the\n transmitter. The customer also reported a charging issue with the\n transmitter. No harm requiring medical intervention was reported. Troubleshooting was\n not performed. It is unknown whether the customer will continue\n or discontinue the use of the device, and the transmitter\n will not be returned for analysis.\nd4dJMKtZ fhBgpf4C taPBw274 GwJBDJ9g aDW4HiXz\n b6SBiuT1 NW3p3rNr aRxMFHEx zDLqXiEH CKL2PLKt'], ['Report id: 86976', 'It was reported that transmitter failed error occurred. Data was\n evaluated and the allegation was not confirmed. The probable cause\n could not be determined. No injury or medical intervention was\n reported.\nr52Mut8O H7uRBswP OiopyJzW PI3ITzgn RXL5wT6z 7yqUxQ2B BZaO8kkF VdpYdPqN XKhQmIEn bc80Q8Ox\n'], ['Report id: 419543', 'A non−healthcare professional reported that during the intraocular lens (iol)

In [12]:
for index, text in enumerate(df_pdf['text'][30:32]):
    print('Review %d:\n'%(index+1), text)

Review 1:
 A customer reported receiving erroneous glucose results from an abbott
 diabetes care device. The results when plotted on a parkes
 error grid fell into either the c, d, or e
 zone. There was no report of death, serious injury, or
 mistreatment associated with this event.
qXj7cWJ1 13lQrGxZ 0P95iND8 VTnVvFyo EHQXvG2K eoUqAc5m
 liTKFCUB wWkhMmV4 vDdHKxGL 30A8Aad1
Review 2:
 It was reported that the pump battery was depleting quickly.
 The customer.s blood glucose was not adversely impacted. Tandem technical
 support made multiple attempts to follow up with customer but
 was not able to.
jxhyzftv jmtprCXR 8mq5XbnU vAjhs0GB nzSwXp3k rz7z4Awq J4uFssHu
 Fn6PtEfM zjgBfIxm 4A2IzsvD


## read json file

In [8]:
# set the directory containing the JSON files
json_dir = folder + 'json'

# Create an empty list to hold the data from all the JSON files
data_json = []

# Loop over all files in the directory that ends with '.json'
for files in os.listdir(json_dir):
    if files.endswith('.json'):
        # Read the JSON data from the file and append it to the 'data' list
        with open(os.path.join(json_dir, files), encoding='cp1252') as inputfile:
                data_file = pd.read_json(inputfile)

                data_json.append(data_file)
                
#combine all the JSON data into a single DataFrame
df_json = pd.concat(data_json, ignore_index=True)
df_json.head()

Unnamed: 0,id,text
0,504666,It was reported that the programmer had a defe...
1,481650,It was reported that the customer experienced ...
2,287232,It was reported that the right ventricular (rv...
3,397279,The customer reported 37 false positive result...
4,219403,Information received by ********** indicated t...


## read txt file

In [10]:
# set up the directory containing txt file
txt_dir = folder + 'txt/reports.txt'
# read txt file into a dataframe
df_txt = pd.read_csv(txt_dir, delimiter='\|', engine='python')
# change the column name
df_txt = df_txt.rename(columns={'"id"': 'id', '"text"': 'text'})
# remove "" in the text
df_txt.text = df_txt.text.str.replace("\"","")
df_txt.head()

Unnamed: 0,id,text
0,250624,********** Received information that no com pu...
1,97993,The clinician reports the implant was inserted...
2,434954,Implant failed due to a failure to osseointegr...
3,418549,Implant failed due to a failure to osseointegr...
4,24121,It was reported when using the bd sedi-20 ther...


## read xlsx file

In [11]:
# set up the path containing xlsx file
xlsx_path = folder + 'xlsx/reports.xlsx'
# read xlsx file into a dataframe
df_xlsx = pd.read_excel(xlsx_path)

In [16]:
df_json.shape

(1000, 2)

# combine all files into a single dataframe

In [12]:
# combine all files into a single DataFrame
df = pd.concat([df_txt, df_json, df_pdf,df_xlsx], axis=0)
df.head()

Unnamed: 0,id,text
0,250624,********** Received information that no com pu...
1,97993,The clinician reports the implant was inserted...
2,434954,Implant failed due to a failure to osseointegr...
3,418549,Implant failed due to a failure to osseointegr...
4,24121,It was reported when using the bd sedi-20 ther...


In [13]:
df.shape

(50000, 2)

# export the final datafram including all data files to a csv file

In [None]:
df.to_csv('task1_xuan_ren_.csv', index=False)