In [5]:
!pip install translate
!pip install langdetect

Collecting translate
  Downloading translate-3.6.1-py2.py3-none-any.whl (12 kB)
Collecting libretranslatepy==2.1.1 (from translate)
  Downloading libretranslatepy-2.1.1-py3-none-any.whl (3.2 kB)
Installing collected packages: libretranslatepy, translate
Successfully installed libretranslatepy-2.1.1 translate-3.6.1


In [6]:
import pandas as pd
import json

with open('/content/transcripts.json', 'r') as file:
    data = json.load(file)

# Convert the JSON data into a list of dictionaries
data_list = [{"ID": key, "Description": value} for key, value in data.items()]

# Create a DataFrame from the list of dictionaries
df = pd.DataFrame(data_list)
df

Unnamed: 0,ID,Description
0,2055,"During the visit, I examined Mr. Don Hicks, wh..."
1,291,"During the visit, I examined Tina Will, a 69-y..."
2,102,"D: Good morning Tommie, how can I help you tod..."
3,2966,"D: Good morning, Chris. I understand you've be..."
4,2438,"D: Hi Ernest, I understand you're here for a c..."
...,...,...
1996,1732,"D: ¡Buenos días, Erwin Thompson! Entiendo que ..."
1997,2193,"During the visit, I asked Ms. Diana Griffith a..."
1998,2833,"During the visit, I, the doctor, explained to ..."
1999,1221,"Durin' the vissit, Mildred Favero, aged 93, pr..."


In [7]:
from langdetect import detect  # Import the language detection library
from translate import Translator

# Function to translate text with error handling
def translate_text(text):
    try:
        translator = Translator(to_lang="en", from_lang="auto")
        translation = translator.translate(text)
        return translation
    except Exception as e:
        print(f"Error during translation: {e}")
        return text  # Return the original text for other errors

# Function to check if text is already in English
def is_english(text):
    try:
        detected_lang = detect(text)
        return detected_lang == "en"
    except Exception as e:
        print(f"Error during language detection: {e}")
        return False  # Assume it's not in English if there's an error

# Translate each description in the DataFrame (only if not already in English)
df['Translated_Description'] = df['Description'].apply(lambda text: text if is_english(text) else translate_text(text))

# Print the DataFrame with translated descriptions
print(df)


        ID                                        Description  \
0     2055  During the visit, I examined Mr. Don Hicks, wh...   
1      291  During the visit, I examined Tina Will, a 69-y...   
2      102  D: Good morning Tommie, how can I help you tod...   
3     2966  D: Good morning, Chris. I understand you've be...   
4     2438  D: Hi Ernest, I understand you're here for a c...   
...    ...                                                ...   
1996  1732  D: ¡Buenos días, Erwin Thompson! Entiendo que ...   
1997  2193  During the visit, I asked Ms. Diana Griffith a...   
1998  2833  During the visit, I, the doctor, explained to ...   
1999  1221  Durin' the vissit, Mildred Favero, aged 93, pr...   
2000  1284  D: Good morning, Mr. Burns. How can I help you...   

                                 Translated_Description  
0     During the visit, I examined Mr. Don Hicks, wh...  
1     During the visit, I examined Tina Will, a 69-y...  
2     D: Good morning Tommie, how can I help 

In [8]:
df.to_csv('transcripts_translated.csv',index=False)

In [9]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m36.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m40.9 MB/s[0m eta [36m0:00:00[0m
Col

In [None]:
import pandas as pd
import spacy
from transformers import pipeline

# Load an NLP model (e.g., spaCy)
nlp = spacy.load("en_core_web_sm")

# Load a GPT-based summarization model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Load a GPT-based text generation model (for rephrasing)
text_generator = pipeline("text-generation", model="gpt2")

# Function to process and validate medical information
def process_medical_info(transcript):
    # Extract relevant information using spaCy or custom methods
    doc = nlp(transcript)
    extracted_info = {}  # Store extracted information here

    # Rephrase extracted information using the text generation model
    rephrased_info = text_generator(transcript, max_length=50, num_return_sequences=1)
    rephrased_info = rephrased_info[0]['generated_text']

    # Summarize the transcript
    transcript_list = [transcript]  # Convert the transcript to a list
    summary = summarizer(transcript_list, max_length=100, min_length=5, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = summary[0]['summary_text']

    # Additional validation and processing steps
    # ...

    return {
        "extracted_info": extracted_info,
        "rephrased_info": rephrased_info,
        "summary": summary
    }

# Process each transcript and store the results
processed_data = []

for transcript in df['Description']:
    result = process_medical_info(transcript)
    processed_data.append(result)

# Create a new DataFrame to store the processed data
processed_df = pd.DataFrame(processed_data)

# Save the processed data to a new JSON file
processed_df.to_json('/content/processed_transcripts.json', orient='records')


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Your max_length is set to 100, but your input_length is only 78. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=39)
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_

In [None]:
processed_df